require 'uri' def full_url(rel, url) return rel if rel.match /^[\w]*:\/\// uri = URI(url) if rel[0] == '/' "#{uri.scheme}://#{uri.host}#{rel}" else path = uri.path.split('/')[0..-2].select{|m| !m.empty?}.join('/') "#{uri.scheme}://#{uri.host}/#{path}/#{rel}" end end def catch_data(url,html_flag=false) html = %x[curl '#{url}'] html = Nokogiri.HTML(html) html.css("table.baseTB").each do |v| titles = v.css('span.ptname a') img_url = v.css('a.item-image img').map{|v| full_url(v.attr('src'),url)} if titles.count>0 && img_url.count>0 return [titles.map{|v| catch_album_image_data(full_url(v.attr('href'),url))},titles.map{|v| v.text()},img_url] + (html_flag ? [html]: []) end end;nil end def catch_album_image_data(url) html = %x[curl '#{url}'] html = Nokogiri.HTML(html) type1_out = html.css('script').map{|v| v.content}.join.scan(/imgLink\[\"\d+\"\] *\= *["']((?:(?![\t\n;]).)+)["']|imgAlt\[\"\d+\"\] *\= *["']((?:(?![\t\n;]).)*)["']/m) if type1_out.count>0 puts 'type1_out' links = type1_out.map{|v| v[0].blank? ? nil : full_url(v[0],url)}.compact alts = type1_out.map{|v| v[1]}.compact return (0...links.length).collect{|i| [alts[i],links[i]]} end type2_out = html.css('#album_show img,ul.ad-thumb-list img,#slider img,div.photoFlow ul img').map{|v| [v.attr('alt'),full_url(v.attr('src'),url)]} if type2_out.count>0 puts 'type2_out' return type2_out end end def get_data_from_index_url(url) all_titles = [] all_img_url = [] all_album_images = [] album_images,titles,img_url,html = catch_data(url,true) page_nums = html.css('.pagenum,.pager-last a') all_titles += titles if titles all_img_url += img_url if img_url all_album_images += album_images if page_nums.count > 0 last_page = page_nums.last.attr('href') last_num = full_url(last_page,url).scan(/\?page=\d+|-\d+.php/)[0].scan(/\d+/)[0].to_i for i in 2..last_num search_url = last_page.gsub(/\?page=(\d+)|-(\d+).php/){|v| v.gsub(/\d+/,i.to_s)} album_images,titles,img_url = catch_data(search_url) all_titles += titles if titles all_img_url += img_url if img_url all_album_images += album_images end end [all_album_images,all_titles,all_img_url] end def create_album(url) data = get_data_from_index_url(url) data end url = 'https://audslp.asia.edu.tw/files/131-1080-20-1.php?Lang=zh-tw' d=get_data_from_index_url(url) """ d[0][i] => [alt,link] d[1][i] => album title d[2][i] => 封面img_url (改catch_album_image_data)內頁缺card版本(9) (改catch_data)封面差 7,8,9 連結在line 缺album描述 類別: $('.module.module-path a.path') or $('#page-title') 已完成: 1,2,3( 澳門聾人協會演講[無資料], 106.09.24-25 Orientation Camp[無資料]), """ url = 'https://pt.asia.edu.tw/files/131-1116-45-1.php?Lang=zh-tw' html = %x[curl '#{url}'] html = Nokogiri.HTML(html) 109-1新生茶會 1~6 10 11 $('table.baseTB').filter(function(i,v){var tp=$(v).find('span.ptname').length>0&$(v).find('img').length>0;return tp}) 8 $('ul.list').filter(function(i,v){var tp=$(v).find('div.photo').length>0;return tp}) 9 $('div.row').filter(function(i,v){var tp=$(v).find('div.card').length>0&$(v).find('div.row').length==0;return tp}) 頁數:$('.pagenum') => -頁數.php 7 $('div#content-body') page:$('.item-list') =>