# encoding: utf-8
#
# Imports announcements from the RSS feed at ann.cc.ntu.edu.tw into MongoDB.
#
# Every feed item whose author is listed in SITES is stored in the database
# "<DB_BASE_NAME>_<site number>". Items belonging to a site other than the main
# one ("0") are additionally copied into the main site's database.

require 'open-uri'
require 'rss'
require 'mongo'

# Feed author name => site number (used as the database name suffix).
SITES = {
  "總務處-各單位公告" => "0",
  "總務處-文書組" => "1",
  "總務處-出納組" => "2",
  "總務處-事務組" => "3",
  "總務處-保管組" => "5",
  "總務處-採購組" => "6",
  "總務處-經營管理組" => "7",
  "總務處-駐衛警察隊" => "9",
  "總務處-營繕組" => "10",
  "總務處-總務處" => "11",
  "社會科學院-社會科學院總務分處" => "4",
  "醫學院-醫學院總務分處" => "8"
}
SITE_KEYS = SITES.keys
DB_BASE_NAME = "production"

# Site number of the main site; every bulletin is mirrored into its database.
MAIN_SITE_NUMBER = "0"

# A feed page holding fewer items than this is treated as the last page.
PAGE_SIZE = 100

# Downloads one page of the feed and returns its parsed RSS items.
#
# The body is transcoded from Big5 to UTF-8 (dropping bytes that cannot be
# converted), the misspelled weekday "Wes," is corrected to "Wed," and the XML
# encoding declaration is rewritten to match the transcoded body before the
# document is handed to the RSS parser.
def fetch_feed_items(page)
  raw = URI.parse("http://ann.cc.ntu.edu.tw/asp/rss.asp?page=#{page}").open { |io| io.read }
  xml = raw.encode('utf-8', 'big5', invalid: :replace, undef: :replace, replace: '')
  xml = xml.gsub('Wes,', 'Wed,')
  xml = xml.gsub(/(encoding=\"big5\")/, 'encoding="utf-8"')
  RSS::Parser.parse(xml).items
end

# Walks the feed page by page and returns
#   { author => { link => { title:, author:, link:, date:, category:, description: } } }
# containing only the items whose author is a key of SITES.
def collect_bulletins
  bulletins = {}
  page = 1
  loop do
    items = fetch_feed_items(page)
    items.each do |item|
      # Strip before the lookup so an author padded with whitespace still
      # matches its SITES entry; a missing author becomes "" and is skipped.
      author = item.author.to_s.strip
      next unless SITES.key?(author)

      # item.category.to_s yields the XML element; drop the surrounding tags.
      category = item.category.to_s.gsub(/\<(\/)*category\>/, '')
      link = item.link.strip
      (bulletins[author] ||= {})[link] = {
        title: item.title.strip,
        author: author,
        link: link,
        date: item.pubDate,
        category: category,
        description: item.description.gsub("\r\n", "\n").strip
      }
    end
    break if items.size < PAGE_SIZE
    page += 1
  end
  bulletins
end

# Get corresponding category_id or create a new one.
#
# category   - category title taken from the feed
# categories - cache of { "rss_<title>" => category id }; updated in place
#              when a new category is created
# coll_cat   - collection the new category document is saved into
#
# Returns [category_id, categories].
def get_category_id(category, categories, coll_cat)
  key = "rss_#{category}"
  unless categories.key?(key)
    now = Time.now
    cat = {
      _type: "BulletinCategory",
      key: key,
      disable: false,
      title: {:zh_tw => category},
      created_at: now,
      updated_at: now
    }
    categories[key] = coll_cat.save(cat)
  end
  [categories[key], categories]
end

# Single MongoDB connection shared by all sites, opened on first use, so that
# looking up another site's database does not open another connection.
def mongo_connection
  @mongo_connection ||= Mongo::Connection.new("localhost", 27017)
end

# Get categories and collections based on a given site number.
#
# Returns [categories, coll_bulletin, coll_cat] where categories maps each
# stored category's 'key' to its '_id'.
def get_mongo_and_categories(site_number="0")
  db = mongo_connection.db("#{DB_BASE_NAME}_#{site_number}")
  coll_bulletin = db["bulletins"]
  coll_cat = db["bulletin_categories"]
  categories = coll_cat.find().to_a.each_with_object({}) do |doc, by_key|
    by_key[doc['key']] = doc['_id']
  end
  [categories, coll_bulletin, coll_cat]
end

# Builds the bulletin document stored for one collected feed item.
def build_bulletin(bul, category_id)
  {
    _type: "Bulletin",
    postdate: bul[:date],
    created_at: bul[:date],
    updated_at: bul[:date],
    is_checked: true,
    is_pending: false,
    is_rejected: false,
    bulletin_category_id: category_id,
    title: {:zh_tw => bul[:title]},
    text: {:zh_tw => bul[:description]},
    available_for_zh_tw: true,
    rss_link: bul[:link],
    is_top: false,
    is_hot: false,
    is_hidden: false
  }
end

all = collect_bulletins

# Get main site (Office of General Affairs, site "0") categories
@main_categories, @main_coll_bulletin, @main_coll_cat = get_mongo_and_categories

all.each do |key, value| # Loop through all the authors
  site_number = SITES[key]
  main_site = site_number.eql?(MAIN_SITE_NUMBER)

  # The main site must share the main category cache: a second, independent
  # cache over the same database would miss categories created through the
  # other one and create them again.
  if main_site
    categories, coll_bulletin, coll_cat = @main_categories, @main_coll_bulletin, @main_coll_cat
  else
    categories, coll_bulletin, coll_cat = get_mongo_and_categories(site_number)
  end

  value.each_value do |bul| # Loop through all the items
    # The category is looked up (and created if missing) even when the
    # bulletin itself turns out to be stored already.
    category_id, categories = get_category_id(bul[:category], categories, coll_cat)
    next if coll_bulletin.find_one(rss_link: bul[:link])

    bulletin = build_bulletin(bul, category_id)
    coll_bulletin.save(bulletin)
    next if main_site

    # Copy the item to the main site, under the main site's own category id
    # and with a freshly generated id for the copy.
    main_category_id, @main_categories = get_category_id(bul[:category], @main_categories, @main_coll_cat)
    main_bulletin = bulletin.clone
    main_bulletin['_id'] = BSON::ObjectId.new
    main_bulletin[:bulletin_category_id] = main_category_id
    @main_coll_bulletin.save(main_bulletin)
  end
end