This repository has been archived on 2024-03-16. You can view files and clone it, but cannot push or open issues or pull requests.
orbit-4-1/lib/rss_ntu_job.rb

159 lines
6.0 KiB
Ruby
Raw Normal View History

2014-01-23 06:30:27 +00:00
# encoding: utf-8
require 'rss'
require 'mongo'
# Maps the RSS author of an announcement to the suffix of that site's
# database name ("#{DB_BASE_NAME}_<suffix>"). Suffix "0" is the main site,
# which also receives a copy of the items imported into the other sites.
SITES = {
  "總務處-各單位公告" => "0",
  "總務處-文書組" => "1",
  "總務處-出納組" => "2",
  "總務處-事務組" => "3",
  "總務處-保管組" => "5",
  "總務處-採購組" => "6",
  "總務處-經營管理組" => "7",
  "總務處-駐衛警察隊" => "9",
  "總務處-營繕組" => "10",
  "總務處-總務處" => "11",
  "社會科學院-社會科學院總務分處" => "4",
  "醫學院-醫學院總務分處" => "8"
}.freeze
# Author names accepted from the feed; items from any other author are skipped.
SITE_KEYS = SITES.keys.freeze
# Common prefix of every per-site database name.
DB_BASE_NAME = "production_new".freeze
# Collect the announcements published during the last 24 hours, grouped by
# author and keyed by link: all[author][link] => item attributes.
all = {}
continue = true
i = 1
yesterday = Time.now - 86400
while continue do
  # NOTE(review): Kernel#open on a URL relies on open-uri being loaded
  # (presumably pulled in by 'rss') and is not available on Ruby >= 3.0,
  # where URI.open is required — confirm the target Ruby version.
  open("http://ann.cc.ntu.edu.tw/asp/rss.asp?page=#{i}") do |rss|
    # The feed is served as Big5: transcode it to UTF-8 and patch the XML
    # declaration accordingly. "Wes," is an invalid weekday abbreviation that
    # is rewritten to "Wed," before parsing.
    xml = rss.read.encode('utf-8', 'big5', invalid: :replace, undef: :replace, replace: '')
    xml = xml.gsub('<pubDate>Wes,', '<pubDate>Wed,').gsub(/(encoding=\"big5\")/, 'encoding="utf-8"')
    feed = RSS::Parser.parse(xml)
    items = feed ? feed.items : []
    # An empty (or unparseable) page means there is nothing left to read;
    # without this the loop would keep requesting pages forever.
    continue = false if items.empty?
    items.each do |item|
      if item.pubDate > yesterday
        # Strip before the whitelist check so the check and the stored key agree.
        author = item.author.to_s.strip
        next unless SITE_KEYS.include?(author)
        link = item.link.strip
        category = item.category.to_s.gsub(/\<(\/)*category\>/, '')
        (all[author] ||= {})[link] = {
          title: item.title.strip,
          author: author,
          link: link,
          date: item.pubDate,
          category: category,
          description: item.description.gsub("\r\n", '<br/>').strip
        }
      else
        # First item older than 24 hours: stop reading this and any further page.
        continue = false
        break
      end
    end
  end
  i += 1
end
# Get corresponding category_id or create a new one
#
# category           - category name taken from the RSS item
# categories         - Hash of category key ("rss_<name>") => category id;
#                      a newly created category is added to it in place
# coll_cat           - object responding to #save(doc); its return value is
#                      used as the id of the new category
# bulletin_module_id - value stored as module_app_id on a new category
#
# Returns [category_id, categories].
def get_category_id(category, categories, coll_cat, bulletin_module_id)
  key = "rss_#{category}"
  return [categories[key], categories] if categories.key?(key)

  # One timestamp so a fresh record has created_at == updated_at.
  now = Time.now
  cat = {
    _type: "Category",
    module_app_id: bulletin_module_id,
    key: key,
    disable: false,
    custom: false,
    title: {:zh_tw => category},
    created_at: now,
    updated_at: now
  }
  result = coll_cat.save(cat)
  categories[key] = result
  [result, categories]
end
# Get categories and id based on a given site number
#
# site_number - suffix of the site's database ("#{DB_BASE_NAME}_<site_number>")
#
# Returns [categories, coll_bulletin, coll_cat, bulletin_module_id, coll_buffer_cat]
# where categories maps each category key of the announcement module to its _id.
# Raises RuntimeError when the database has no "announcement" module.
def get_mongo_and_categories(site_number="0")
  db_name = "#{DB_BASE_NAME}_#{site_number}"
  db = Mongo::Connection.new("localhost", 27017).db(db_name)
  bulletin_module = db["module_apps"].find(key: "announcement").first
  # Fail with an explicit message instead of a NoMethodError on nil below.
  raise "announcement module not found in database #{db_name}" unless bulletin_module
  bulletin_module_id = bulletin_module["_id"]
  coll_bulletin = db["bulletins"]
  coll_buffer_cat = db["buffer_categories"]
  coll_cat = db["categories"]
  # Index the module's categories by key for quick lookup.
  module_categories = coll_cat.find(module_app_id: bulletin_module_id)
  categories = module_categories.each_with_object({}) do |category, by_key|
    by_key[category['key']] = category['_id']
  end
  [categories, coll_bulletin, coll_cat, bulletin_module_id, coll_buffer_cat]
end
# Get main site (總務處) categories
@main_categories, @main_coll_bulletin, @main_coll_cat, @main_bulletin_module_id, @main_coll_buffer_cat = get_mongo_and_categories
# Site "11": every item copied to the main site is copied here as well
@copy_categories, @copy_coll_bulletin, @copy_coll_cat, @copy_bulletin_module_id, @copy_coll_buffer_cat = get_mongo_and_categories('11')
all.each do |key, value| # Loop through all the authors
  site_number = SITES[key]
  categories, coll_bulletin, coll_cat, bulletin_module_id, coll_buffer_cat = get_mongo_and_categories(site_number) # Get current's site categories
  value.each_value do |bul| # Loop through all the items
    # The category is looked up (and created if missing) even when the
    # bulletin itself turns out to be already imported.
    category_id, categories = get_category_id(bul[:category], categories, coll_cat, bulletin_module_id)
    next if coll_bulletin.find_one(rss_link: bul[:link]) # Already imported into this site

    bulletin = { _type: "Bulletin",
                 postdate: bul[:date],
                 created_at: bul[:date],
                 updated_at: bul[:date],
                 public: true,
                 is_checked: true,
                 is_pending: false,
                 is_rejected: false,
                 category_id: category_id,
                 title: {:zh_tw => bul[:title]},
                 text: {:zh_tw => bul[:description]},
                 available_for_en: false,
                 available_for_zh_tw: true,
                 rss_link: bul[:link],
                 is_top: false,
                 is_hot: false,
                 is_hidden: false }
    bs = coll_bulletin.save(bulletin)
    buffer_cat = { _type: "BufferCategory",
                   category_id: category_id,
                   categorizable_type: "Bulletin",
                   categorizable_id: bs }
    coll_buffer_cat.save(buffer_cat)

    # Items of the main site itself, or already present there, are not copied.
    next if site_number.eql?("0") || @main_coll_bulletin.find_one(rss_link: bul[:link])

    # Copy the item to the main site
    category_id, @main_categories = get_category_id(bul[:category], @main_categories, @main_coll_cat, @main_bulletin_module_id)
    main_bulletin = bulletin.clone
    main_bulletin['_id'] = BSON::ObjectId.new
    main_bulletin[:category_id] = category_id
    main_bs = @main_coll_bulletin.save(main_bulletin)
    main_buffer_cat = { _type: "BufferCategory",
                        category_id: category_id,
                        categorizable_type: "Bulletin",
                        categorizable_id: main_bs }
    @main_coll_buffer_cat.save(main_buffer_cat)

    # Copy the item to site "11".
    # NOTE(review): items whose own site is "11" also reach this point, i.e.
    # they are written again into the database they were just saved to —
    # confirm this is intended.
    category_id, @copy_categories = get_category_id(bul[:category], @copy_categories, @copy_coll_cat, @copy_bulletin_module_id)
    copy_bulletin = bulletin.clone
    copy_bulletin['_id'] = BSON::ObjectId.new
    copy_bulletin[:category_id] = category_id
    # Saved exactly once, like the main-site copy above.
    copy_bs = @copy_coll_bulletin.save(copy_bulletin)
    copy_buffer_cat = { _type: "BufferCategory",
                        category_id: category_id,
                        categorizable_type: "Bulletin",
                        categorizable_id: copy_bs }
    @copy_coll_buffer_cat.save(copy_buffer_cat)
  end
end