Class: BNextRobot
- Inherits:
-
Object
- Object
- BNextRobot
- Includes:
- Crawler, FeedFilter
- Defined in:
- lib/ext_class/bnext_robot.rb
Overview
BNextRobot extracts the titles and links of the daily/weekly hot feeds.
Constant Summary collapse
- FEED_XPATH =
"//a[contains(@class, 'item_title block_link')]/@href"
- TITLE_XPATH =
"//div[contains(@class, 'main_title')]"
- TAG_XPATH =
"//a[contains(@class, 'tag_link')]"
- INFO_XPATH =
"//span[contains(@class, 'info')]"
- CONTENT_XPATH =
"//div[contains(@class, 'content htmlview')]"
- IMGS_XPATH =
"//div[contains(@class, 'content htmlview')]/p/img/@src"
Instance Attribute Summary collapse
-
#day_rank_feeds ⇒ Object
Returns the value of attribute day_rank_feeds.
-
#week_rank_feeds ⇒ Object
Returns the value of attribute week_rank_feeds.
Attributes included from Crawler
Instance Method Summary collapse
- #_extract_feed(feed_id) ⇒ Object
- #analyze ⇒ Object
- #get_feeds(cat, page_no) ⇒ Object
- #init_rank_feeds ⇒ Object
-
#initialize ⇒ BNextRobot
constructor
A new instance of BNextRobot.
- #show_day_rank ⇒ Object
- #show_week_rank ⇒ Object
Methods included from FeedFilter
Methods included from Crawler
Constructor Details
#initialize ⇒ BNextRobot
Returns a new instance of BNextRobot.
22 23 24 25 26 |
# File 'lib/ext_class/bnext_robot.rb', line 22

# Builds a robot from the BNext front page.
#
# Calls load_page with the site URL (presumably provided by the included
# Crawler module -- confirm), then runs #analyze and #init_rank_feeds on
# whatever state that call leaves behind.
#
# @return [BNextRobot] a new instance of BNextRobot
def initialize
  load_page('http://www.bnext.com.tw/')
  analyze
  init_rank_feeds
end
Instance Attribute Details
#day_rank_feeds ⇒ Object
Returns the value of attribute day_rank_feeds.
20 21 22 |
# File 'lib/ext_class/bnext_robot.rb', line 20

# Reader for the day-rank feed list.
#
# @return [Object] the current value of @day_rank_feeds (nil until it has
#   been assigned, e.g. by #init_rank_feeds)
def day_rank_feeds
  @day_rank_feeds
end
#week_rank_feeds ⇒ Object
Returns the value of attribute week_rank_feeds.
20 21 22 |
# File 'lib/ext_class/bnext_robot.rb', line 20

# Reader for the week-rank feed list.
#
# @return [Object] the current value of @week_rank_feeds (nil until it has
#   been assigned, e.g. by #init_rank_feeds)
def week_rank_feeds
  @week_rank_feeds
end
Instance Method Details
#_extract_feed(feed_id) ⇒ Object
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/ext_class/bnext_robot.rb', line 85

# Fetches a single article page and packs its fields into a Feed.
#
# Extraction is best-effort: every field is pulled independently, and a field
# whose extraction raises is left as nil instead of aborting the whole feed.
#
# @param feed_id [#to_s] site-relative path of the article; it is appended to
#   @domain with @domain's last character removed, so it is expected to start
#   with '/' (assumption -- confirm against the FEED_XPATH results)
# @return [Feed] Feed.new(title, author, date, tags, query_url, content, imgs)
def _extract_feed(feed_id)
  query_url = @domain[0..-2] + feed_id.to_s
  # NOTE(review): bare `open` on a URL relies on open-uri patching
  # Kernel#open; newer Rubies require URI.open instead -- confirm the
  # supported Ruby version before changing this call.
  document = Oga.parse_html(open(query_url))

  title = _safely { document.xpath(TITLE_XPATH).text.force_encoding('utf-8') }

  # The info spans carry a label prefix that is stripped. The label is
  # re-tagged as binary before the gsub, which suggests the node text is
  # binary-encoded at this point (assumption -- confirm).
  author = _safely do
    document.xpath(INFO_XPATH)[0].text
            .gsub('撰文者:'.force_encoding('ascii-8bit'), '')
            .force_encoding('utf-8')
  end
  date = _safely do
    document.xpath(INFO_XPATH)[1].text
            .gsub('發表日期:'.force_encoding('ascii-8bit'), '')
            .force_encoding('utf-8')
  end

  content = _safely { document.xpath(CONTENT_XPATH).text.force_encoding('utf-8') }
  tags = _safely { document.xpath(TAG_XPATH).map { |node| node.text.force_encoding('utf-8') } }
  imgs = _safely { document.xpath(IMGS_XPATH).map { |node| node.text.force_encoding('utf-8') } }

  Feed.new(title, author, date, tags, query_url, content, imgs)
end

# Runs the block and returns its value, or nil when it raises a StandardError.
def _safely
  yield
rescue StandardError
  nil
end
#analyze ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/ext_class/bnext_robot.rb', line 28

# Scans the loaded page for category links and stores them in @cats.
#
# Every `<li>…</li>` fragment of @web_data is searched for its first anchor.
# Anchors whose href (after dropping the leading character following the
# opening quote) starts with 'categories' are recorded as
# `@cats[link text] = @domain + href`. @cats answers false for unknown keys.
#
# List items without an anchor or without an href are skipped instead of
# raising.
#
# @return [nil]
def analyze
  anchors = @web_data.scan(/<li>.*?<\/li>/).map { |item| item[/<a.*?<\/a>/].to_s }

  @cats = Hash.new(false)
  anchors.each do |anchor|
    # 'href="' is six characters; starting at index 7 also drops the first
    # character of the value (the leading '/'), and -2 drops the closing quote.
    href = anchor[/href=\".*?\"/].to_s[7..-2]
    name = anchor[/>.+?</].to_s[1..-2]
    # href is nil when the fragment has no anchor/href at all.
    next unless href && href.start_with?('categories')

    @cats[name] = @domain + href
  end
  nil
end
#get_feeds(cat, page_no) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/ext_class/bnext_robot.rb', line 70

# Fetches one listing page of a category and extracts every feed linked on it.
#
# @param cat [#to_s] category segment interpolated into the listing URL
# @param page_no [#to_s] page number passed as the `p` query parameter
# @return [Array] one #_extract_feed result per FEED_XPATH match, in
#   document order
def get_feeds(cat, page_no)
  # TODO: parse all feeds @ page: page_no
  query_url = @domain + "categories/#{cat}/?p=#{page_no}"
  # NOTE(review): bare `open` on a URL relies on open-uri patching
  # Kernel#open; newer Rubies require URI.open instead -- confirm the
  # supported Ruby version before changing this call.
  document = Oga.parse_html(open(query_url))

  feed_ids = document.xpath(FEED_XPATH).map(&:text)
  feed_ids.map { |feed_id| _extract_feed(feed_id) }
end
#init_rank_feeds ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/ext_class/bnext_robot.rb', line 50

# Parses @web_data and fills @day_rank_feeds and @week_rank_feeds.
#
# Each rank list is built from the anchors with class 'content' inside the
# div whose id is 'day_rank' / 'week_rank'. Only site-relative links (href
# starting with '/') are kept, and they are made absolute with @domain.
#
# @return [nil]
def init_rank_feeds
  document = Oga.parse_html(@web_data)
  @day_rank_feeds = _rank_feeds(document, 'day')
  @week_rank_feeds = _rank_feeds(document, 'week')
  nil
end

# Builds the Feed list for one ranking block ('day' or 'week').
def _rank_feeds(document, period)
  anchor_xpath = "//div[@id = '#{period}_rank']//a[@class = 'content']"
  titles = document.xpath(anchor_xpath).map(&:text)
  hrefs = document.xpath(anchor_xpath + '/@href').map(&:text)

  # NOTE(review): titles and hrefs come from two separate queries and are
  # paired by position; this assumes every matched anchor has an href.
  # The nil check keeps a surplus title from raising.
  titles.zip(hrefs)
        .select { |_title, href| href && href.start_with?('/') }
        .map { |title, href| Feed.new(title, '', '', [], @domain + href[1..-1], '') }
end
#show_day_rank ⇒ Object
40 41 42 43 |
# File 'lib/ext_class/bnext_robot.rb', line 40

# Prints one "title: link" line per feed in @day_rank_feeds.
#
# @return [nil]
def show_day_rank
  # each, not map: the block is run only for its output.
  @day_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
  nil
end
#show_week_rank ⇒ Object
45 46 47 48 |
# File 'lib/ext_class/bnext_robot.rb', line 45

# Prints one "title: link" line per feed in @week_rank_feeds.
#
# @return [nil]
def show_week_rank
  # each, not map: the block is run only for its output.
  @week_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
  nil
end