Class: BNextRobot

Inherits:
Object
  • Object
show all
Includes:
Crawler, FeedFilter
Defined in:
lib/ext_class/bnext_robot.rb

Overview

BNextRobot extracts the titles and links of the daily and weekly hot feeds.

Constant Summary collapse

# XPath selecting the href attributes of feed links on a category listing
# page; #get_feeds passes each matched value to #_extract_feed.
FEED_XPATH =
"//a[contains(@class, 'item_title block_link')]/@href"
# XPath selecting the title element of an article page (read by #_extract_feed).
TITLE_XPATH =
"//div[contains(@class, 'main_title')]"
# XPath selecting the tag links of an article page (read by #_extract_feed).
TAG_XPATH =
"//a[contains(@class, 'tag_link')]"
# XPath selecting the info spans of an article page; #_extract_feed reads the
# first match as the author and the second as the publication date.
INFO_XPATH =
"//span[contains(@class, 'info')]"
# XPath selecting the article body container (read by #_extract_feed).
CONTENT_XPATH =
"//div[contains(@class, 'content htmlview')]"
# XPath selecting the src attributes of images that sit directly inside a
# paragraph of the article body container (read by #_extract_feed).
IMGS_XPATH =
"//div[contains(@class, 'content htmlview')]/p/img/@src"

Instance Attribute Summary collapse

Attributes included from Crawler

#cats, #domain, #web_data

Instance Method Summary collapse

Methods included from FeedFilter

#filter_feeds

Methods included from Crawler

#load_page

Constructor Details

#initialize ⇒ BNextRobot

Returns a new instance of BNextRobot.



22
23
24
25
26
# File 'lib/ext_class/bnext_robot.rb', line 22

# Builds a robot by loading the BNext home page and then deriving the
# category map and the daily/weekly rank feed lists from it.
def initialize
  # load_page is provided by the Crawler mixin; presumably it populates
  # @web_data and @domain, which both steps below read — confirm in Crawler.
  load_page('http://www.bnext.com.tw/')
  # Must run after load_page: reads @web_data and @domain, fills @cats.
  analyze
  # Must run after load_page: reads @web_data and @domain, fills
  # @day_rank_feeds and @week_rank_feeds.
  init_rank_feeds
end

Instance Attribute Details

#day_rank_feeds ⇒ Object

Returns the value of attribute day_rank_feeds.



20
21
22
# File 'lib/ext_class/bnext_robot.rb', line 20

# Reader for the daily rank feeds.
#
# @return [Array<Feed>, nil] the list assigned by #init_rank_feeds, or nil
#   if #init_rank_feeds has not run yet
def day_rank_feeds
  @day_rank_feeds
end

#week_rank_feeds ⇒ Object

Returns the value of attribute week_rank_feeds.



20
21
22
# File 'lib/ext_class/bnext_robot.rb', line 20

# Reader for the weekly rank feeds.
#
# @return [Array<Feed>, nil] the list assigned by #init_rank_feeds, or nil
#   if #init_rank_feeds has not run yet
def week_rank_feeds
  @week_rank_feeds
end

Instance Method Details

#_extract_feed(feed_id) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/ext_class/bnext_robot.rb', line 85

# Downloads one article page and extracts its metadata into a Feed.
#
# Each field is extracted on a best-effort basis: when a field cannot be read
# from the page it is left as nil instead of aborting the whole extraction.
# A failure to download or parse the page itself is NOT swallowed.
#
# @param feed_id [String] site-relative path of the article as matched by
#   FEED_XPATH (presumably starting with "/")
# @return [Feed] built as Feed.new(title, author, date, tags, url, content, imgs)
def _extract_feed(feed_id)
  # @domain ends with "/" (other methods append slash-less paths to it), so
  # drop that slash before appending the path.
  query_url = "#{@domain.chomp('/')}#{feed_id}"
  document = _fetch_document(query_url)

  # The info spans hold the author (first) and the date (second); query once.
  info = _best_effort { document.xpath(INFO_XPATH) }

  # NOTE(review): the label patterns are binary strings, which presumes the
  # parsed text is binary as well — confirm against what Oga returns here.
  title = _best_effort { document.xpath(TITLE_XPATH).text.force_encoding('utf-8') }
  author = _best_effort { info[0].text.gsub('撰文者:'.b, '').force_encoding('utf-8') }
  date = _best_effort { info[1].text.gsub('發表日期:'.b, '').force_encoding('utf-8') }
  content = _best_effort { document.xpath(CONTENT_XPATH).text.force_encoding('utf-8') }
  tags = _best_effort { document.xpath(TAG_XPATH).map { |node| node.text.force_encoding('utf-8') } }
  imgs = _best_effort { document.xpath(IMGS_XPATH).map { |node| node.text.force_encoding('utf-8') } }

  Feed.new(title, author, date, tags, query_url, content, imgs)
end

# Fetches +url+ and parses the response into an Oga document.
#
# Goes through URI#open (open-uri, which the previous Kernel#open call already
# relied on) so that a scraped value such as "|cmd" can never be run as a
# command, and uses the block form so the response handle is always closed.
def _fetch_document(url)
  URI.parse(url).open { |io| Oga.parse_html(io) }
end

# Runs the block and returns its value, or nil when it raises a StandardError.
def _best_effort
  yield
rescue StandardError
  nil
end

#analyze ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
# File 'lib/ext_class/bnext_robot.rb', line 28

# Scans the downloaded home page (@web_data) for category links and stores
# them in @cats as { category name => absolute URL }.
#
# Only <li>...</li> items that fit on a single line are seen, because the
# pattern's "." does not match newlines. Items without a link, and links that
# do not point at a "categories" path, are skipped.
#
# @return [nil]
def analyze
  list_items = @web_data.scan(%r{<li>.*?</li>})

  # Looking up a name that is not a known category yields false.
  categories = Hash.new(false)
  list_items.each do |item|
    anchor = item[%r{<a.*?</a>}].to_s
    # [7..-2] strips the leading `href="`, the first character of the value
    # (presumably the leading "/") and the closing quote. It is nil when the
    # item has no href at all, which must not reach start_with?.
    href = anchor[/href=".*?"/].to_s[7..-2]
    next unless href && href.start_with?('categories')

    name = anchor[/>.+?</].to_s[1..-2]
    categories[name] = @domain + href
  end

  @cats = categories
  nil
end

#get_feeds(cat, page_no) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/ext_class/bnext_robot.rb', line 70

# Extracts every feed listed on one page of a category.
#
# @param cat [#to_s] category identifier, interpolated into the
#   "categories/<cat>/" part of the URL
# @param page_no [#to_s] page number, sent as the "p" query parameter
# @return [Array<Feed>] one Feed per link matched by FEED_XPATH, in page order
def get_feeds(cat, page_no)
  query_url = "#{@domain}categories/#{cat}/?p=#{page_no}"
  # URI#open (open-uri, which the previous Kernel#open call already relied on)
  # cannot be tricked into running a "|cmd" string as a command, and the
  # block form closes the response handle once parsing is done.
  document = URI.parse(query_url).open { |io| Oga.parse_html(io) }
  document.xpath(FEED_XPATH).map { |href| _extract_feed(href.text) }
end

#init_rank_feeds ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/ext_class/bnext_robot.rb', line 50

# Builds the daily and weekly rank feed lists from the downloaded home page
# (@web_data) and stores them in @day_rank_feeds and @week_rank_feeds.
#
# @return [nil]
def init_rank_feeds
  document = Oga.parse_html(@web_data)
  @day_rank_feeds = _rank_feeds(document, 'day')
  @week_rank_feeds = _rank_feeds(document, 'week')
  nil
end

# Collects the feeds linked from the "<period>_rank" block of +document+.
#
# Title and href are read from the same anchor node, so an anchor without an
# href cannot shift the title/link pairing of the anchors that follow it.
# Only site-relative links (starting with "/") are kept; they are made
# absolute by appending them to @domain.
#
# @param document [#xpath] parsed home page
# @param period [String] "day" or "week"
# @return [Array<Feed>] feeds carrying only a title and a link
def _rank_feeds(document, period)
  anchors = document.xpath("//div[@id = '#{period}_rank']//a[@class = 'content']")
  anchors.each_with_object([]) do |anchor, feeds|
    href = anchor.get('href')
    next unless href && href.start_with?('/')

    # @domain already ends with "/", so drop the href's leading slash.
    feeds << Feed.new(anchor.text, "", "", [], @domain + href[1..-1], "")
  end
end

#show_day_rank ⇒ Object



40
41
42
43
# File 'lib/ext_class/bnext_robot.rb', line 40

# Prints one "title: link" line per daily rank feed to standard output.
#
# @return [nil]
def show_day_rank
  @day_rank_feeds.each do |entry|
    puts "#{entry.title}: #{entry.link}"
  end
  nil
end

#show_week_rank ⇒ Object



45
46
47
48
# File 'lib/ext_class/bnext_robot.rb', line 45

# Prints one "title: link" line per weekly rank feed to standard output.
#
# @return [nil]
def show_week_rank
  @week_rank_feeds.each do |entry|
    puts "#{entry.title}: #{entry.link}"
  end
  nil
end