Class: TaiwaneseNewsParser::Parser::Cna
Instance Attribute Summary
#article, #url
Class Method Summary
collapse
Instance Method Summary
collapse
applicable?, applicable_parser, #clean_up, #initialize, subclasses
Class Method Details
.domain ⇒ Object
2
3
4
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 2
# Host name this parser is associated with.
#
# @return [String] always 'cna.com.tw'
def self.domain
  'cna.com.tw'
end
|
.names ⇒ Object
6
7
8
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 6
# Name of the news outlet handled by this parser.
#
# NOTE(review): despite the plural method name this returns a single String,
# not an Array (the original literal was a %{} percent-string). If callers
# expect a list, %w{} may have been intended -- confirm before changing.
#
# @return [String] '中央社'
def self.names
  '中央社'
end
|
.parse_url_id(url) ⇒ Object
56
57
58
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 56
# Extracts the numeric article id from a URL.
#
# The id is the first run of digits that directly follows a '/' and is
# followed by an optional '-<digit>' suffix and then a '.'.
#
# @param url [String] URL to inspect
# @return [String, nil] the digits, or nil when the URL has no such segment
def self.parse_url_id(url)
  id_pattern = %r{/(\d+)(?:\-\d)?\.}
  url.slice(id_pattern, 1)
end
|
Instance Method Details
#doc ⇒ Object
10
11
12
13
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 10
# Downloads the page at +url+ and parses it as HTML.
#
# The parsed document is memoized in @doc, so the page is fetched and parsed
# only on the first call; later calls return the same object. The raw
# response body is kept in @raw. The block form of +open+ is used so the
# underlying IO handle is closed as soon as the body has been read.
#
# NOTE(review): opening a URL through Kernel#open relies on open-uri patching
# it, which Ruby 3.0 removed (URI.open is the replacement). Kernel#open may
# also spawn a subprocess for strings starting with "|", so +url+ must be
# trusted. Confirm the supported Ruby versions before switching.
#
# @return [Object] the document returned by Nokogiri::HTML
def doc
  @doc ||= begin
    @raw = open(url, &:read)
    Nokogiri::HTML(@raw)
  end
end
|
#parse ⇒ Object
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 16
# Fills @article from the fetched page and returns it.
#
# Keys written, in order: :title, :company_name, :content, :reporter_name,
# :published_at. +clean_up+ is invoked once all keys are set.
#
# The document is obtained from +doc+ exactly once and the article body text
# is extracted once, then reused for both :content and the date lookup.
#
# The publication date is taken from the first run of seven digits in the
# body text, read as YYYMMDD. 1911 is added to the three-digit year, which
# appears to convert a Minguo (ROC) calendar year to Gregorian. The text of
# the '.date' element is appended before handing the string to Time.parse.
# When the body holds no such digit run, :published_at is set to nil instead
# of raising.
#
# NOTE(review): the title lookup still raises NoMethodError when no heading
# matches, because at_css would then return nil.
#
# @return [Hash] the populated @article
# @raise [ArgumentError] if Time.parse rejects the assembled date string
def parse
  page = doc
  body_text = page.css('.news_content .box_2').text

  @article[:title] = page.at_css('.news_content h1, .news_content h2').text
  @article[:company_name] = '中央社'
  @article[:content] = body_text
  @article[:reporter_name] = parse_reporter_name

  stamp = body_text.strip.match(/(\d{3})(\d{2})(\d{2})/)
  @article[:published_at] =
    if stamp
      year = stamp[1].to_i + 1911
      Time.parse("#{year}/#{stamp[2]}/#{stamp[3]} #{page.css('.date').text}")
    end

  clean_up
  @article
end
|
#parse_reporter_name ⇒ Object
38
39
40
41
42
43
44
45
46
47
48
49
50
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 38
# Extracts the reporter name from the dateline in the article body.
#
# The dateline is the text between an opening parenthesis (full-width or
# ASCII) followed by '中央社' and the first one- or two-digit number followed
# by '日'. A trailing city name from the list below, optionally followed by
# '縣' or '市', is removed from it; whatever follows '記者' in the remainder
# is returned.
#
# @return [String, nil] the reporter name, or nil when the body has no
#   dateline or the dateline has no '記者' marker
def parse_reporter_name
  body_text = doc.css('.news_content .box_2').text

  # Both parenthesis forms are accepted so the pattern compiles and matches
  # regardless of which one the page (or this source file) uses.
  dateline = body_text[/[（(]中央社(.*?)\d{1,2}日/, 1]
  return nil unless dateline

  cities = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 新竹 苗栗 彰化 南投 雲林 嘉義 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}
  # A single anchored substitution strips at most one trailing city;
  # [縣市]? lets names such as '新北市' be removed as well.
  byline = dateline.sub(/(?:#{cities.join('|')})[縣市]?\z/, '')

  byline[/記者(.+)/, 1]
end
|
#reproduced? ⇒ Boolean
52
53
54
|
# File 'lib/taiwanese_news_parser/parser/cna.rb', line 52
# Reports whether the article is flagged as reproduced; this parser
# unconditionally answers no.
#
# @return [Boolean] always false
def reproduced?
  false
end
|