Class: TaiwaneseNewsParser::Parser::AppleDaily
Instance Attribute Summary
#article, #url
Class Method Summary
collapse
Instance Method Summary
collapse
applicable?, applicable_parser, #clean_up, #initialize, #reproduced?, subclasses
Class Method Details
.domain ⇒ Object
2
3
4
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 2
# Domain string associated with this parser.
#
# @return [String] always "appledaily.com.tw"
def self.domain
  "appledaily.com.tw"
end
|
.names ⇒ Object
6
7
8
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 6
# Names associated with this parser.
#
# @return [Array<String>] a one-element array holding the Chinese name
def self.names
  ['蘋果日報']
end
|
.parse_time(raw_time) ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 55
# Parses a timestamp string, trying each supported layout in order
# (date with hour:minute first, then date only).
#
# Surrounding whitespace is stripped before parsing, and nil is treated as an
# empty string, so scraped text with stray newlines still parses.
#
# @param raw_time [String, nil] text such as "2013年05月01日12:30"
# @return [DateTime, nil] the parsed value, or nil when no layout matches
def self.parse_time(raw_time)
  text = raw_time.to_s.strip
  ['%Y年%m月%d日%H:%M', '%Y年%m月%d日'].each do |format|
    begin
      return DateTime.strptime(text, format)
    rescue ArgumentError
      # strptime signals a layout mismatch with ArgumentError (Date::Error on
      # newer Rubies is a subclass); fall through to the next layout.
      next
    end
  end
  nil
end
|
.parse_url_id(url) ⇒ Object
50
51
52
53
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 50
# Extracts the numeric id path (for example "20130501/35012345") from an
# article URL.
#
# The id is the run of one or more "digits/" segments that follows
# "/<section>/article/<category>/" in the URL; the trailing slash is removed.
#
# @param url [String, nil] article URL (http or https)
# @return [String, nil] the id path, or nil when the URL does not match
def self.parse_url_id(url)
  # Dots are escaped so that only the literal host matches.
  pattern = %r{https?://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)}
  id_path = url.to_s[pattern, 1]
  # The capture always ends with "/", so chomp removes exactly that slash.
  id_path && id_path.chomp('/')
end
|
Instance Method Details
#clean_url ⇒ Object
46
47
48
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 46
# Strips the final "/segment" from the stored article URL, modifying the
# string held in @article[:url] in place.
#
# @return [String, nil] the mutated URL string, or nil when nothing was
#   removed (the return value of String#gsub!)
def clean_url
  address = @article[:url]
  address.gsub!(%r{/([^/]*)$}, '')
end
|
#doc ⇒ Object
10
11
12
13
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 10
# Downloads the article page and returns it parsed by Nokogiri.
#
# The result is memoized: the page is fetched and parsed on the first call
# only, and later calls return the same document. The raw body is kept in
# @raw.
#
# NOTE(review): this goes through Kernel#open, which presumably relies on
# open-uri being loaded elsewhere so that URLs work. Kernel#open treats
# strings beginning with "|" as commands, so `url` must come from a trusted
# source.
#
# @return [Object] whatever Nokogiri::HTML returns for the fetched body
def doc
  return @doc if @doc

  # Block form so the underlying IO is closed as soon as it has been read.
  @raw = open(url, &:read)
  @doc = Nokogiri::HTML(@raw)
end
|
#parse ⇒ Object
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 16
# Fills @article with the fields scraped from the article page and returns it.
#
# The page is obtained from #doc once and reused for every lookup, so the
# title, content and timestamp all come from the same document.
#
# Fields written: :title, :company_name, :content, :reporter_name and
# :published_at. #clean_up is called before returning.
#
# NOTE(review): if the page has no "#h1" element, at_css presumably returns
# nil and the `.text` call raises NoMethodError, as before — confirm whether
# that is the intended failure mode.
#
# @return [Hash] the populated @article
def parse
  page = doc
  @article[:title] = page.at_css('#h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('.articulum').css('p,h2').text
  @article[:reporter_name] = parse_reporter_name
  @article[:published_at] = self.class.parse_time(page.css('.gggs time').text)
  clean_up
  @article
end
|
#parse_company_name ⇒ Object
32
33
34
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 32
# Company name recorded for parsed articles; a fixed value that does not
# depend on the page.
#
# @return [String] always "蘋果日報"
def parse_company_name
  "蘋果日報"
end
|
#parse_reporter_name ⇒ Object
36
37
38
39
40
41
42
43
44
|
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 36
# Extracts the reporter's name from the article body text.
#
# Two byline shapes are recognised, tried in this order:
#   1. "◎記者<name>" — the name runs to the end of that line.
#   2. "【記者<name>/..." or "【<name>/..." — the name runs up to the first
#      slash, which may be ASCII "/", full-width "／" or box-drawing "╱".
#
# @return [String, nil] the captured name, or nil when neither shape is found
def parse_reporter_name
  text = doc.css('.articulum').css('p,h2').text.strip
  # The slash class lists each accepted separator once; the full-width form
  # is included because bylines may use it in place of the ASCII slash.
  match = text.match(%r{◎記者(.+)$}) || text.match(%r{【(?:記者)?(.+?)[/／╱]})
  match && match[1]
end
|