Class: TaiwaneseNewsParser::Parser::LibertyTimesNews
Instance Attribute Summary
#article, #url
Class Method Summary
collapse
Instance Method Summary
collapse
applicable_parser, #clean_up, #initialize, #reproduced?, subclasses
Class Method Details
.applicable?(url) ⇒ Boolean
10
11
12
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 10
# Tells whether this parser should handle the given URL.
#
# @param url [#include?] the article URL to test
# @return [Boolean] true when the URL contains the Liberty Times news host
def self.applicable?(url)
  news_host = 'news.ltn.com.tw'
  url.include?(news_host)
end
|
.domain ⇒ Object
2
3
4
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 2
# Domain name of the news site this parser covers.
#
# @return [String] the bare domain, without scheme or subdomain
def self.domain
  "ltn.com.tw"
end
|
.names ⇒ Object
6
7
8
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 6
# Name of the news outlet this parser covers.
#
# NOTE(review): the method name is plural, but a single String is returned
# (the original used a %{} string literal, not a %w{} word array) -- confirm
# whether callers expect an Array before changing it.
#
# @return [String] the outlet name
def self.names
  '自由時報'
end
|
.parse_url_id(url) ⇒ Object
63
64
65
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 63
# Extracts the numeric article id from an article URL.
#
# The id is the first run of digits that follows three slash-separated
# word segments (e.g. ".../news/politics/breakingnews/1234567").
#
# @param url [String] the article URL
# @return [String, nil] the id digits, or nil when the URL has no such path
def self.parse_url_id(url)
  found = url.match(%r{\w+/\w+/\w+/(\d+)})
  found ? found[1] : nil
end
|
Instance Method Details
#clean_url ⇒ Object
58
59
60
61
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 58
# Replaces @article[:url] with the result of passing it through
# TaiwaneseNewsParser::UrlCleaner#clean.
#
# @return the cleaned URL that was stored
def clean_url
  @article[:url] = TaiwaneseNewsParser::UrlCleaner.new.clean(@article[:url])
end
|
#doc ⇒ Object
14
15
16
17
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 14
# Fetches the page at +url+ and parses it with Nokogiri::HTML.
#
# The parsed document is memoized in @doc, so repeated calls reuse the first
# result instead of downloading and parsing the page again. The raw body is
# kept in @raw.
#
# NOTE(review): Kernel#open only handles URLs while open-uri is loaded, and
# stops doing so on Ruby 3.0+; consider URI.open once the minimum supported
# Ruby version is confirmed.
#
# @return the document returned by Nokogiri::HTML
def doc
  @doc ||= begin
    # Block form closes the underlying IO as soon as the body has been read.
    @raw = open(url) { |io| io.read }
    Nokogiri::HTML(@raw)
  end
end
|
#parse ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 20
# Extracts the article fields from the page and stores them in @article.
#
# Fills :title, :company_name, :content, :published_at and :reporter_name,
# then calls clean_up.
#
# :published_at is taken from the text of the first "#newstext span":
# * a full "YYYY-M-D hh:mm" stamp is parsed as-is (seconds set to 00);
# * a bare "hh:mm" is combined with today's local date;
# * when the span is missing or holds neither form, it is set to nil
#   instead of raising.
#
# @return the populated @article
def parse
  # Resolve the document once and reuse it for every lookup below.
  page = doc

  @article[:title] = page.at_css('.content h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('#newstext p').text

  time_node = page.at_css('#newstext span')
  time_text = time_node ? time_node.text : ''
  full_stamp = time_text[%r{\d{4}-\d{1,2}-\d{1,2}\W*\d{2}:\d{2}}]

  @article[:published_at] =
    if full_stamp
      Time.parse("#{full_stamp}:00")
    elsif (clock = time_text.match(%r{(\d{2}):(\d{2})}))
      # Only a time of day was published; assume it refers to today.
      now = Time.now
      Time.new(now.year, now.month, now.day, clock[1].to_i, clock[2].to_i)
    end

  # Must run after :content is set, since the reporter is read from it.
  @article[:reporter_name] = parse_reporter_name
  clean_up
  @article
end
|
#parse_company_name ⇒ Object
54
55
56
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 54
# Company name recorded for every article handled by this parser.
#
# @return [String] the fixed outlet name
def parse_company_name
  "自由時報"
end
|
#parse_reporter_name ⇒ Object
43
44
45
46
47
48
49
50
51
52
|
# File 'lib/taiwanese_news_parser/parser/liberty_times_news.rb', line 43
def parse_reporter_name
if match = @article[:content].match(%r{〔(.*?)[//╱](.*?)〕})
reporter_name = match[1][%r{記者(.+)},1]
elsif match = @article[:content].match(%r{記者(.+?)[//╱]})
reporter_name = match[1]
elsif match = @article[:content].match(%r{(文/(.*?))})
reporter_name = match[1]
end
reporter_name
end
|