Class: TaiwaneseNewsParser::Parser::AppleDaily

Inherits:
TaiwaneseNewsParser::Parser show all
Defined in:
lib/taiwanese_news_parser/parser/apple_daily.rb

Instance Attribute Summary

Attributes inherited from TaiwaneseNewsParser::Parser

#article, #url

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from TaiwaneseNewsParser::Parser

applicable?, applicable_parser, #clean_up, #initialize, #reproduced?, subclasses

Constructor Details

This class inherits a constructor from TaiwaneseNewsParser::Parser

Class Method Details

.domain ⇒ Object



2
3
4
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 2

# Host name associated with this parser.
#
# NOTE(review): presumably consulted by the inherited applicability lookup
# to route URLs to this parser — confirm against TaiwaneseNewsParser::Parser.
#
# @return [String] the literal 'appledaily.com.tw'
def self.domain
  'appledaily.com.tw'
end

.names ⇒ Object



6
7
8
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 6

# Names this news outlet is known by.
#
# @return [Array<String>] a single-element list holding the outlet's name
def self.names
  ['蘋果日報']
end

.parse_time(raw_time) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 55

# Parses a timestamp string into a DateTime.
#
# The supported formats are tried in order (date with time first, then
# date only) and the first one that parses wins. Surrounding whitespace is
# ignored and nil is treated as an empty string.
#
# DateTime is kept as the return type so existing callers see the same
# kind of object as before.
#
# @param raw_time [String, nil] text such as "2013年05月01日12:30"
# @return [DateTime, nil] the parsed time, or nil when no format matches
def self.parse_time(raw_time)
  valid_formats = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日']

  valid_formats.each do |format|
    begin
      return DateTime.strptime(raw_time.to_s.strip, format)
    rescue ArgumentError
      # Input does not fit this format (Date::Error is an ArgumentError);
      # fall through to the next candidate. Other errors are not swallowed.
      next
    end
  end

  nil
end

.parse_url_id(url) ⇒ Object



50
51
52
53
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 50

# Extracts the numeric id path from an article URL.
#
# For ".../article/headline/20130101/34741921/title" the id is
# "20130101/34741921": every run of "digits/" following the section
# segment, with the final slash removed.
#
# @param url [String, nil] article URL (http or https)
# @return [String, nil] the id path, or nil when url is not an article URL
def self.parse_url_id(url)
  article_pattern = %r{https?://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)}
  id_path = url.to_s[article_pattern, 1]

  # The capture always ends with a slash; drop it. A non-matching URL
  # yields nil instead of raising NoMethodError on nil.
  id_path && id_path.chomp('/')
end

Instance Method Details

#clean_url ⇒ Object



46
47
48
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 46

# Strips the final path segment (the last "/" and everything after it on
# the line) from the article URL, modifying the stored string in place.
#
# @return [String, nil] the mutated URL, or nil when nothing was removed
def clean_url
  last_segment = %r{/([^/]*)$}
  @article[:url].gsub!(last_segment, '')
end

#doc ⇒ Object



10
11
12
13
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 10

# Returns the parsed HTML document for this article's url.
#
# The page is downloaded and parsed on the first call only; later calls
# return the cached document instead of fetching again. The raw response
# body is kept in @raw.
#
# NOTE(review): the cache is not invalidated if url changes after the
# first call — confirm no caller relies on a re-fetch.
# NOTE(review): bare `open` is kept to match the original; it depends on
# open-uri's Kernel#open patch for URLs and will run a command for input
# starting with "|". Consider URI.open if the supported Ruby versions allow.
#
# @return [Nokogiri::HTML::Document] the parsed page
def doc
  return @doc if @doc

  # Block form closes the underlying handle once the body has been read.
  @raw = open(url, &:read)
  @doc = Nokogiri::HTML(@raw)
end

#parse ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 16

# Fills @article with the fields scraped from the article page.
#
# Sets :title, :company_name, :content, :reporter_name and :published_at,
# then runs the inherited clean_up step.
#
# The document is requested from #doc once and reused for every lookup in
# this method, rather than calling #doc again for each field.
#
# @return [Hash] the populated @article
# @raise [NoMethodError] if the page has no '#h1' element (at_css gives nil)
def parse
  page = doc

  @article[:title] = page.at_css('#h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('.articulum').css('p,h2').text
  @article[:reporter_name] = parse_reporter_name
  @article[:published_at] = self.class.parse_time(page.css('.gggs time').text)

  clean_up

  @article
end

#parse_company_name ⇒ Object



32
33
34
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 32

# Company name recorded for every article handled by this parser.
# It is a fixed value; nothing is read from the page.
#
# @return [String] the literal '蘋果日報'
def parse_company_name
  '蘋果日報'
end

#parse_reporter_name ⇒ Object



36
37
38
39
40
41
42
43
44
# File 'lib/taiwanese_news_parser/parser/apple_daily.rb', line 36

# Pulls the reporter's name out of the article body text.
#
# Two byline shapes are recognised, checked in this order:
#   1. "◎記者NAME" running to the end of a line
#   2. "【記者NAME/..." or "【NAME╱...", where the "記者" prefix is optional
#
# @return [String, nil] the captured name, or nil when neither shape is found
def parse_reporter_name
  body = doc.css('.articulum').css('p,h2').text.strip

  trailing_byline = body.match(%r{◎記者(.+)$})
  return trailing_byline[1] if trailing_byline

  bracket_byline = body.match(%r{【(?:記者)?(.+?)[//╱]})
  bracket_byline && bracket_byline[1]
end