Class: TaiwaneseNewsParser::Parser

Inherits:
Object
  • Object
show all
Extended by:
Memoist
Defined in:
lib/taiwanese_news_parser/parser.rb

Defined Under Namespace

Classes: AppleDaily, ChinaTimes, ChinaTimesMoney, Cna, Cts, Ettoday, LibertyTimes, LibertyTimesBig5, LibertyTimesNews, NowNews, Tvbs, Udn

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Parser

Returns a new instance of Parser.



25
26
27
28
29
30
31
# File 'lib/taiwanese_news_parser/parser.rb', line 25

# Builds a parser bound to +url+ and seeds the article hash with the
# values that can be derived from the URL alone.
#
# @param url [String] address of the article; stored in @url and passed to
#   the class-level +parse_url_id+
def initialize(url)
  @url = url
  parser = self.class
  @article = {
    :url => url,
    :web_domain => parser.domain,
    :url_id => parser.parse_url_id(url)
  }
end

Instance Attribute Details

#article ⇒ Object (readonly)

Returns the value of attribute article.



8
9
10
# File 'lib/taiwanese_news_parser/parser.rb', line 8

# Reader for the article hash held in @article.
#
# #initialize seeds it with :url, :web_domain and :url_id; #clean_up
# strips its text fields in place and adds :reproduced.
#
# @return [Hash] the same hash object on every call (not a copy)
def article
  @article
end

#url ⇒ Object

Returns the value of attribute url.



7
8
9
# File 'lib/taiwanese_news_parser/parser.rb', line 7

# Reader for @url, which #initialize sets to the URL it was given.
#
# @return [Object] the current value of @url
def url
  @url
end

Class Method Details

.applicable?(url) ⇒ Boolean

Returns:

  • (Boolean)


10
11
12
# File 'lib/taiwanese_news_parser/parser.rb', line 10

# Tells whether this parser class handles +url+.
#
# The check is a plain substring match: the class's +domain+ may appear
# anywhere in +url+.
#
# @param url [String] address to test
# @return [Boolean] true when +url+ contains +domain+
def self.applicable?(url)
  handled_domain = domain
  url.include?(handled_domain)
end

.applicable_parser(url) ⇒ Object



14
15
16
17
18
19
20
21
22
23
# File 'lib/taiwanese_news_parser/parser.rb', line 14

# Fetches +url+ (following redirects) and builds the parser whose domain
# matches the final address.
#
# The fetch goes through the parsed URI's own #open (provided by open-uri)
# instead of Kernel#open: Kernel#open treats a string starting with "|" as
# a command to run, and since Ruby 3.0 it no longer fetches URLs at all.
# The block form closes the response (which may be a Tempfile) once the
# final address has been read.
#
# NOTE(review): the whole response body is still downloaded just to learn
# the redirect target, as in the original implementation.
#
# @param url [String, URI] address of the article
# @return [Parser, nil] an instance of the first class in +subclasses+ whose
#   +applicable?+ accepts the redirected URL, or nil when none does
# @raise [ArgumentError] if +url+ is not something open-uri can fetch
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def self.applicable_parser(url)
  uri = URI(url)
  # Only URI classes extended by open-uri expose a public #open; anything
  # else (relative paths, unknown schemes) is rejected before any I/O.
  raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:open)

  redirected_url = uri.open { |response| response.base_uri.to_s }

  matching_class = subclasses.find { |candidate| candidate.applicable?(redirected_url) }
  matching_class.new(redirected_url) if matching_class
end

.domain ⇒ Object

Raises:

  • (NotImplementedError)


56
57
58
# File 'lib/taiwanese_news_parser/parser.rb', line 56

# Abstract hook: the domain string a parser class is responsible for.
# Used by +applicable?+ and stored as :web_domain by #initialize.
#
# This base implementation always raises; the message names the receiver
# so the class that failed to override it can be identified.
#
# @raise [NotImplementedError] always
def self.domain
  raise NotImplementedError, "#{self} must implement .domain"
end

.subclasses ⇒ Object



52
53
54
# File 'lib/taiwanese_news_parser/parser.rb', line 52

# Lists the concrete parser classes in the order +applicable_parser+
# tries them.
#
# @return [Array<Class>] a new array on every call
def self.subclasses
  [
    Udn,
    LibertyTimes,
    LibertyTimesBig5,
    LibertyTimesNews,
    ChinaTimes,
    ChinaTimesMoney,
    Cna,
    AppleDaily,
    Ettoday,
    Tvbs,
    Cts,
    NowNews
  ]
end

Instance Method Details

#clean_up ⇒ Object



39
40
41
42
43
44
45
# File 'lib/taiwanese_news_parser/parser.rb', line 39

# Normalises the parsed article in place.
#
# Strips surrounding whitespace from the text fields that are present
# (mutating the stored strings themselves), runs +clean_url+ when the
# parser publicly responds to it, and records the result of #reproduced?
# under :reproduced.
#
# @return [Boolean] the value stored in @article[:reproduced]
def clean_up
  %i[content title reporter_name company_name].each do |field|
    text = @article[field]
    next unless text

    # In-place on purpose: other references to the same string see the
    # stripped value too.
    text.strip!
  end

  clean_url if respond_to?(:clean_url)
  @article[:reproduced] = reproduced?
end

#doc ⇒ Object



33
34
35
36
# File 'lib/taiwanese_news_parser/parser.rb', line 33

# Downloads the page at +url+ and parses it.
#
# The body is transcoded from Big5 to UTF-8, dropping bytes that are
# invalid or have no UTF-8 equivalent. The transcoded text is kept in
# @raw and the parsed document in @doc.
#
# The fetch goes through the parsed URI's own #open (provided by open-uri)
# instead of Kernel#open: Kernel#open treats a string starting with "|" as
# a command to run, and since Ruby 3.0 it no longer fetches URLs at all.
# The block form closes the response once the body has been read.
#
# @return [Object] the document returned by Nokogiri::HTML
# @raise [ArgumentError] if +url+ is not something open-uri can fetch
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def doc
  uri = URI(url)
  # Only URI classes extended by open-uri expose a public #open.
  raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:open)

  body = uri.open { |response| response.read }
  @raw = body.encode('utf-8', 'big5', :invalid => :replace, :undef => :replace, :replace => '')
  @doc = ::Nokogiri::HTML(@raw, url)
end

#reproduced? ⇒ Boolean

Returns:

  • (Boolean)


47
48
49
# File 'lib/taiwanese_news_parser/parser.rb', line 47

# Tells whether the article's company name is absent from the parser
# class's +names+ list.
#
# @return [Boolean] true when +parse_company_name+ is not in
#   +self.class.names+
def reproduced?
  known_names = self.class.names
  company = parse_company_name
  !known_names.include?(company)
end