Class: WebPageParser::RTPageParserV1
Overview
RTPageParserV1 parses RT web pages using HTML parsing (Nokogiri with CSS selectors) to extract the article title, date and content.
Instance Attribute Summary
Attributes inherited from BaseParser
#url
Instance Method Summary
collapse
Methods inherited from BaseParser
#guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page
Instance Method Details
#content ⇒ Object
28
29
30
31
32
33
34
35
36
37
|
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 28
# Extracts the article text from the parsed page.
#
# The result is built from the text of the `div.article__summary` element
# (placed first, when it is not blank) followed by the text of every direct
# `p`, `h2` and `h3` child of `div.article__text`. Each entry is stripped of
# surrounding whitespace, and entries that are blank after stripping are
# dropped. The result is cached in @content, so the document is only walked
# once per parser instance.
#
# @return [Array<String>] summary (if any) followed by the body paragraphs
def content
  return @content if @content

  story_summary = html_doc.css('div.article__summary').text.strip
  text_nodes = html_doc.css('div.article__text > *').select do |e|
    %w[p h2 h3].include?(e.name)
  end
  # Strip first, then filter: whitespace-only paragraphs must not survive
  # as empty strings (and no nil placeholders are ever produced).
  story_body = text_nodes.map { |e| e.text.strip }.reject(&:empty?)
  story_body.unshift(story_summary) unless story_summary.empty?
  @content = story_body
end
|
#date ⇒ Object
39
40
41
|
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 39
# Parses the article timestamp from the page.
#
# Reads the text of the node matched by `div.article time.date:first`,
# strips it and hands it to DateTime.parse. The parsed value is cached
# in @date.
#
# NOTE(review): there is no nil check on the at_css result — presumably
# this raises when the time element is missing; confirm that is intended.
#
# @return [DateTime] the parsed article date
def date
  return @date if @date

  timestamp = html_doc.at_css('div.article time.date:first').text.strip
  @date = DateTime.parse(timestamp)
end
|
#filter_url(url) ⇒ Object
43
44
45
46
|
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 43
# Normalises a URL by converting it to a String and replacing every
# occurrence of the host "www.guprod.gnl" with "www.guardian.co.uk".
#
# NOTE(review): both hostnames are Guardian ones; this looks carried over
# from another parser — confirm whether any rewrite is needed for RT URLs.
#
# @param url [#to_s] the URL to filter
# @return [String] the URL with the hostname substitution applied
def filter_url(url)
  address = url.to_s
  address.gsub("www.guprod.gnl", "www.guardian.co.uk")
end
|
#html_doc ⇒ Object
20
21
22
|
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 20
# Returns the page parsed by Nokogiri::HTML.
#
# The parsed document is cached in @html_document, so `page` is only
# parsed on the first call.
#
# @return [Object] whatever Nokogiri::HTML(page) returns
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
|
#title ⇒ Object
24
25
26
|
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 24
# Returns the article headline.
#
# Takes the text of the `h1.article__heading` match(es) in the parsed
# document and strips surrounding whitespace. The value is cached in @title.
#
# @return [String] the stripped heading text
def title
  return @title if @title

  heading = html_doc.css('h1.article__heading')
  @title = heading.text.strip
end
|