Class: WebPageParser::GuardianPageParserV2
- Inherits:
-
BaseParser
- Object
- BaseParser
- WebPageParser::GuardianPageParserV2
show all
- Defined in:
- lib/web-page-parser/parsers/guardian_page_parser.rb
Overview
GuardianPageParserV2 parses Guardian web pages using HTML parsing. It can parse both old and new articles, but its results sometimes differ slightly from those of the V1 parser because it strips most HTML tags (such as <strong>), which the V1 parser did not do.
Instance Attribute Summary
Attributes inherited from BaseParser
#url
Instance Method Summary
collapse
Methods inherited from BaseParser
#guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page
Instance Method Details
#content ⇒ Object
77
78
79
80
81
82
83
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 77
# Extracts the article body as a list of text blocks.
#
# Selects every <p>, <h2> and <h3> element found beneath either
# div#article-body-blocks or div[itemprop=articleBody], strips the
# surrounding whitespace from each element's text and drops the blank ones.
# The resulting array is cached in @content.
#
# @return [Array<String>] stripped, non-empty text of each matching element
def content
  return @content if @content

  blocks = html_doc.css('div#article-body-blocks *, div[itemprop=articleBody] *').select do |e|
    %w[p h2 h3].include?(e.name)
  end
  # Strip before testing for emptiness so that whitespace-only elements are
  # dropped rather than leaking through as "" entries.
  @content = blocks.map { |e| e.text.strip }.reject(&:empty?)
end
|
#css_first_text(top_e, *selectors) ⇒ Object
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 59
# Finds the first non-blank text among the elements matched by the given
# CSS selectors.
#
# Selectors are tried in the order given; within a selector, matches are
# visited in the order returned by top_e.css. Nil entries and entries whose
# stripped text is empty are skipped.
#
# @param top_e [#css] node or document to search within
# @param selectors [Array<String>] CSS selectors, in priority order
# @return [String, nil] the stripped text of the first match that has any,
#   or nil when nothing matches
def css_first_text(top_e, *selectors)
  selectors.each do |selector|
    top_e.css(selector).each do |node|
      stripped = node.nil? ? '' : node.text.strip
      return stripped unless stripped.empty?
    end
  end
  nil
end
|
#date ⇒ Object
85
86
87
88
89
90
91
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 85
# Returns the article's publication time.
#
# Reads the content attribute of the
# <meta property="article:published_time"> tag and parses it with
# DateTime.parse. A successfully parsed value is cached in @date.
#
# @return [DateTime, nil] the parsed time, or nil when the tag is missing
#   or its content cannot be parsed as a date
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[property="article:published_time"]')
  return nil unless date_meta

  begin
    @date = DateTime.parse(date_meta['content'])
  rescue ArgumentError, TypeError
    # ArgumentError (including Date::Error) means the value is not a date;
    # TypeError means the content attribute was absent (nil). Anything else
    # is unexpected and is allowed to propagate instead of being hidden.
    @date = nil
  end
end
|
#filter_url(url) ⇒ Object
93
94
95
96
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 93
# Replaces every occurrence of the host name "www.guprod.gnl" with
# "www.guardian.co.uk" in the string form of the given URL.
#
# @param url [#to_s] the URL to rewrite; converted with to_s first
# @return [String] a new string with the host name substituted
def filter_url(url)
  address = url.to_s
  address.gsub('www.guprod.gnl', 'www.guardian.co.uk')
end
|
#html_doc ⇒ Object
55
56
57
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 55
# Returns the result of Nokogiri::HTML(page), building it on first use and
# caching it in @html_document for later calls.
#
# @return [Object] whatever Nokogiri::HTML returns for the page source
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
|
#title ⇒ Object
70
71
72
73
74
75
|
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 70
# Returns the article headline.
#
# Uses the first non-blank text matched by h1[itemprop=headline] or
# div#main-article-info h1:first. When neither matches, falls back to the
# part of the page's <title> text before the first "|". The result is
# cached in @title.
#
# @return [String] the headline; an empty string when the fallback <title>
#   text is missing or empty
def title
  return @title if @title

  @title = css_first_text(html_doc, 'h1[itemprop=headline]', 'div#main-article-info h1:first')
  if @title.nil?
    # ''.split('|') is [], so #first is nil when the page has no <title>
    # text; to_s keeps that case from raising NoMethodError on #strip.
    @title = html_doc.css('title').text.split('|').first.to_s.strip
  end
  @title
end
|