Class: WebPageParser::RTPageParserV1

Inherits:
BaseParser
Defined in:
lib/web-page-parser/parsers/rt_page_parser.rb

Overview

RTPageParserV1 parses RT web pages using html parsing.

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary

Methods inherited from BaseParser

#guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ Object



28
29
30
31
32
33
34
35
36
37
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 28

# Extracts the article text as an array of strings.
#
# The summary (+div.article__summary+), when non-blank, is placed first,
# followed by the stripped text of every direct child of
# +div.article__text+ that is a +p+, +h2+ or +h3+ element. Elements whose
# text is empty or whitespace-only are dropped, so the result never
# contains +nil+ or blank strings.
#
# The result is cached in +@content+; repeated calls return the same Array
# instance.
#
# @return [Array<String>] summary (if any) followed by body paragraphs/headings
def content
  return @content if @content

  story_summary = html_doc.css('div.article__summary').text.strip
  story_body = html_doc.css('div.article__text > *')
                       .select { |e| %w[p h2 h3].include?(e.name) }
                       .map { |e| e.text.strip }
                       .reject(&:empty?) # strip first so whitespace-only nodes are dropped too
  story_body.unshift(story_summary) unless story_summary.empty?
  # Assign the cache the guard clause above reads.
  @content = story_body
end

#date ⇒ Object



39
40
41
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 39

# Publication timestamp of the article.
#
# Reads the text of the node matched by +div.article time.date:first+,
# strips surrounding whitespace and parses it with +DateTime.parse+.
# The parsed value is cached in +@date+.
#
# @return [DateTime] the parsed timestamp
def date
  return @date if @date

  time_node = html_doc.at_css('div.article time.date:first')
  raw_stamp = time_node.text.strip
  @date = DateTime.parse(raw_stamp)
end

#filter_url(url) ⇒ Object



43
44
45
46
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 43

# Normalises a URL before it is used by the parser.
#
# The argument is converted with +to_s+ and every occurrence of the host
# "www.guprod.gnl" is replaced by "www.guardian.co.uk".
#
# NOTE(review): this rewrite targets Guardian hostnames and looks carried
# over from a Guardian parser; confirm whether it is needed for RT pages.
#
# @param url [#to_s] the URL to normalise
# @return [String] the URL with the hostname substitution applied
def filter_url(url)
  address = url.to_s
  address.gsub("www.guprod.gnl", "www.guardian.co.uk")
end

#html_doc ⇒ Object



20
21
22
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 20

# Parsed HTML document for the current page.
#
# Passes the result of +page+ to +Nokogiri::HTML+ on first use and caches
# the parsed document in +@html_document+ for subsequent calls.
#
# @return [Object] whatever +Nokogiri::HTML+ returns for the page source
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ Object



24
25
26
# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 24

# Headline of the article.
#
# Takes the text of the nodes matching +h1.article__heading+, strips
# surrounding whitespace, and caches the result in +@title+.
#
# @return [String] the stripped heading text
def title
  return @title if @title

  heading = html_doc.css('h1.article__heading')
  @title = heading.text.strip
end