Class: WebPageParser::RTPageParserV1

Inherits:

BaseParser

Object
BaseParser
WebPageParser::RTPageParserV1

show all

Defined in: lib/web-page-parser/parsers/rt_page_parser.rb

Overview

RTPageParserV1 parses RT web pages by parsing their HTML.

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseParser

#guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ `Object`

# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 28

# Returns the article text as an array of stripped strings: the summary
# (when present) followed by every non-blank paragraph or sub-heading.
#
# Only direct children of `div.article__text` that are `p`, `h2` or `h3`
# elements are considered. The result is memoized in @content.
#
# @return [Array<String>] summary and body text, in document order
def content
  return @content if @content

  story_summary = html_doc.css('div.article__summary').text.strip
  story_body = html_doc.css('div.article__text > *').select do |e|
    %w[p h2 h3].include?(e.name)
  end
  # Strip before filtering so whitespace-only elements are dropped along
  # with truly empty ones, and no nil placeholders reach the caller.
  story_body = story_body.map { |e| e.text.strip }.reject(&:empty?)
  story_body.unshift(story_summary) unless story_summary.empty?
  # Store the result so the guard clause at the top can short-circuit
  # on subsequent calls.
  @content = story_body
end

#date ⇒ `Object`



39
40
41

# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 39

# Returns the article's publication timestamp, memoized in @date.
#
# The text of the element matched by `div.article time.date:first` is
# stripped and handed to DateTime.parse.
#
# @return [DateTime] the parsed timestamp
def date
  @date ||= begin
    timestamp = html_doc.at_css('div.article time.date:first').text
    DateTime.parse(timestamp.strip)
  end
end

#filter_url(url) ⇒ `Object`

# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 43

# Returns +url+ as a String with every occurrence of the host
# "www.guprod.gnl" replaced by "www.guardian.co.uk".
#
# NOTE(review): this rewrite targets Guardian hosts and looks inherited
# from a Guardian parser; confirm whether it is needed for RT pages.
#
# @param url [#to_s] the URL to normalise
# @return [String] the URL with the host substitution applied
def filter_url(url)
  legacy_host = 'www.guprod.gnl'
  public_host = 'www.guardian.co.uk'
  url.to_s.gsub(legacy_host, public_host)
end

#html_doc ⇒ `Object`



20
21
22

# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 20

# Returns the result of Nokogiri::HTML(page), memoized in @html_document
# so the page is parsed at most once per parser instance.
#
# @return [Object] whatever Nokogiri::HTML returns for the page
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ `Object`



24
25
26

# File 'lib/web-page-parser/parsers/rt_page_parser.rb', line 24

# Returns the stripped text of the `h1.article__heading` element(s),
# memoized in @title.
#
# @return [String] the article headline text
def title
  return @title if @title

  heading = html_doc.css('h1.article__heading')
  @title = heading.text.strip
end

Class: WebPageParser::RTPageParserV1

Overview

Instance Attribute Summary

Attributes inherited from BaseParser

Instance Method Summary collapse

Methods inherited from BaseParser

Constructor Details

Instance Method Details

#content ⇒ Object

#date ⇒ Object

#filter_url(url) ⇒ Object

#html_doc ⇒ Object

#title ⇒ Object

#content ⇒ `Object`

#date ⇒ `Object`

#filter_url(url) ⇒ `Object`

#html_doc ⇒ `Object`

#title ⇒ `Object`