Class: WebPageParser::GuardianPageParserV2

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/guardian_page_parser.rb

Overview

GuardianPageParserV2 parses Guardian web pages using HTML parsing. It can parse both old and new articles, but its results sometimes differ slightly from the V1 parser's because it strips most HTML tags (like <strong>), which the V1 parser didn’t do.

Direct Known Subclasses

GuardianPageParserV3

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseParser

#guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ Object



77
78
79
80
81
82
83
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 77

# Extracts the article body as a list of text fragments.
#
# Looks at every descendant of `div#article-body-blocks` and
# `div[itemprop=articleBody]`, keeps the `p`, `h2` and `h3` elements, and
# returns their whitespace-stripped text. Elements whose text is the empty
# string are dropped.
#
# The result is memoized in @content, so the document is only queried once.
#
# @return [Array<String>] stripped text of each matching element, in the
#   order returned by the CSS query
def content
  return @content if @content
  story_body = html_doc.css('div#article-body-blocks *, div[itemprop=articleBody] *').select do |e|
    %w[p h2 h3].include?(e.name)
  end
  # Store the result: the guard above relies on @content being assigned.
  @content = story_body.map { |e| e.text.empty? ? nil : e.text.strip }.compact
end

#css_first_text(top_e, *selectors) ⇒ Object



59
60
61
62
63
64
65
66
67
68
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 59

# Finds the first non-blank text among the elements matched by a list of
# CSS selectors.
#
# Selectors are tried in the order given; within a selector, matches are
# visited in the order `top_e.css` yields them. Later selectors are not
# queried once a hit is found.
#
# @param top_e [#css] node or document to search beneath
# @param selectors [Array<String>] CSS selectors, highest priority first
# @return [String, nil] the stripped text of the first match whose text is
#   not empty after stripping, or nil when nothing qualifies
def css_first_text(top_e, *selectors)
  selectors.each do |selector|
    top_e.css(selector).each do |node|
      next if node.nil?
      candidate = node.text.strip
      next if candidate.empty?
      return candidate
    end
  end
  nil
end

#date ⇒ Object



85
86
87
88
89
90
91
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 85

# Reads the publication date from the page's
# `<meta property="article:published_time">` tag.
#
# A successfully parsed date is memoized in @date. When the tag is absent,
# or its `content` attribute is missing or unparseable, nil is returned.
#
# @return [DateTime, nil] the parsed publication time, or nil if unavailable
def date
  return @date if @date
  date_meta = html_doc.at_css('meta[property="article:published_time"]')
  return unless date_meta

  begin
    @date = DateTime.parse(date_meta['content'])
  rescue ArgumentError, TypeError
    # ArgumentError: the string is not a valid date; TypeError: no content
    # attribute (nil). Other errors are real bugs and must not be hidden.
    @date = nil
  end
  @date
end

#filter_url(url) ⇒ Object



93
94
95
96
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 93

# Rewrites URLs that point at the "www.guprod.gnl" host, which shows up in
# some older Guardian articles, so they use "www.guardian.co.uk" instead.
#
# @param url [#to_s] the URL to clean up; converted with `to_s` first
# @return [String] the URL with every occurrence of the old host replaced
def filter_url(url)
  legacy_host = "www.guprod.gnl"
  current_host = "www.guardian.co.uk"
  url.to_s.gsub(legacy_host, current_host)
end

#html_doc ⇒ Object



55
56
57
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 55

# Returns the page parsed by Nokogiri::HTML, building it from `page` on
# first use and caching it in @html_document for later calls.
#
# @return [Object] whatever Nokogiri::HTML(page) returned
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#title ⇒ Object



70
71
72
73
74
75
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 70

# Returns the article headline.
#
# Tries the `h1[itemprop=headline]` element first, then the first `h1`
# inside `div#main-article-info`. If neither yields text, falls back to the
# page's `<title>` element, keeping only the part before the first '|'.
# A found title is memoized in @title.
#
# @return [String, nil] the headline, or nil when the page has no usable
#   headline element and no `<title>` text
def title
  return @title if @title
  @title = css_first_text(html_doc, 'h1[itemprop=headline]', 'div#main-article-info h1:first')
  if @title.nil?
    # ''.split('|').first is nil when <title> is missing or empty, so guard
    # before stripping instead of raising NoMethodError.
    leading_part = html_doc.css('title').text.split('|').first
    @title = leading_part.strip if leading_part
  end
  @title
end