Class: WebPageParser::WashingtonPostPageParserV2

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/washingtonpost_page_parser.rb

Overview

WashingtonPostPageParserV2 parses Washington Post (WashPo) web pages using HTML parsing. It has worked since June 2018.

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseParser

#guid, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#contentObject



79
80
81
82
83
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 79

# Extracts the story text from the first <article> element of the page.
#
# Collects the text of every node matched by the CSS selector (paragraphs
# and "subhead" divs), strips surrounding whitespace, and drops entries
# that end up empty. The result is memoized in @content.
#
# @return [Array<String>] the non-empty, stripped text fragments
def content
  @content ||= html_doc
               .css('article:first p,article:first div.subhead')
               .map { |node| node.text.strip }
               .reject(&:empty?)
end

#dateObject



85
86
87
88
89
90
91
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 85

# Reads the publication date from the element carrying
# itemprop="datePublished" and converts it to a zero-offset DateTime.
#
# A successfully parsed value is memoized in @date. If the element is
# missing, or its "content" attribute cannot be parsed, nil is returned.
#
# @return [DateTime, nil] the publication time at offset 0, or nil
def date
  unless @date
    published = html_doc.at_css('*[itemprop="datePublished"]')
    if published
      # Best-effort parse: any StandardError leaves the date as nil.
      @date = begin
        DateTime.parse(published['content']).new_offset(0)
      rescue StandardError
        nil
      end
    end
  end
  @date
end

#guid_from_urlObject

WashPo articles have a UUID in the URL.



66
67
68
69
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 66

# Pulls a UUID-like identifier out of the page URL.
#
# Scans the URL string for runs of 30 to 40 lowercase hex digits and
# hyphens and returns the last such run.
#
# @return [String, nil] the last matching run, or nil when none is found
def guid_from_url
  uuid_like_runs = url.to_s.scan(/[a-f0-9-]{30,40}/)
  uuid_like_runs[-1]
end

#html_docObject



71
72
73
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 71

# Returns the page parsed by Nokogiri::HTML, parsing it on first use and
# caching the result in @html_document for later calls.
#
# @return [Object] whatever Nokogiri::HTML returns for the page
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#titleObject



75
76
77
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 75

# Returns the article headline: the whitespace-stripped text of the
# h1 element(s) marked itemprop="headline". Memoized in @title.
#
# @return [String] the stripped headline text
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop="headline"]')
  @title = headline.text.strip
end