Class: WebPageParser::WashingtonPostPageParserV1

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/washingtonpost_page_parser.rb

Overview

WashingtonPostPageParserV1 parses Washington Post web pages using HTML parsing. It has not worked since 2018.

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseParser

#guid, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#contentObject



34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 34

# Returns the story body as an array of paragraph strings.
#
# Every descendant of the known article containers is considered; only
# <p> and <blockquote> elements whose class attribute does not contain
# "pin-and-stack" are kept. Embedded <script>/<object> nodes are removed
# from each kept element before its text is taken, and paragraphs that
# are empty after stripping are dropped. The result is memoized in
# @content.
def content
  @content ||= begin
    selector = 'div.article_body *,div#main-content article *,article[itemprop="articleBody"] *'
    paragraphs = html_doc.css(selector).select do |node|
      pinned = node.attributes['class'].to_s["pin-and-stack"]
      !pinned && %w[p blockquote].include?(node.name)
    end
    texts = paragraphs.map do |node|
      # Strip non-textual children so their contents do not leak into the text.
      node.search('script,object').remove
      node.text.strip
    end
    texts.reject(&:empty?)
  end
end

#dateObject



47
48
49
50
51
52
53
54
55
56
57
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 47

# Returns the article date as a DateTime, or nil when none can be parsed.
#
# The first yyyy/mm/dd sequence in the url is tried first; if that yields
# nothing, the DC.date.issued meta tag is used. Parse failures on either
# source are swallowed and treated as "no date". A successful result is
# memoized in @date.
def date
  return @date if @date

  # date in url is best source of first published date
  @date = begin
    DateTime.parse(url.scan(/[0-9]{4}\/[0-9]{2}\/[0-9]{2}/).first.to_s)
  rescue StandardError
    nil
  end
  return @date if @date

  # failing that, get DC.date.issued which is actually last updated
  issued = html_doc.at_css('meta[name="DC.date.issued"]')
  return @date unless issued

  @date = begin
    DateTime.parse(issued['content'])
  rescue StandardError
    nil
  end
end

#guid_from_urlObject

WashPo articles have a guid in the url (as of Jan 2014, a uuid)



21
22
23
24
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 21

# Extracts the article guid from the url.
#
# Returns the last run of 30-40 lowercase-hex-or-dash characters found in
# the url (the shape of a uuid), or nil when the url contains none.
def guid_from_url
  candidates = url.to_s.scan(/[a-f0-9-]{30,40}/)
  candidates.last
end

#html_docObject



26
27
28
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 26

# Returns the page parsed by Nokogiri::HTML, parsing it on first use and
# reusing the cached document (@html_document) afterwards.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#titleObject



30
31
32
# File 'lib/web-page-parser/parsers/washingtonpost_page_parser.rb', line 30

# Returns the article headline: the stripped text of the nodes matching
# the dc.title <h1> or the <h1> directly under div#article-topper.
# The value is memoized in @title.
def title
  return @title if @title

  headline = html_doc.css('h1[property="dc.title"],div#article-topper > h1')
  @title = headline.text.strip
end