Class: WebPageParser::TheInterceptPageParserV1

Inherits:
BaseParser
  • Object
show all
Defined in:
lib/web-page-parser/parsers/the_intercept_page_parser.rb

Overview

TheInterceptPageParserV1 parses “The Intercept” web pages using html parsing.

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseParser

#guid, #hash, #initialize, #page, #retrieve_page

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#contentObject



43
44
45
46
47
48
49
# File 'lib/web-page-parser/parsers/the_intercept_page_parser.rb', line 43

# Returns the story body as an Array of paragraph Strings.
#
# Paragraphs are read from the `article div.ti-body p` elements of the
# parsed page. Non-breaking spaces (U+00A0) are converted to ordinary spaces
# *before* stripping, because String#strip does not treat U+00A0 as
# whitespace; paragraphs that end up empty are dropped.
#
# The result is memoised in @content.
def content
  return @content if @content

  paragraphs = html_doc.css('article div.ti-body p').map do |node|
    # Convert non-breaking spaces to real spaces first so that strip can
    # remove them from the edges and blank paragraphs collapse to ''.
    node.text.tr("\u00A0", ' ').strip
  end
  @content = paragraphs.reject(&:empty?)
end

#dateObject



51
52
53
54
55
56
57
58
59
60
# File 'lib/web-page-parser/parsers/the_intercept_page_parser.rb', line 51

# Returns the publication date of the article as a DateTime, or nil when
# none can be determined.
#
# The `article:published_time` meta tag is tried first; any ISO 8601 UTC
# offset is accepted (+hh:mm, -hh:mm or Z). If the tag is missing, has no
# usable content, or cannot be parsed, the yyyy/mm/dd segment of the URL is
# used instead (yielding midnight of that day).
#
# A successfully parsed value is memoised in @date.
def date
  return @date if @date

  # Parses text into a DateTime, returning nil for nil or unparseable input.
  safe_parse = lambda do |text|
    begin
      DateTime.parse(text) if text
    rescue ArgumentError
      nil
    end
  end

  date_meta = html_doc.at_css('meta[property="article:published_time"]')
  if date_meta
    # to_s guards against a meta tag that has no content attribute.
    stamp = date_meta['content'].to_s[
      /[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:[+-][0-9]{2}:[0-9]{2}|Z)/
    ]
    @date = safe_parse.call(stamp)
  end
  return @date if @date

  # Failing that, get it from the url.
  @date = safe_parse.call(url.to_s[%r{[0-9]{4}/[0-9]{2}/[0-9]{2}}])
end

#guid_from_urlObject

The Intercept articles are identified by their article URL: the guid is the https://firstlook.org/theintercept/yyyy/mm/dd/slug portion of the URL.



20
21
22
23
# File 'lib/web-page-parser/parsers/the_intercept_page_parser.rb', line 20

# Extracts the article guid from the page URL.
#
# The guid is the article address itself: the
# https://firstlook.org/theintercept/yyyy/mm/dd/slug portion of the URL,
# without any trailing slash, query string or fragment. Returns the last
# such match as a String, or nil when the URL contains none.
def guid_from_url
  # Dots in the host name are escaped so that they only match a literal '.'.
  url.to_s.scan(%r{https://firstlook\.org/theintercept/[0-9]{4}/[0-9]{2}/[0-9]{2}/[a-z0-9-]+}).last
end

#html_docObject



25
26
27
# File 'lib/web-page-parser/parsers/the_intercept_page_parser.rb', line 25

# Returns the page parsed by Nokogiri::HTML, parsing it on first use and
# reusing the cached @html_document afterwards.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end

#titleObject



29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/web-page-parser/parsers/the_intercept_page_parser.rb', line 29

# Returns the article title as a String with surrounding whitespace and any
# trailing "- The Intercept" suffix removed.
#
# The `og:title` meta tag is preferred. When that tag is absent, or its
# content is missing or blank, the text of the document's <title> element is
# used instead.
#
# The result is memoised in @title.
def title
  return @title if @title

  og_title = html_doc.at_css('meta[property="og:title"]')
  text = og_title && og_title['content'].to_s.strip
  # A blank og:title carries no information, so treat it like a missing one.
  text = html_doc.css('head title').text.strip if text.nil? || text.empty?
  @title = text.gsub(/- The Intercept$/, '').strip
end