Class: WebPageParser::GuardianPageParserV3

Inherits:
GuardianPageParserV2 show all
Defined in:
lib/web-page-parser/parsers/guardian_page_parser.rb

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from GuardianPageParserV2

#css_first_text, #date, #filter_url, #html_doc, #title

Methods inherited from BaseParser

#date, #guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page, #title

Constructor Details

This class inherits a constructor from WebPageParser::BaseParser

Instance Method Details

#content ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/web-page-parser/parsers/guardian_page_parser.rb', line 100

# Extracts the article body as a flat list of text fragments.
#
# Selects every <p>, <h2>, <h3> and <ul> element found beneath
# div#article-body-blocks or div[itemprop=articleBody]. A <ul> contributes
# one fragment per <li>; every other element contributes its own text.
# Fragments are whitespace-stripped and blank ones are dropped.
#
# The result is memoized in @content, so the document is only walked once.
#
# @return [Array<String>] stripped, non-empty text fragments
def content
  return @content if @content

  wanted = %w[p h2 h3 ul]
  story_body = html_doc.css('div#article-body-blocks *, div[itemprop=articleBody] *').select do |e|
    wanted.include?(e.name)
  end

  fragments = story_body.map do |element|
    if element.name == 'ul'
      element.css('li').map { |li| li.text.strip }
    else
      element.text.strip
    end
  end

  # Strip before testing for emptiness so whitespace-only elements are
  # discarded rather than surfacing as "" entries, and store the result so
  # the guard at the top of the method actually short-circuits later calls.
  @content = fragments.flatten.reject(&:empty?)
end