Class: WebPageParser::BbcNewsPageParserV3

Inherits:
BbcNewsPageParserV2 show all
Defined in:
lib/web-page-parser/parsers/bbc_news_page_parser.rb

Direct Known Subclasses

BbcNewsPageParserV4

Constant Summary collapse

# Captures the article body: everything between the opening
# <div id="story-body"> tag and the next <div class="bookmark-list">.
# NOTE(review): the 'm' option presumably lets `.` span newlines — confirm against the ORegexp docs.
CONTENT_RE =
ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
# Matches a <div class="story-feature"> block up to the first closing </div>.
# content_processor removes every match from the content.
STRIP_FEATURES_RE =
ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
# Matches the span from the "S MD_WIDGET" HTML comment marker to the
# "E MD_WIDGET" marker. content_processor removes every match from the content.
# NOTE(review): unlike the two patterns above, no 'm' option is passed — confirm
# that a widget spanning several lines is still matched.
STRIP_MARKET_DATA_WIDGET_RE =
ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')

Constants inherited from BbcNewsPageParserV2

WebPageParser::BbcNewsPageParserV2::DATE_RE, WebPageParser::BbcNewsPageParserV2::PARA_RE, WebPageParser::BbcNewsPageParserV2::STRIP_BLOCKS_RE, WebPageParser::BbcNewsPageParserV2::STRIP_CAPTIONS_RE, WebPageParser::BbcNewsPageParserV2::STRIP_COMMENTS_RE, WebPageParser::BbcNewsPageParserV2::STRIP_TAGS_RE, WebPageParser::BbcNewsPageParserV2::TITLE_RE, WebPageParser::BbcNewsPageParserV2::WHITESPACE_RE

Constants inherited from BaseRegexpParser

WebPageParser::BaseRegexpParser::DATE_RE, WebPageParser::BaseRegexpParser::HTML_ENTITIES_DECODER, WebPageParser::BaseRegexpParser::KILL_CHARS_RE, WebPageParser::BaseRegexpParser::TITLE_RE

Instance Attribute Summary

Attributes inherited from BaseParser

#url

Instance Method Summary collapse

Methods inherited from BaseRegexpParser

#content, #date, #decode_entities, #encode, #initialize, #page, #retrieve_page, #title

Methods inherited from BaseParser

#content, #date, #guid, #guid_from_url, #hash, #initialize, #page, #retrieve_page, #title

Constructor Details

This class inherits a constructor from WebPageParser::BaseRegexpParser

Instance Method Details

#content_processor ⇒ Object

BBC News is now in UTF-8



101
102
103
104
105
# File 'lib/web-page-parser/parsers/bbc_news_page_parser.rb', line 101

# Removes story-feature blocks and market-data widget markup from
# @content, then hands over to the inherited content_processor.
def content_processor
  # Order matters only in that it mirrors the original: features first,
  # then the market-data widget.
  [STRIP_FEATURES_RE, STRIP_MARKET_DATA_WIDGET_RE].each do |pattern|
    @content = pattern.gsub(@content, '')
  end
  super
end