Class: RDig::ContentExtractors::HpricotContentExtractor

Inherits:
ContentExtractor show all
Defined in:
lib/rdig/content_extractors/hpricot.rb

Overview

Extracts the title, content and links from HTML documents using the Hpricot library.

Instance Method Summary collapse

Methods inherited from ContentExtractor

#can_do, extractor_instances, extractors, inherited, process

Constructor Details

#initialize(config) ⇒ HpricotContentExtractor

Returns a new instance of HpricotContentExtractor.



16
17
18
19
20
# File 'lib/rdig/content_extractors/hpricot.rb', line 16

# Builds the extractor from the global configuration object.
#
# The hpricot section of +config+ is handed to the superclass constructor.
#
# @param config configuration object responding to +hpricot+
def initialize(config)
  super(config.hpricot)
  # Only install the MIME type pattern (HTML / XML / XHTML content types) when
  # an hpricot section is configured. Without it @pattern stays unset, which
  # is meant to make this extractor refuse all content (the check that uses
  # @pattern lives outside this method - presumably in the superclass).
  if config.hpricot
    @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
  end
end

Instance Method Details

#content_element(doc) ⇒ Object

Retrieve the root element to extract document content from



79
80
81
# File 'lib/rdig/content_extractors/hpricot.rb', line 79

# Determines the root element whose content should be extracted.
#
# The element selected through the :content_tag_selector configuration entry
# wins; if that yields nothing (nil/false) the document's body is used.
#
# @param doc parsed document (must respond to +at+)
# @return the configured element, or the result of doc.at('body')
def content_element(doc)
  configured = tag_from_config(doc, :content_tag_selector)
  return configured if configured

  doc.at('body')
end

#extract_content(doc) ⇒ Object

Extracts textual content from the HTML tree.

  • First, the root element to use is determined using the content_element method, which itself uses the content_tag_selector from RDig.configuration.

  • Then, the inner HTML of this element is stripped of comments and tags (see strip_comments and strip_tags), which leaves all textual content contained in the root element and all its children.



44
45
46
47
48
49
50
# File 'lib/rdig/content_extractors/hpricot.rb', line 44

# Extracts the textual content of the document.
#
# The root element is looked up with content_element; its inner HTML is then
# run through strip_comments and strip_tags.
#
# @param doc parsed document
# @return [String] the extracted text, or '' when no root element was found
def extract_content(doc)
  root = content_element(doc)
  return '' unless root

  # Alternative kept for reference:
  # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
  strip_tags(strip_comments(root.inner_html))
end

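#extract_links(doc) ⇒ Object

Extracts the link targets of the document: the href attributes of all a and area tags and the src attributes of all frame tags, except internal links like <a href="#top">.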



54
55
56
57
58
59
60
61
# File 'lib/rdig/content_extractors/hpricot.rb', line 54

# Collects the link targets found in the document.
#
# Looks at the href attribute of <a> and <area> tags and at the src attribute
# of <frame> tags. Values starting with '#' (links into the same page, such as
# <a href="#top">) and tags without the attribute are skipped. HTML entities
# in the remaining values are unescaped (e.g. '&amp;' becomes '&').
#
# @param doc parsed document; +doc / tag_name+ must yield a collection of
#   elements that support +element[attribute_name]+
# @return [Array<String>] the unescaped link targets
def extract_links(doc)
  { 'a' => 'href', 'area' => 'href', 'frame' => 'src' }.map do |tag_name, attr|
    # Distinct names for the tag name and the element: the original reused
    # +tag+ for both, shadowing the outer block parameter.
    (doc / tag_name).map do |element|
      value = element[attr]
      # \A instead of ^: only skip values that really begin with '#'. With ^ a
      # multi-line value containing a later line starting with '#' was dropped.
      CGI.unescapeHTML(value) if value && value !~ /\A#/
    end
  end.flatten.compact
end

#extract_title(doc) ⇒ Object

Extracts the title from the given html tree



64
65
66
67
68
# File 'lib/rdig/content_extractors/hpricot.rb', line 64

# Extracts the title from the given HTML tree.
#
# title_tag may return an element or a plain value (e.g. a String). Elements
# (anything responding to +inner_html+) have their markup stripped with
# strip_tags; every other value, including nil, is returned unchanged.
#
# @param doc parsed document
# @return the stripped title text, or whatever non-element value title_tag returned
def extract_title(doc)
  candidate = title_tag(doc)
  if candidate.respond_to?(:inner_html)
    strip_tags(candidate.inner_html)
  else
    candidate
  end
end

#process(content) ⇒ Object

Returns a hash of the form:

{ :content => 'extracted clear text',
  :title => 'Title',
  :links => [array of urls] }


26
27
28
29
30
31
32
33
34
# File 'lib/rdig/content_extractors/hpricot.rb', line 26

# Parses +content+ with Hpricot and extracts title, links and text from it.
#
# Title and content are passed through HTMLEntities#decode; the title is
# additionally stripped of surrounding whitespace.
#
# @param content the raw document source handed to Hpricot()
# @return [Hash] with the keys :title, :links and :content
def process(content)
  decoder = HTMLEntities.new
  doc = Hpricot(content)

  # Same evaluation order as the hash keys: title, links, content.
  title = decoder.decode(extract_title(doc)).strip
  links = extract_links(doc)
  text = decoder.decode(extract_content(doc))

  { :title => title, :links => links, :content => text }
end

#strip_comments(string) ⇒ Object

Return the given string minus all html comments



89
90
91
# File 'lib/rdig/content_extractors/hpricot.rb', line 89

# Returns a copy of the given string with all HTML comments removed.
#
# Comments may span several lines; the match is non-greedy, so text between
# two separate comments is kept.
#
# @param string [String] HTML markup
# @return [String] new string without <!-- ... --> sections
def strip_comments(string)
  # Regexp literal instead of Regexp.new(src, opts, 'u'): the third (encoding)
  # argument of Regexp.new is deprecated and no longer accepted by recent
  # Ruby versions. /m lets '.' match newlines.
  string.gsub(/<!--.*?-->/m, '')
end

#strip_tags(string) ⇒ Object



93
94
95
96
97
98
99
100
# File 'lib/rdig/content_extractors/hpricot.rb', line 93

# Reduces an HTML fragment to plain text.
#
# Removes <script> and <style> elements together with their bodies, then all
# remaining tags, and finally collapses every run of whitespace into a single
# space.
#
# The argument itself is left untouched (a new String is returned), so frozen
# strings and strings still used by the caller can be passed in safely.
#
# @param string [String] HTML markup
# @return [String] the text, without leading or trailing whitespace
def strip_tags(string)
  # Regexp literals instead of Regexp.new(src, opts, 'u'): the third (encoding)
  # argument of Regexp.new is deprecated and no longer accepted by recent
  # Ruby versions.
  # /i because HTML tag names are case-insensitive (<SCRIPT>, <Style>, ...);
  # /m so that '.' also matches the newlines inside script/style bodies.
  text = string.gsub(/<(script|style).*?>.*?<\/(script|style).*?>/mi, '')
  text = text.gsub(/<.+?>/m, '')
  text.gsub(/\s+/, ' ').strip
end

#tag_from_config(doc, config_key) ⇒ Object



83
84
85
86
# File 'lib/rdig/content_extractors/hpricot.rb', line 83

# Looks up an element using a selector taken from the extractor configuration.
#
# The configuration value read via +config_key+ may be
# * nil/false    - nothing is configured, nil is returned
# * a String     - used as a selector: the result of +doc / selector+ is returned
# * anything else - treated as a callable and invoked with the document
#
# NOTE(review): for a String selector the search result is returned as-is; if
# that is an empty (array-like) collection it is still truthy, so the `||`
# fallbacks in callers would not kick in - confirm this is intended.
#
# @param doc parsed document
# @param config_key [Symbol] name of the configuration entry to read
# @return the selected element(s), the callable's result, or nil
def tag_from_config(doc, config_key)
  selector = @config.send(config_key)
  return nil unless selector

  if selector.is_a?(String)
    doc / selector
  else
    selector.call(doc)
  end
end

#title_tag(doc) ⇒ Object

Returns the element to extract the title from.

This may return a string, e.g. an attribute value selected from a meta tag, too.



74
75
76
# File 'lib/rdig/content_extractors/hpricot.rb', line 74

# Returns the element (or value) the title is extracted from.
#
# Whatever the :title_tag_selector configuration entry yields is preferred;
# if that is nil/false the document's <title> element is used. The configured
# selector may also produce a plain String, e.g. a meta tag attribute value.
#
# @param doc parsed document (must respond to +at+)
# @return the configured result, or the result of doc.at('title')
def title_tag(doc)
  configured = tag_from_config(doc, :title_tag_selector)
  return configured if configured

  doc.at('title')
end