Class: RDig::ContentExtractors::HpricotContentExtractor
- Inherits:
-
ContentExtractor
- Object
- ContentExtractor
- RDig::ContentExtractors::HpricotContentExtractor
- Defined in:
- lib/rdig/content_extractors/hpricot.rb
Overview
extracts title, content and links from html documents using the hpricot library
Instance Method Summary collapse
-
#content_element(doc) ⇒ Object
Retrieve the root element to extract document content from.
-
#extract_content(doc) ⇒ Object
Extracts textual content from the HTML tree.
-
#extract_links(doc) ⇒ Object
extracts the href attributes of all a and area tags and the src attributes of all frame tags, except internal links like <a href="#top">.
-
#extract_title(doc) ⇒ Object
Extracts the title from the given html tree.
-
#initialize(config) ⇒ HpricotContentExtractor
constructor
A new instance of HpricotContentExtractor.
-
#process(content) ⇒ Object
returns: { :content => ‘extracted clear text’, :title => ‘Title’, :links => [array of urls] }.
-
#strip_comments(string) ⇒ Object
Return the given string minus all html comments.
- #strip_tags(string) ⇒ Object
- #tag_from_config(doc, config_key) ⇒ Object
-
#title_tag(doc) ⇒ Object
Returns the element to extract the title from.
Methods inherited from ContentExtractor
#can_do, extractor_instances, extractors, inherited, process
Constructor Details
#initialize(config) ⇒ HpricotContentExtractor
Returns a new instance of HpricotContentExtractor.
16 17 18 19 20 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 16
# Builds the extractor from the hpricot section of the given configuration.
#
# config - object responding to #hpricot; that value is handed to the
#          parent class constructor.
#
# @pattern is only assigned when config.hpricot is truthy.
# NOTE(review): presumably the parent class matches @pattern against
# content types to decide whether this extractor applies - confirm.
def initialize(config)
  super(config.hpricot)
  # if not configured, refuse to handle any content:
  return unless config.hpricot

  @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
end
Instance Method Details
#content_element(doc) ⇒ Object
Retrieve the root element to extract document content from
79 80 81 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 79
# Retrieves the root element to extract document content from.
#
# Uses whatever the :content_tag_selector configuration entry yields for
# this document; when that yields nil/false, the document's body element
# (doc.at('body')) is used instead.
def content_element(doc)
  configured_root = tag_from_config(doc, :content_tag_selector)
  configured_root || doc.at('body')
end
#extract_content(doc) ⇒ Object
Extracts textual content from the HTML tree.
-
First, the root element to use is determined using the
content_element method, which itself uses the content_tag_selector from RDig.configuration.
-
Then, this element is processed by
extract_text, which will give
all textual content contained in the root element and all its children.
44 45 46 47 48 49 50 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 44
# Extracts the content of the HTML tree.
#
# The root element is looked up with content_element; its inner HTML is
# returned with all HTML comments removed. When no root element is found
# an empty string is returned.
#
# Earlier (disabled) implementation, kept for reference:
# return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
def extract_content(doc)
  root = content_element(doc)
  root ? strip_comments(root.inner_html) : ''
end
#extract_links(doc) ⇒ Object
extracts the href attributes of all a and area tags and the src attributes of all frame tags, except internal links like <a href="#top">
54 55 56 57 58 59 60 61 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 54
# Collects the link targets found in the document.
#
# Looks at the href attribute of <a> and <area> tags and the src attribute
# of <frame> tags. Elements without that attribute and page-internal links
# (values starting with '#', e.g. <a href="#top">) are skipped.
#
# doc - parsed document; `doc / tag_name` must return the elements with
#       that tag name, each supporting `element[attribute_name]`.
#
# Returns an Array of Strings with HTML entities unescaped.
def extract_links(doc)
  { 'a' => 'href', 'area' => 'href', 'frame' => 'src' }.flat_map do |tag_name, attr|
    (doc / tag_name).map do |element|
      value = element[attr]
      # \A anchors at the very start of the value; ^ would also match a '#'
      # that merely follows a newline inside the attribute value.
      CGI.unescapeHTML(value) if value && value !~ /\A#/
    end
  end.compact
end
#extract_title(doc) ⇒ Object
Extracts the title from the given html tree
64 65 66 67 68 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 64
# Extracts the title from the given html tree.
#
# title_tag may hand back either an element or a plain value (e.g. a
# string): anything responding to #inner_html is reduced to its inner
# HTML, everything else is returned untouched.
def extract_title(doc)
  title_source = title_tag(doc)
  title_source.respond_to?(:inner_html) ? title_source.inner_html : title_source
end
#process(content) ⇒ Object
returns: { :content => ‘extracted clear text’,
:title => 'Title',
:links => [array of urls] }
26 27 28 29 30 31 32 33 34 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 26
# Parses the given markup with Hpricot and extracts title, links and content.
#
# Title and content are run through HTMLEntities#decode; the title is
# additionally stripped of surrounding whitespace.
#
# Returns a Hash of the form
#   { :title => 'Title', :links => [array of urls], :content => 'extracted content' }
def process(content)
  decoder = HTMLEntities.new
  tree = Hpricot(content)

  title = decoder.decode(extract_title(tree)).strip
  links = extract_links(tree)
  text = decoder.decode(extract_content(tree))

  { :title => title, :links => links, :content => text }
end
#strip_comments(string) ⇒ Object
Return the given string minus all html comments
89 90 91 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 89
# Returns a copy of the given string minus all html comments.
#
# The /m flag lets '.' match newlines, so comments spanning several lines
# are removed as well. The pattern is a regexp literal because the
# three-argument form of Regexp.new (the legacy 'u' kcode argument) is
# deprecated and raises ArgumentError on Ruby 3.3 and newer.
def strip_comments(string)
  string.gsub(/<!--.*?-->/m, '')
end
#strip_tags(string) ⇒ Object
93 94 95 96 97 98 99 100 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 93
# Returns the plain text of an HTML fragment.
#
# Steps, in order:
# 1. drop <script>/<style> elements together with their bodies,
# 2. drop every remaining tag,
# 3. collapse each run of whitespace into a single space,
# 4. trim leading and trailing whitespace.
#
# The argument is left unmodified (frozen strings are fine); the cleaned
# text is returned as a new String. Regexp literals are used because the
# three-argument form of Regexp.new raises ArgumentError on Ruby 3.3+.
def strip_tags(string)
  without_code = string.gsub(/<(script|style).*?>.*?<\/(script|style).*?>/m, '')
  without_code.gsub(/<.+?>/m, '').gsub(/\s+/, ' ').strip
end
#tag_from_config(doc, config_key) ⇒ Object
83 84 85 86 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 83
# Resolves a configured selector against the document.
#
# doc        - parsed document; must support `doc / selector_string`.
# config_key - name of the entry to read from @config (sent as a message).
#
# Returns nil when the entry is nil/false. A String entry is used as a
# selector (`doc / entry`); any other entry is invoked via #call with the
# document and its result is returned as-is.
#
# NOTE(review): if `doc / entry` yields an empty collection, that value is
# still truthy, so a caller's `|| fallback` will not kick in - confirm this
# is intended.
def tag_from_config(doc, config_key)
  selector = @config.send(config_key)
  return nil unless selector

  if selector.is_a?(String)
    doc / selector
  else
    selector.call(doc)
  end
end
#title_tag(doc) ⇒ Object
Returns the element to extract the title from.
This may return a string, e.g. an attribute value selected from a meta tag, too.
74 75 76 |
# File 'lib/rdig/content_extractors/hpricot.rb', line 74
# Returns the element to extract the title from.
#
# The :title_tag_selector configuration entry is consulted first; when it
# yields nil/false, the document's <title> element (doc.at('title')) is used.
# This may return a string, e.g. an attribute value selected from a meta
# tag, too.
def title_tag(doc)
  configured_title = tag_from_config(doc, :title_tag_selector)
  configured_title || doc.at('title')
end