Class: Html2rss::AutoSource::Scraper::Html

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/html2rss/auto_source/scraper/html.rb

Overview

Scrapes articles from HTML pages by finding similar structures around anchor tags in the parsed_body.

Constant Summary collapse

TAGS_TO_IGNORE =
/(nav|footer|header)/i

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parsed_body, url:) ⇒ Html

Returns a new instance of Html.



33
34
35
36
37
# File 'lib/html2rss/auto_source/scraper/html.rb', line 33

# Builds a scraper for an already parsed document.
#
# @param parsed_body [Object] the parsed document to scrape
#   (presumably a Nokogiri document; confirm against callers)
# @param url [Object] passed on to the article extractor as `url:`
def initialize(parsed_body, url:)
  @parsed_body, @url = parsed_body, url
  # Occurrence counter per simplified XPath; unknown selectors start at 0.
  @selectors = Hash.new(0)
end

Instance Attribute Details

#parsed_body ⇒ Object (readonly)

Returns the value of attribute parsed_body.



39
40
41
# File 'lib/html2rss/auto_source/scraper/html.rb', line 39

# Reader for the @parsed_body instance variable.
#
# @return [Object] the value currently held in @parsed_body
def parsed_body
  @parsed_body
end

Class Method Details

.articles?(parsed_body) ⇒ Boolean

Returns:

  • (Boolean)


16
17
18
# File 'lib/html2rss/auto_source/scraper/html.rb', line 16

# Tells whether at least one article can be scraped from the given document.
#
# A throwaway scraper is built with an empty URL and asked, via `any?`,
# whether it yields anything.
#
# @param parsed_body [Object] the parsed document to probe
# @return [Boolean]
def self.articles?(parsed_body)
  scraper = new(parsed_body, url: '')
  scraper.any?
end

.parent_until_condition(node, condition) ⇒ Object



20
21
22
23
24
25
# File 'lib/html2rss/auto_source/scraper/html.rb', line 20

# Walks from +node+ up through its ancestors and returns the first one for
# which +condition+ is truthy.
#
# The walk stops, returning nil, when it reaches nil, a document node, or a
# node whose parent is named 'html'. That last node is never tested against
# +condition+.
#
# @param node [#document?, #parent, nil] the node to start from
# @param condition [#call] callable that receives a node and returns truthy on a match
# @return [Object, nil] the first matching node, or nil when none matches
def self.parent_until_condition(node, condition)
  current = node

  # `&.` keeps a parentless non-document node (e.g. a detached node) from
  # raising NoMethodError; it is still tested against the condition.
  while current && !current.document? && current.parent&.name != 'html'
    return current if condition.call(current)

    current = current.parent
  end

  nil
end

.simplify_xpath(xpath) ⇒ Object

Simplify an XPath selector by removing the index notation.



29
30
31
# File 'lib/html2rss/auto_source/scraper/html.rb', line 29

# Strips every numeric index predicate (such as "[3]") from an XPath, so that
# sibling elements end up sharing the same selector.
#
# @param xpath [String] the XPath to simplify
# @return [String] a new string without the "[<digits>]" parts
def self.simplify_xpath(xpath)
  index_notation = /\[\d+\]/
  xpath.gsub(index_notation, '')
end

Instance Method Details

#article_condition(node) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/html2rss/auto_source/scraper/html.rb', line 78

# Decides whether +node+ qualifies as the enclosing element of an article.
#
# @param node [Object] the candidate node; it must respond to #path, #classes,
#   #name and #parent
# @return [Boolean]
def article_condition(node)
  # A path that mentions one of the ignored tag names disqualifies the node.
  return false if node.path.match?(TAGS_TO_IGNORE)

  # So does an ignored name in a class of the node itself or of one of the
  # ancestors visited by parent_until_condition.
  ignored_by_class = proc do |candidate|
    candidate.classes.any? { |css_class| css_class.match?(TAGS_TO_IGNORE) }
  end
  return false if self.class.parent_until_condition(node, ignored_by_class)

  # Otherwise accept body/html themselves, or any node whose parent contains
  # more than one anchor.
  %w[body html].include?(node.name) || node.parent.css('a').size > 1
end

#each {|article_hash| ... } ⇒ Enumerator

Returns Enumerator for the scraped articles.

Yield Parameters:

  • article_hash (Hash)

    the scraped article hash

Returns:

  • (Enumerator)

    Enumerator for the scraped articles



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/html2rss/auto_source/scraper/html.rb', line 44

# Yields one article hash per frequent anchor for which an article container
# is found and the extractor returns a truthy result.
#
# For each tag matched by a frequent selector, the article container is
# looked up with parent_until_condition and handed to
# SemanticHtml::Extractor. Tags without a container, or for which the
# extractor returns a falsy value, are skipped.
#
# @yieldparam article_hash [Hash] the scraped article hash
# @return [Enumerator] when called without a block
def each
  return enum_for(:each) unless block_given?

  return if frequent_selectors.empty?

  # The bound method does not change between iterations, so build it once.
  condition = method(:article_condition)

  frequent_selectors.each do |selector|
    parsed_body.xpath(selector).each do |selected_tag|
      article_tag = self.class.parent_until_condition(selected_tag, condition)
      next unless article_tag

      article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
      yield article_hash if article_hash
    end
  end
end

#frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2) ⇒ Set<String>

Finds the simplified XPath selectors of the anchors below root which occur at least min_frequency times.

Parameters:

  • root (Nokogiri::XML::Node) (defaults to: @parsed_body.at_css('body'))

    The root node to search for anchors

Returns:

  • (Set<String>)

    The set of XPath selectors which exist at least min_frequency times



64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/html2rss/auto_source/scraper/html.rb', line 64

# Collects the simplified XPath of every anchor below +root+ and keeps those
# that occur at least +min_frequency+ times.
#
# The result is memoized in @frequent_selectors, so once a truthy result is
# stored, later calls return it regardless of the arguments passed.
#
# @param root [#traverse, nil] the node to search for anchors; nil (for
#   example when the document has no body element) yields an empty set
# @param min_frequency [Integer] minimum number of occurrences of a selector
# @return [Set<String>] the selectors which exist at least min_frequency times
def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
  @frequent_selectors ||= begin
    # Without a root there is nothing to traverse; avoid NoMethodError on nil.
    root&.traverse do |node|
      next unless node.element? && node.name == 'a'

      @selectors[self.class.simplify_xpath(node.path)] += 1
    end

    @selectors.select { |_selector, count| count >= min_frequency }
              .keys
              .to_set
  end
end