Class: Sc::VisualSelector

Inherits:
Selector show all
Defined in:
lib/scrappy/extractor/selectors/visual.rb

Instance Method Summary collapse

Methods inherited from Selector

#select

Methods included from Scrappy::Formats

#format

Constructor Details

#initialize(args = {}) ⇒ VisualSelector

Returns a new instance of VisualSelector.



4
5
6
7
# File 'lib/scrappy/extractor/selectors/visual.rb', line 4

def initialize args={}
  super
  @cache = {}
end

Instance Method Details

#filter(doc) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/scrappy/extractor/selectors/visual.rb', line 9

def filter doc
  @cache[doc] ||= begin
    # By initializing variables, we avoid getting data from a hash (slow)
    min_relative_x  = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
    max_relative_x  = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
    min_relative_y  = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
    max_relative_y  = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
    min_x           = (sc::min_x.first.to_i if sc::min_x.first)
    max_x           = (sc::max_x.first.to_i if sc::max_x.first)
    min_y           = (sc::min_y.first.to_i if sc::min_y.first)
    max_y           = (sc::max_y.first.to_i if sc::max_y.first)
    min_width       = (sc::min_width.first.to_i if sc::min_width.first)
    max_width       = (sc::max_width.first.to_i if sc::max_width.first)
    min_height      = (sc::min_height.first.to_i if sc::min_height.first)
    max_height      = (sc::max_height.first.to_i if sc::max_height.first)
    min_font_size   = (sc::min_font_size.first.to_i if sc::min_font_size.first)
    max_font_size   = (sc::max_font_size.first.to_i if sc::max_font_size.first)
    min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
    max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
    font_family     =  sc::font_family.first
    attributes      =  sc::attribute
    formats         =  sc::format
    tag             =  sc::tag
    
    elements  = doc[:content].search((tag - ["text"]).first || "*")
    elements += Nokogiri::XML::NodeSet.new(doc[:content].document, [doc[:content]]) if tag.include?(doc[:content].name)
    
    elements.select do |node|
      relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
      relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
      
      !node.text? and
      ( (node['vfont']   and node.name!="a" and node.name!="img") or !tag.include?("text") ) and
      ( !min_relative_x  or relative_x           >= min_relative_x) and
      ( !max_relative_x  or relative_x           <= max_relative_x) and
      ( !min_relative_y  or relative_y           >= min_relative_y) and
      ( !max_relative_y  or relative_y           <= max_relative_y) and
      
      ( !min_x           or node['vx'].to_i      >= min_x) and
      ( !max_x           or node['vx'].to_i      <= max_x) and
      ( !min_y           or node['vy'].to_i      >= min_y) and
      ( !max_y           or node['vy'].to_i      <= max_y) and
      
      ( !min_width       or node['vw'].to_i      >= min_width) and
      ( !max_width       or node['vw'].to_i      <= max_width) and
      ( !min_height      or node['vh'].to_i      >= min_height) and
      ( !max_height      or node['vh'].to_i      <= max_height) and
      
      ( !min_font_size   or node['vsize'].to_i   >= min_font_size) and
      ( !max_font_size   or node['vsize'].to_i   <= max_font_size) and
      ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
      ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
      ( !font_family     or node['vfont']        == font_family)
    end.map do |content|
      if attributes.first
        # Select node's attribute if given
        attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute].clean, :attribute=>attribute } }
      else
        [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
      end
    end.flatten
  end
end