Class: Scrubyt::TreeFilter

Inherits:
BaseFilter show all
Defined in:
lib/scrubyt/core/scraping/filters/tree_filter.rb

Constant Summary

Constants inherited from BaseFilter

BaseFilter::EXAMPLE_TYPE_CHILDREN, BaseFilter::EXAMPLE_TYPE_COMPOUND, BaseFilter::EXAMPLE_TYPE_IMAGE, BaseFilter::EXAMPLE_TYPE_REGEXP, BaseFilter::EXAMPLE_TYPE_STRING, BaseFilter::EXAMPLE_TYPE_XPATH

Instance Attribute Summary

Attributes inherited from BaseFilter

#constraints, #example, #example_type, #final_result, #parent_pattern, #regexp, #temp_sink, #xpath

Instance Method Summary collapse

Methods inherited from BaseFilter

create, #method_missing, #throw_method_missing

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class Scrubyt::BaseFilter

Instance Method Details

#evaluate(source) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/scrubyt/core/scraping/filters/tree_filter.rb', line 4

# Evaluates this filter against +source+ and returns the extracted results.
#
# If a final result was already determined for this filter (@final_result),
# it is returned wrapped in an array and +source+ is not consulted.
#
# Otherwise +source+ is searched with @xpath (via its +/+ operator). When the
# XPath ends in an attribute step (".../@attr"), the attribute part is cut off
# before searching and the attribute value of every matched node is returned
# instead of the node itself. Note that this rewrites both @xpath and
# @example on the filter.
#
# When @regexp is set, each result is converted to comparison text and
# matched against it; the first capture group of every matching result is
# collected.
#
# @param source the object to search; it must respond to +/+ with an XPath
# @return [Array] the matched nodes / attribute values, or - when @regexp is
#   set - the first capture group of every result that matched
def evaluate(source)
  return [@final_result] if @final_result

  # Crude hack! Drop it after attribute steps are supported in Hpricot:
  # remember the full expression in @example and search without "/@attr".
  if @xpath =~ /.+\/@.+$/
    @example = @xpath
    @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
  end
  result = source / @xpath

  Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

  # Normalize a single hit to a one-element list.
  xpath_results = Hpricot::Elements === result ? result : [result]

  if @example =~ /.+\/@.+$/
    result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
    xpath_results.map! { |r| r.attributes[result_attribute] }
  end
  return xpath_results if @regexp.nil?

  xpath_results.each_with_object([]) do |entry, regexp_results|
    # Match against the *current* entry (the original code used the whole
    # result set here, so every iteration saw the same text). Entries that
    # are not nodes - e.g. attribute values produced above - are matched as
    # plain strings.
    raw_text = entry.respond_to?(:inner_html) ? entry.inner_html : entry.to_s
    text = SharedUtils.prepare_text_for_comparison(raw_text)
    regexp_results << Regexp.last_match(1) if text =~ @regexp
  end
end

#generate_regexp_for_example ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/scrubyt/core/scraping/filters/tree_filter.rb', line 35

# Derives @regexp from a string example that matched only part of a node's
# text.
#
# Nothing is generated unless the example is a plain string example, a node
# (not a String) was found for it, and the example does not end in a bracketed
# suffix. It is also skipped when the match range equals 0..text.length.
#
# The node's text (tags stripped) is turned into an anchored pattern: every
# changing range is replaced by a placeholder, the remainder is escaped, and
# the placeholders become ".*?" - or the capturing "(.*?)" for the range that
# equals the example's own match range.
def generate_regexp_for_example
  return unless @example_type == EXAMPLE_TYPE_STRING
  return if @temp_sink.nil?
  return if @temp_sink.is_a?(String)
  return if @example =~ /.+\[.+\]$/

  stripped_html = @temp_sink.inner_html.gsub(/<.*?>/, '')
  text = SharedUtils.prepare_text_for_comparison(stripped_html)
  match = @temp_sink.match_data
  match_range = match.begin(0)..match.end(0)
  return if match_range == (0..text.length)

  # Substitute from the back so that earlier offsets stay valid.
  @regexp = text
  @temp_sink.changing_ranges.sort.reverse_each do |range|
    placeholder = range == match_range ? '<<<regexp_selection>>>' : '<<<regexp_changing>>>'
    @regexp[range] = placeholder
  end

  pattern = Regexp.escape(@regexp)
  pattern = pattern.gsub('<<<regexp_changing>>>', '.*?')
  pattern = pattern.gsub('<<<regexp_selection>>>', '(.*?)')
  anchored = '^' + pattern + '$'
  @regexp = /#{anchored}/
end

#generate_relative_XPath(parent_xpath) ⇒ Object



130
131
132
133
134
135
# File 'lib/scrubyt/core/scraping/filters/tree_filter.rb', line 130

# Rewrites @xpath so that it is relative to +parent_xpath+.
#
# A parent XPath ending in an attribute predicate ("[@name=value]") is first
# expanded through XPathUtils.to_full_XPath. The filter's own XPath is only
# rewritten when it starts with "/html"; otherwise it is left alone.
#
# @param parent_xpath [String] XPath of the parent pattern
# @return the new value of @xpath, or nil when @xpath was not rewritten
def generate_relative_XPath(parent_xpath)
  if parent_xpath =~ /(\[@.+=.+\])$/
    document = @parent_pattern.extractor.get_hpricot_doc
    generalize_parent = @parent_pattern.parent.generalize
    parent_xpath = XPathUtils.to_full_XPath(document, parent_xpath, generalize_parent)
  end

  # TODO: should not rely on <html> being the root node
  return unless @xpath =~ /^\/html/

  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath)
end

#generate_XPath_for_example(next_page_example = false) ⇒ Object

For all tree patterns, generate an XPath based on the given example. This method should not be called directly; it is automatically called for every tree pattern directly after the wrapper definition.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/scrubyt/core/scraping/filters/tree_filter.rb', line 60

# Computes the XPath for this filter from its example, depending on
# @example_type, and stores it in @xpath (most branches also record the located
# node in @temp_sink).
#
# next_page_example - forwarded to the string and compound example lookups; in
#                     the string branch a truthy value also forces
#                     write_indices to true.
#
# The return value is whatever the selected branch evaluates to; the method is
# used for its side effects on the filter (and, in the children branch, on the
# parent pattern's and the child patterns' filters).
def generate_XPath_for_example(next_page_example=false)
  #puts "generating example for: #{@parent_pattern.name}"
  #puts @example_type
  case @example_type
  when EXAMPLE_TYPE_XPATH
    # The example already is an XPath - use it verbatim.
    @xpath = @example
  when EXAMPLE_TYPE_STRING
    @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                         @example,
                                                         next_page_example)
    # Nothing found: leave @xpath untouched.
    return if @temp_sink == nil
    # The lookup produced a String instead of a node: keep it as the final
    # result and skip XPath generation.
    if @temp_sink.is_a? String
      @final_result = @temp_sink
      return
    end

    # Attaches a changing_ranges reader to the given element (defined on that
    # single object via instance_eval) and seeds it with `range`.
    # NOTE: `@changing_ranges ||= [] << range` parses as
    # `@changing_ranges ||= ([] << range)`, so the range is only stored when
    # the element has no @changing_ranges yet.
    mark_changing_ranges = lambda { |element, range|
      element.instance_eval do
        @changing_ranges ||= [] << range
        def changing_ranges
          @changing_ranges
        end
      end
    }
    # The range runs from the start offset to the end offset of the match
    # (inclusive range, as written).
    mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
    # Indices are always written for next-page examples, otherwise only when
    # the parent pattern is not generalized.
    write_indices = next_page_example ? true : !@parent_pattern.generalize
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
  when EXAMPLE_TYPE_CHILDREN
    # One pass per example index: derive this pattern's node from the nodes
    # its child patterns found for the same index.
    current_example_index = 0
    loop do
      # Collect the temp sinks of all children that have one at this index.
      all_child_temp_sinks = []
      @parent_pattern.children.each do |child_pattern|
        all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink if child_pattern.filters[current_example_index].temp_sink
      end
      result = all_child_temp_sinks.pop
      if all_child_temp_sinks.empty?
        # Only one child sink was collected: use its parent node.
        result = result.parent
      else
        # Several child sinks: fold them into their lowest common ancestor.
        all_child_temp_sinks.each do |child_sink|
          result = XPathUtils.lowest_common_ancestor(result, child_sink)
        end
      end
      # Third argument is false for generalized patterns, true otherwise
      # (presumably the same indices flag as write_indices above - confirm
      # against XPathUtils.generate_XPath).
      xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
                                           XPathUtils.generate_XPath(result, nil, true)
      # Make sure the parent pattern has a filter at this example index.
      if @parent_pattern.filters.size < current_example_index + 1
        @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
      end
      @parent_pattern.filters[current_example_index].xpath = xpath
      @parent_pattern.filters[current_example_index].temp_sink = result
      # Re-express every child's XPath relative to the node just determined;
      # detail page children are skipped.
      @parent_pattern.children.each do |child_pattern|
      next if child_pattern.type == :detail_page
        child_pattern.filters[current_example_index].xpath =
        child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
        XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
      end
      # Stop once the index has covered all filters of the first child.
      break if @parent_pattern.children[0].filters.size == current_example_index + 1
      current_example_index += 1
    end
  when EXAMPLE_TYPE_IMAGE
    # NOTE(review): unlike the string branch there is no nil check on the
    # lookup result here - confirm that find_image cannot return nil.
    @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
  when EXAMPLE_TYPE_COMPOUND
    @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                      @example,
                                                                      next_page_example)
    # Same generalize-dependent third argument as in the children branch.
    @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
                                          XPathUtils.generate_XPath(@temp_sink, nil, true)
  end
end

#to_sexp ⇒ Object



137
138
139
140
141
142
143
# File 'lib/scrubyt/core/scraping/filters/tree_filter.rb', line 137

# Serializes this filter as a [:str, xpath] s-expression.
#
# When the example ends in an attribute predicate ("...[@attr]"), the name of
# the first attribute predicate found in the example is appended to the XPath
# as an "/@attr" step.
#
# @return [Array] a two element array: :str and the XPath string
def to_sexp
  return [:str, @xpath] unless @example =~ /.+\[@.+\]$/

  attribute_name = @example[/\[@(.+?)\]/, 1]
  [:str, "#{@xpath}/@#{attribute_name}"]
end