Class: Scrubyt::Extractor

Inherits:
Object
  • Object
show all
Includes:
FetchAction
Defined in:
lib/scrubyt/core/shared/extractor.rb

Overview

Driving the whole extraction process

Extractor is a performer class: it takes an extractor definition, carries out the actions, and evaluates the wrappers sequentially.

Originally the navigation actions also lived here, but since the class grew too big, they were factored out into their own class, NavigationAction.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from FetchAction

get_current_doc_url, #get_host_name, get_hpricot_doc, get_mechanize_doc, #restore_host_name, #restore_page, #store_host_name, #store_page

Constructor Details

#initialize(mode, extractor_definition) ⇒ Extractor

Returns a new instance of Extractor.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/scrubyt/core/shared/extractor.rb', line 40

# Builds an extractor from a definition block and runs it immediately.
#
# mode::                 stored in @mode; logged as 'Production' when it equals
#                        :production and as 'Learning' for anything else.
# extractor_definition:: a Proc that is instance_eval'ed against a throwaway
#                        context object. Inside it, +next_page+ sets the
#                        next-page pattern and every other unknown method name
#                        becomes a root pattern.
#
# After evaluation the collected root patterns are run through
# evaluate_extractor and the outcome is stored in @result.
#
# If the definition declares no root pattern, an error is logged and the
# process is terminated via Kernel#exit (SystemExit) with a failure status.
def initialize(mode, extractor_definition)
  @mode = mode
  @root_patterns = []
  @next_page_pattern = nil
  #      @hpricot_doc = nil
  #      @hpricot_doc_url = nil
  @evaluating_extractor_definition = false
  @next_page_list = []
  @processed_pages = []

  # Work out which file defined the extractor from the second backtrace entry.
  # NOTE(review): assumes entries look like "path:lineno:..." as produced by
  # Kernel#caller - confirm against SharedUtils.get_backtrace.
  backtrace = SharedUtils.get_backtrace
  caller_entry = backtrace[1]
  # Cut at the ":<lineno>" marker instead of the first colon so that paths
  # which themselves contain a colon (e.g. "C:/work/x.rb:12") stay intact.
  # Entries without such a marker fall back to the old first-colon split.
  source_file = caller_entry[/\A(.+?):\d+/, 1] || caller_entry.split(':')[0]

  Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'

  @evaluating_extractor_definition = true
  context = Object.new
  context.extend NavigationActions
  # These defs become singleton methods of the context object, i.e. the
  # vocabulary available inside the extractor definition block.
  context.instance_eval do
    def extractor=(value)
      @extractor = value
    end

    def next_page(*args)
      @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
    end

    # Any other call inside the definition declares a root pattern named after
    # the called method.
    def method_missing(method_name, *args, &block)
      root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
      @extractor.root_patterns << root_pattern
      root_pattern
    end
  end
  context.extractor = self
  context.instance_eval(&extractor_definition)
  @evaluating_extractor_definition = false

  if @root_patterns.empty?
    # TODO: this should be an exception
    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
    # This is an error path: exit with a failure status rather than the
    # default success status of a bare `exit`.
    exit 1
  end

  #Once all is set up, evaluate the extractor from the root pattern!
  root_results = evaluate_extractor

  @result = ScrubytResult.new('root')
  @result.push(*root_results)
  @result.root_patterns = @root_patterns
  @result.source_file = source_file
  @result.source_proc = extractor_definition

  # The outcome is exposed through @result; nothing is returned from here.
  Scrubyt.log :INFO, 'Extraction finished succesfully!'
end

Instance Attribute Details

#evaluating_extractor_definitionObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for @evaluating_extractor_definition.
# The constructor sets this flag to true right before it instance_evals the
# extractor definition block and back to false right after.
def evaluating_extractor_definition
  @evaluating_extractor_definition
end

#modeObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for @mode, the first argument given to the constructor.
# The constructor logs 'Production' when it equals :production and
# 'Learning' otherwise.
def mode
  @mode
end

#next_page_patternObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for @next_page_pattern.
# The constructor initializes it to nil; a +next_page+ call inside the
# extractor definition assigns a Scrubyt::Pattern named 'next_page' to it.
def next_page_pattern
  @next_page_pattern
end

#resultObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for @result, the ScrubytResult the constructor builds once
# evaluate_extractor has finished.
def result
  @result
end

#root_patternsObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for @root_patterns, the array that collects one pattern per
# unknown method called inside the extractor definition.
def root_patterns
  @root_patterns
end

Class Method Details

.define(mode = nil, &extractor_definition) ⇒ Object

The definition of the extractor is passed through this method



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/scrubyt/core/shared/extractor.rb', line 16

# Entry point for running an extractor definition.
#
# Chooses the navigation backend to mix into FetchAction, builds an extractor
# from the given block and hands back that extractor's result.
#
# mode:: when it is a Hash whose :agent entry is :firefox, the Firewatir
#        backend is used; every other value (including the default nil)
#        selects Mechanize. The value is forwarded unchanged to the
#        constructor.
def self.define(mode=nil, &extractor_definition)
  firefox_requested = mode.is_a?(Hash) && mode[:agent] == :firefox
  # Only the chosen backend constant is resolved, exactly one include happens.
  backend = firefox_requested ? Navigation::Firewatir : Navigation::Mechanize
  FetchAction.class_eval { include backend }

  new(mode, extractor_definition).result
end

.load(filename) ⇒ Object



36
37
38
# File 'lib/scrubyt/core/shared/extractor.rb', line 36

# Reads +filename+, evaluates its contents as Ruby and passes the resulting
# object to define as the extractor definition block (so the file has to
# evaluate to something usable with the & operator, e.g. a lambda).
#
# SECURITY: the file content is run through eval, so only load files you
# trust. Not replaced here because evaluating the file is the purpose of
# this method.
def self.load(filename)
  # File.read instead of IO.read: IO.read interprets a name that starts with
  # "|" as a command to execute, File.read always treats it as a path.
  define(&eval(File.read(filename)))
end

Instance Method Details

#add_to_next_page_list(result_node) ⇒ Object



117
118
119
120
121
122
123
124
125
126
127
# File 'lib/scrubyt/core/shared/extractor.rb', line 117

# Queues the link carried by +result_node+ on @next_page_list.
#
# result_node.result may be
# * an Hpricot::Elem - the href of the nearest node having an 'href'
#   attribute is used, with '&amp;' turned back into '&';
# * a String - used as the URL as it is.
#
# Nothing is queued when no href can be determined, which includes results
# of any other type. Without that guard a nil would end up in the queue.
def add_to_next_page_list(result_node)
  result = result_node.result
  href =
    case result
    when Hpricot::Elem
      node = XPathUtils.find_nearest_node_with_attribute(result, 'href')
      return if node.nil? || node.attributes['href'].nil?
      node.attributes['href'].gsub('&amp;', '&')
    when String
      result
    end
  return if href.nil?

  url = href #TODO need absolute address here 1/4
  @next_page_list << url
end

#evaluate_extractorObject



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/scrubyt/core/shared/extractor.rb', line 129

def evaluate_extractor
  root_results = []
  current_page_count = 1
  catch :quit_next_page_loop do
    loop do
      url = get_current_doc_url #TODO need absolute address here 2/4
      @processed_pages << url
      @root_patterns.each do |root_pattern|
        root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
      end
      
      while @processed_pages.include? url #TODO need absolute address here 3/4
        if !@next_page_pattern.nil?
          throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
          throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
          xpath = @next_page_pattern.filters[0].xpath
          node = (get_hpricot_doc/xpath).map.last
          node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
          throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
          href = node.attributes['href'].gsub('&amp;') {'&'}
          throw :quit_next_page_loop if href == nil
          url = href #TODO need absolute address here 4/4
        else
          throw :quit_next_page_loop if @next_page_list.empty?
          url = @next_page_list.pop
        end
      end

      restore_host_name
      FetchAction.fetch(url)
      
      current_page_count += 1
    end
  end
  root_results
end

#get_current_doc_urlObject



101
102
103
# File 'lib/scrubyt/core/shared/extractor.rb', line 101

# Instance-level shortcut: returns whatever FetchAction.get_current_doc_url
# returns.
def get_current_doc_url
  FetchAction.get_current_doc_url
end

#get_detail_pattern_relationsObject



105
106
107
# File 'lib/scrubyt/core/shared/extractor.rb', line 105

# Returns @detail_pattern_relations.
# NOTE(review): nothing in the visible part of this class assigns that
# variable, so this may return nil unless it is set elsewhere - confirm.
def get_detail_pattern_relations
  @detail_pattern_relations
end

#get_hpricot_docObject



97
98
99
# File 'lib/scrubyt/core/shared/extractor.rb', line 97

# Instance-level shortcut: returns whatever FetchAction.get_hpricot_doc
# returns.
def get_hpricot_doc
  FetchAction.get_hpricot_doc
end

#get_modeObject



109
110
111
# File 'lib/scrubyt/core/shared/extractor.rb', line 109

# Returns @mode, the value the constructor received as its +mode+ argument.
def get_mode
  @mode
end

#get_original_host_nameObject



113
114
115
# File 'lib/scrubyt/core/shared/extractor.rb', line 113

# Returns @original_host_name.
# NOTE(review): nothing in the visible part of this class assigns that
# variable; presumably it is set by the host-name helpers mixed in from
# FetchAction - confirm.
def get_original_host_name
  @original_host_name
end