Class: Scrubyt::DetailPageFilter

Inherits:
BaseFilter show all
Defined in:
lib/scrubyt/core/scraping/filters/detail_page_filter.rb

Constant Summary

Constants inherited from BaseFilter

BaseFilter::EXAMPLE_TYPE_CHILDREN, BaseFilter::EXAMPLE_TYPE_COMPOUND, BaseFilter::EXAMPLE_TYPE_IMAGE, BaseFilter::EXAMPLE_TYPE_REGEXP, BaseFilter::EXAMPLE_TYPE_STRING, BaseFilter::EXAMPLE_TYPE_XPATH

Instance Attribute Summary

Attributes inherited from BaseFilter

#constraints, #example, #example_type, #final_result, #parent_pattern, #regexp, #temp_sink, #xpath

Instance Method Summary collapse

Methods inherited from BaseFilter

create, #method_missing, #throw_method_missing

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class Scrubyt::BaseFilter

Instance Method Details

#evaluate(source) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/scrubyt/core/scraping/filters/detail_page_filter.rb', line 4

# Fetches the detail page referenced by +source+ and runs the detail
# extractor against it.
#
# The parent extractor's page and host name are saved before the fetch and
# are always put back afterwards, including when the detail extractor raises,
# so a failing detail page cannot leave the parent extractor in the detail
# page's state.
#
# @param source [String, Object] the URL itself when a String; otherwise an
#   object handed to XPathUtils.find_nearest_node_with_attribute, whose
#   result's 'href' attribute is used as the URL
# @return [Object] the detail extractor's +result+ on the first call, or the
#   value of +evaluate_extractor+ once a detail extractor already exists
def evaluate(source)
  url = if source.is_a?(String)
          source
        else
          XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
        end

  @parent_pattern.extractor.store_page
  original_host_name = @parent_pattern.extractor.get_host_name
  @parent_pattern.extractor.restore_host_name

  begin
    # A failed fetch is deliberately best-effort: it is logged and the
    # extraction still proceeds.
    begin
      FetchAction.fetch url, :resolve => @parent_pattern.resolve
    rescue
      Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
    end

    # The detail extractor is built lazily on first use and re-evaluated on
    # every later call.
    if @detail_extractor.nil?
      @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
      @detail_extractor.result
    else
      @detail_extractor.evaluate_extractor
    end
  ensure
    # Runs on both the normal and the exceptional path.
    @parent_pattern.extractor.restore_page
    @parent_pattern.extractor.store_host_name original_host_name
  end
end

#get_detail_sexp ⇒ Object



36
37
38
# File 'lib/scrubyt/core/scraping/filters/detail_page_filter.rb', line 36

# Builds an s-expression for the detail extractor: a :block node followed by
# the elements of the root patterns' +to_sexp_array+.
#
# @return [Array] an array whose first element is :block
def get_detail_sexp
  detail_root_patterns = @detail_extractor.result.root_patterns
  child_sexps = detail_root_patterns.to_sexp_array
  [:block, *child_sexps]
end