Class: Sc::Selector

Inherits:
Object
  • Object
show all
Includes:
RDF::NodeProxy, Scrappy::Formats
Defined in:
lib/scrappy/extractor/selector.rb

Instance Method Summary collapse

Methods included from Scrappy::Formats

#format

Instance Method Details

#select(doc) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/scrappy/extractor/selector.rb', line 7

# Runs this selector against a document fragment and returns the resulting
# fragments after applying the post-processing steps enabled on the selector.
#
# Processing order:
# 1. +filter+ (defined in each subclass) produces the raw results.
# 2. If sc:boolean is "true", each value is coerced to "true" or "false";
#    results whose value is neither an affirmation nor a negation are dropped.
# 3. If sc:normalize_max is set, each value is rescaled to
#    (value - min) / (max - min) and stored as a string; when
#    sc:normalize_in_range is "true", only values within [0, 1] are kept.
# 4. If sc:nonempty is "true", results with an empty-string value are dropped.
# 5. If nested selectors exist, each is run on every result and the combined
#    output is flattened; otherwise the results are returned directly.
#
# The hashes returned by +filter+ are never modified; changed values are
# written to merged copies.
#
# @param doc [Hash] fragment to process; :uri, :content and :value are read
#   for debug output, and the whole hash is handed to +filter+
# @return [Array<Hash>] the processed result fragments
def select(doc)
  # Evaluated once and reused for both debug sections. +to_s+ keeps a nil
  # :value from raising while matching against the debug key.
  tracing = sc::debug.first == "true" && Scrappy::Agent::Options.debug &&
    (Scrappy::Agent::Options.debug_key.nil? ||
     doc[:value].to_s.downcase.include?(Scrappy::Agent::Options.debug_key))

  if tracing
    puts '== DEBUG'
    puts '== Selector:'
    puts node.serialize(:yarf, false)
    puts '== On fragment:'
    puts "URI: #{doc[:uri]}"
    puts "Content: #{doc[:content]}"
    puts "Value: #{doc[:value]}"
  end

  # Process selector
  # Filter method is defined in each subclass
  results = filter doc

  if sc::boolean.first == "true"
    affirmations = %w[yes true]
    negations    = %w[no none false - --]
    results = results.map do |r|
      text = r[:value].to_s.downcase
      if negations.include?(text)
        r.merge(:value => "false")
      elsif affirmations.include?(text)
        r.merge(:value => "true")
      end
      # Anything else yields nil and is discarded by +compact+ below.
    end.compact
  end

  if sc::normalize_max.first
    max  = sc::normalize_max.first.to_f
    min  = sc::normalize_min.first.to_f
    span = max - min
    # Pair each result with its numeric value so the range check sees the
    # real float. If max == min the division is by 0.0 and yields NaN or
    # Infinity, which never satisfies the comparisons below.
    scaled = results.map { |r| [r, (r[:value].to_f - min) / span] }
    if sc::normalize_in_range.first == "true"
      scaled = scaled.select { |_, v| v >= 0.0 && v <= 1.0 }
    end
    results = scaled.map { |r, v| r.merge(:value => v.to_s) }
  end

  if sc::nonempty.first == "true"
    results = results.reject { |r| r[:value] == "" }
  end

  if tracing
    puts "== No results" if results.empty?
    results.each_with_index do |result, i|
      puts "== Result ##{i}:"
      puts "URI: #{result[:uri]}"
      puts "Content: #{result[:content]}"
      puts "Value: #{result[:value].inspect}"
    end
    puts
  end

  # Return results if no nested selectors
  return results if sc::selector.empty?

  # Process nested selectors
  results.map do |result|
    sc::selector.map { |s| graph.node(s).select result }
  end.flatten
end