Module: Scrappy::Extractor

Included in:
Agent
Defined in:
lib/scrappy/extractor/extractor.rb

Instance Method Summary collapse

Instance Method Details

#extract(uri, html, kb, referenceable = nil) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/scrappy/extractor/extractor.rb', line 10

# Runs every fragment selected by fragments_for for +uri+ against the given
# HTML and gathers the triples found in the graphs of the extracted nodes.
# All of the work happens inside the synchronize block.
#
# uri           - document address; shown in the debug message and handed to
#                 each fragment as options[:doc][:uri].
# html          - markup, parsed with Nokogiri::HTML using 'utf-8'.
# kb            - knowledge base; only its triples are read, to build a new
#                 RDF::Graph that is used for the rest of the call.
# referenceable - forwarded untouched as options[:referenceable] (default nil).
#
# Returns an Array holding the triples of every extracted node's graph.
def extract(uri, html, kb, referenceable = nil)
  synchronize do
    if options.debug
      print "Extracting #{uri}..."
      $stdout.flush
    end

    # Rebuild the graph from the raw triples so stateful selectors restart
    graph = RDF::Graph.new(kb.triples)

    # Parse the document once; every fragment receives the same parsed content
    document = Nokogiri::HTML(html, nil, 'utf-8')
    extraction = {
      :doc           => { :uri => uri, :content => document },
      :referenceable => referenceable
    }

    # Run each applicable fragment and accumulate what it yields
    collected = []
    fragments_for(graph, uri).each do |fragment|
      graph.node(fragment).extract(extraction).each do |node|
        collected.concat(node.graph.triples)
      end
    end

    puts "done!" if options.debug

    collected
  end
end

#fragments_for(kb, uri) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/scrappy/extractor/extractor.rb', line 37

# Picks the root fragments of +kb+ that should be run against +uri+.
#
# Root fragments are the nodes typed sc:Fragment minus whatever
# kb.find([], sc:subfragment, nil) returns (presumably the fragments nested
# inside another fragment -- confirm against the knowledge-base API).
#
# A root fragment is selected when one of its sc:selector values is either
#   * typed sc:UriSelector or sc:UriPatternSelector and
#     kb.node(selector).filter(:uri => uri) is not empty, or
#   * typed sc:VisualSelector (no uri check is made for these).
#
# A selector may belong to several fragments and a fragment may own several
# matching selectors: every owning fragment is returned, each exactly once.
# Fragments reached through uri selectors come first, followed by those
# reached only through visual selectors.
#
# kb  - knowledge base responding to find and node.
# uri - address the uri selectors are filtered with.
#
# Returns an Array of fragment nodes without duplicates.
def fragments_for(kb, uri)
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
                   kb.find([], Node('sc:subfragment'), nil)

  # Keep every owner of a selector: a plain selector => fragment mapping would
  # silently drop all but the last fragment sharing that selector.
  selectors = []
  owners = Hash.new { |hash, selector| hash[selector] = [] }
  root_fragments.each do |fragment|
    fragment.sc::selector.each do |selector|
      selectors << selector
      owners[selector] << fragment
    end
  end
  selectors = selectors.uniq

  uri_selectors = selectors.select do |selector|
    types = selector.rdf::type
    (types.include?(Node('sc:UriSelector')) || types.include?(Node('sc:UriPatternSelector'))) &&
      !kb.node(selector).filter(:uri => uri).empty?
  end

  visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }

  # uniq: a fragment matched through several selectors must be extracted once
  (uri_selectors + visual_selectors).flat_map { |selector| owners[selector] }.uniq
end