Module: Scrappy::Extractor

Included in:
Agent
Defined in:
lib/scrappy/extractor/extractor.rb

Instance Method Summary collapse

Instance Method Details

#extract(uri, html, kb, referenceable = nil) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/scrappy/extractor/extractor.rb', line 10

# Extracts RDF triples from an HTML document using the mappings held in +kb+.
#
# The whole operation runs inside +synchronize+. When +options.debug+ is set,
# progress is printed to standard output before and after the extraction.
#
# uri           - URI of the document; passed to the fragments and used to
#                 pick which fragments apply.
# html          - HTML source handed to Nokogiri (parsed with 'utf-8').
# kb            - object responding to +triples+; it is not used directly, a
#                 new RDF::Graph is built from its triples instead.
# referenceable - forwarded untouched to the fragments under the
#                 :referenceable key (defaults to nil).
#
# Returns the +triples+ of the graph produced by +extract_graph+.
def extract(uri, html, kb, referenceable = nil)
  synchronize do
    if options.debug
      print "Extracting #{uri}..."
      $stdout.flush
    end

    # Rebuild the knowledge base from its triples before every run
    # (original note: this restarts stateful selectors).
    fresh_kb = RDF::Graph.new(kb.triples)

    document = Nokogiri::HTML(html, nil, 'utf-8')

    # Named differently from the +options+ accessor so the accessor stays
    # reachable without an explicit +self+.
    extraction_options = {
      :doc           => { :uri => uri, :content => document },
      :referenceable => referenceable
    }
    graph = extract_graph(fragments_for(fresh_kb, uri), extraction_options)

    puts "done!" if options.debug

    graph.triples
  end
end

#extract_graph(fragments, options) ⇒ Object

Extracts all mappings from each of the given fragments and returns them merged into a single graph



55
56
57
58
59
# File 'lib/scrappy/extractor/extractor.rb', line 55

# Runs every fragment's extraction and gathers the results into one graph.
#
# fragments - collection (responding to +each+) of objects whose
#             +extract(options)+ returns something that responds to +each+.
# options   - passed unchanged to each fragment's +extract+.
#
# Returns a new RDF::Graph to which every extracted result was appended,
# in fragment order.
def extract_graph(fragments, options)
  graph = RDF::Graph.new
  fragments.each do |fragment|
    fragment.extract(options).each { |extracted| graph << extracted }
  end
  graph
end

#fragments_for(kb, uri) ⇒ Object

Returns the list of fragments that have mappings for a given URI



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/scrappy/extractor/extractor.rb', line 33

# Returns the proxies of the root fragments whose selectors are picked for +uri+.
#
# Root fragments are the results of the rdf:type / sc:Fragment query minus the
# results of the sc:subfragment query (presumably the fragments nested inside
# another one -- confirm against the RDF library's +find+ semantics).
#
# A selector is picked when either:
# * it is typed sc:UriSelector or sc:UriPatternSelector and
#   +kb.node(selector).filter(:uri => uri)+ is not empty, or
# * it is typed sc:VisualSelector (no URI check is applied).
#
# kb  - knowledge base responding to +find+ and +node+.
# uri - URI the selectors are filtered against.
#
# Returns an Array with one +proxy+ per picked selector: URI-based selectors
# first, then visual ones. A fragment appears once per picked selector it owns.
def fragments_for(kb, uri)
  typed_fragments  = kb.find(nil, Node('rdf:type'), Node('sc:Fragment'))
  nested_fragments = kb.find([], Node('sc:subfragment'), nil)
  root_fragments   = typed_fragments - nested_fragments

  # Record the owning fragment of every selector while keeping the selectors
  # in encounter order.
  owner     = {}
  selectors = []
  root_fragments.each do |fragment|
    fragment.sc::selector.each do |selector|
      owner[selector] = fragment
      selectors << selector
    end
  end

  has_type = lambda { |selector, type_name| selector.rdf::type.include?(Node(type_name)) }

  # Two separate passes: first narrow by type, then apply the URI filter.
  uri_typed = selectors.select do |selector|
    has_type.call(selector, 'sc:UriSelector') || has_type.call(selector, 'sc:UriPatternSelector')
  end
  uri_selectors = uri_typed.reject { |selector| kb.node(selector).filter(:uri => uri).empty? }

  visual_selectors = selectors.select { |selector| has_type.call(selector, 'sc:VisualSelector') }

  (uri_selectors + visual_selectors).map { |selector| owner[selector].proxy }
end