Module: Scrappy::Extractor
- Included in:
- Agent
- Defined in:
- lib/scrappy/extractor/extractor.rb
Instance Method Summary
Instance Method Details
#extract(uri, html, kb, referenceable = nil) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/scrappy/extractor/extractor.rb', line 10

# Extracts triples from an HTML document, using the fragments in +kb+ that
# #fragments_for selects for +uri+. The whole operation runs inside
# +synchronize+.
#
# NOTE(review): the rendered listing this method was recovered from had lost
# an identifier in four places (`if .debug`, `= { :doc => ... }`, `extract()`
# and `self..debug`). It is restored here as +options+, inferred from the
# `self.<name>.debug` call shape -- confirm against the original file. The
# hash built below is given its own local name so that it does not shadow the
# +options+ reader used for the debug flag.
#
# @param uri            URI of the document; stored under :doc/:uri and used to pick fragments
# @param html           markup handed to Nokogiri::HTML (parsed as utf-8)
# @param kb             knowledge base; only +kb.triples+ is read, and a new RDF::Graph is built from it
# @param referenceable  forwarded untouched under the :referenceable key
# @return [Array] the +node.graph.triples+ of every extracted node, concatenated in order
def extract(uri, html, kb, referenceable = nil)
  synchronize do
    if options.debug
      print "Extracting #{uri}..."
      $stdout.flush
    end

    # Restart stateful selectors
    kb = RDF::Graph.new(kb.triples)

    # Parse document
    content = Nokogiri::HTML(html, nil, 'utf-8')

    # Extract each fragment
    extraction_options = {
      :doc           => { :uri => uri, :content => content },
      :referenceable => referenceable
    }

    triples = []
    fragments_for(kb, uri).each do |fragment|
      kb.node(fragment).extract(extraction_options).each do |node|
        # concat appends in place; `triples += ...` would copy the whole
        # accumulated array on every iteration.
        triples.concat(node.graph.triples)
      end
    end

    puts "done!" if options.debug

    triples
  end
end
#fragments_for(kb, uri) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/scrappy/extractor/extractor.rb', line 37

# Lists the root fragments of +kb+ that apply to +uri+.
#
# Root fragments are the results of looking up nodes typed sc:Fragment, minus
# the results of the sc:subfragment lookup (presumably the fragments that are
# nested inside another one -- confirm against the semantics of +kb.find+).
#
# A root fragment is returned once for each of its sc:selector values that is
# * typed sc:UriSelector or sc:UriPatternSelector and whose
#   <tt>filter(:uri => uri)</tt> result is non-empty, or
# * typed sc:VisualSelector (no URI check is made for these).
#
# NOTE(review): because the result is built per selector, a fragment with
# several matching selectors appears several times; and when two fragments
# declare the same selector, the later fragment wins the lookup.
#
# @param kb   knowledge base; must respond to +find+ and +node+
# @param uri  URI passed to each URI selector's +filter+
# @return [Array] fragments for matching URI selectors first, followed by
#   fragments for visual selectors
def fragments_for(kb, uri)
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
                   kb.find([], Node('sc:subfragment'), nil)

  # Remember which root fragment declared each selector, keeping the
  # selectors themselves in declaration order.
  selectors = []
  fragments = {}
  root_fragments.each do |fragment|
    fragment.sc::selector.each do |selector|
      fragments[selector] = fragment
      selectors << selector
    end
  end

  uri_types   = [Node('sc:UriSelector'), Node('sc:UriPatternSelector')]
  visual_type = Node('sc:VisualSelector')

  # URI-based selectors only count when they accept the given URI.
  uri_selectors = selectors.select do |selector|
    uri_types.any? { |type| selector.rdf::type.include?(type) } &&
      !kb.node(selector).filter(:uri => uri).empty?
  end

  visual_selectors = selectors.select do |selector|
    selector.rdf::type.include?(visual_type)
  end

  (uri_selectors + visual_selectors).map { |selector| fragments[selector] }
end