Class: Sc::SectionSelector

Inherits:
Selector show all
Defined in:
lib/scrappy/extractor/selectors/section.rb

Instance Method Summary collapse

Methods inherited from Selector

#select

Methods included from Scrappy::Formats

#format

Instance Method Details

#filter(doc) ⇒ Object



3
4
5
6
7
8
9
10
11
12
# File 'lib/scrappy/extractor/selectors/section.rb', line 3

# Extracts the content that follows each heading whose text matches one of
# the patterns returned by +rdf::value+.
#
# A section starts right after the matching heading and runs through the
# heading's following siblings, stopping before the first sibling that has
# the same tag name as the heading or is a +div+.
#
# Matching is case-insensitive and ignores surrounding whitespace on both
# the heading text and the pattern.
#
# @param doc [Hash] must provide +:content+ (an object answering +search+
#   with a CSS selector) and +:uri+ (copied into every result and handed to
#   +format+)
# @return [Array<Hash>] one hash per matching heading with the keys +:uri+,
#   +:content+ (the sibling nodes making up the section) and +:value+ (the
#   formatted, non-blank nodes joined by blank lines)
def filter(doc)
  rdf::value.flat_map do |pattern|
    # Heading text is compared downcased and stripped, so the pattern has to
    # be normalised the same way; otherwise a pattern such as "Introduction"
    # could never match any heading.
    wanted = pattern.to_s.downcase.strip

    headings = doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select do |heading|
      heading.parent.name != 'script' && heading.text.downcase.strip == wanted
    end

    headings.map do |heading|
      siblings  = heading.parent.children
      following = siblings[(siblings.index(heading) + 1)..-1]

      # The section ends at the next heading of the same level or at a div.
      content = following.take_while { |node| node.name != heading.name && node.name != 'div' }

      formatted = content.map { |node| format(node, sc::format, doc[:uri]) }
      value     = formatted.reject { |text| text.to_s.strip.empty? }.join("\n\n")

      { :uri => doc[:uri], :content => content, :value => value }
    end
  end
end