Module: Scrapes::Hpricot::Extractors

Included in:
Page
Defined in:
lib/scrapes/hpricot.rb

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.content(node) ⇒ Object

Returns the text of any child text nodes concatenated.



47
48
49
# File 'lib/scrapes/hpricot.rb', line 47

# Concatenates the text of the direct child text nodes of +node+.
#
# Delegates to text_process with a String accumulator; the block asks each
# child for its +content+. Children for which that call fails are skipped
# by text_process, which swallows errors raised while appending.
#
# @param node the node (or collection of nodes) to extract text from
# @return [Object] whatever text_process produces for +node+
def content(node)
  text_process(node, String) { |child| child.content }
end

.contents(node) ⇒ Object

Returns the text of any child text nodes as an Array.



53
54
55
# File 'lib/scrapes/hpricot.rb', line 53

# Collects the text of the direct child text nodes of +node+ into an Array.
#
# Delegates to text_process with an Array accumulator; the block asks each
# child for its +content+. Children for which that call fails are skipped
# by text_process, which swallows errors raised while appending.
#
# @param node the node (or collection of nodes) to extract text from
# @return [Object] whatever text_process produces for +node+
def contents(node)
  text_process(node, Array) { |child| child.content }
end

.text(node) ⇒ Object

Returns the text of any child text nodes recursively concatenated.



35
36
37
# File 'lib/scrapes/hpricot.rb', line 35

# Recursively concatenates the text found beneath +node+.
#
# Delegates to text_process with a String accumulator; every child is
# processed by calling this method again, so nested elements contribute
# their own text to the result.
#
# @param node the node (or collection of nodes) to extract text from
# @return [Object] whatever text_process produces for +node+
def text(node)
  text_process(node, String) { |child| text(child) }
end

.text_process(node, klass, &block) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/scrapes/hpricot.rb', line 87

# Shared traversal used by the text extractors.
#
# Dispatches on the class of +node+:
# * Array / Hpricot::Elements - processes every member recursively and
#   returns the mapped collection.
# * Hpricot::Elem / Hpricot::Doc - builds a fresh +klass+ instance and
#   appends the block's result for each child to it.
# * Hpricot::Text - returns the node's content.
# * anything else - the case expression yields nil.
#
# The whole computation runs inside the block given to Extractors::unescape;
# what that helper does with the value is not visible here.
#
# @param node the node, document, text node or collection to process
# @param klass [Class] accumulator class; must respond to +new+ and its
#   instances to +<<+ (callers in this file pass String or Array)
# @param block called with each child of an element; its result is appended
# @return [Object] the value returned by Extractors::unescape
def text_process(node, klass, &block)
  Extractors::unescape do
    case node
    when Array, ::Hpricot::Elements
      node.map { |member| text_process(member, klass, &block) }
    when ::Hpricot::Elem, ::Hpricot::Doc
      kids = node.children
      gathered = klass.new
      kids.each do |kid|
        begin
          gathered << block.call(kid)
        rescue StandardError
          # Best effort: a child whose block call or append fails is skipped.
          nil
        end
      end
      gathered
    when ::Hpricot::Text
      node.content
    end
  end
end

.texts(node) ⇒ Object

Returns the text of any child text nodes recursively as nested Array.



41
42
43
# File 'lib/scrapes/hpricot.rb', line 41

# Recursively collects the text found beneath +node+ as a nested Array.
#
# Delegates to text_process with an Array accumulator; every child is
# processed by calling this method again, so the nesting of the result
# follows the nesting of the elements.
#
# @param node the node (or collection of nodes) to extract text from
# @return [Object] whatever text_process produces for +node+
def texts(node)
  text_process(node, Array) { |child| texts(child) }
end

.word(node) ⇒ Object

The result of text() with whitespace reduced to single spaces and stripped.



59
60
61
# File 'lib/scrapes/hpricot.rb', line 59

# The result of text() with every run of whitespace reduced to a single
# space and leading/trailing whitespace removed.
#
# Normalization is applied once to the complete text rather than to each
# child separately: stripping every child before concatenating would drop
# the whitespace that separates sibling nodes (e.g. "Hello <b>world</b>"
# would collapse to "Helloworld").
#
# @param node the node (or collection of nodes) to extract text from
# @return [String, Array, nil] the normalized text; an Array (possibly
#   nested) of normalized strings when text() returns an Array; any
#   non-String value returned by text() (such as nil) is passed through
def word(node)
  normalize = lambda do |value|
    case value
    when Array  then value.map { |item| normalize.call(item) }
    when String then value.gsub(/\s+/, ' ').strip
    else value
    end
  end
  normalize.call(text(node))
end

.words(node) ⇒ Object

The result of texts() flattened, with nils removed, whitespace reduced to single spaces, each string stripped, and all blank? strings rejected.



66
67
68
# File 'lib/scrapes/hpricot.rb', line 66

# Flat list of the non-blank text fragments found beneath +node+.
#
# Takes the nested result of texts(), flattens it, drops nils, reduces
# every whitespace run in each fragment to a single space, strips the
# fragment, and finally rejects fragments that are blank?.
#
# @param node the node (or collection of nodes) to extract text from
# @return [Array<String>] normalized, non-blank text fragments
def words(node)
  fragments = texts(node).flatten.compact
  tidied = fragments.map { |fragment| fragment.gsub(/\s+/,' ').strip }
  tidied.reject { |fragment| fragment.blank? }
end

Instance Method Details

#xml(node) ⇒ Object

Just return the yielded node.



72
73
74
# File 'lib/scrapes/hpricot.rb', line 72

# Identity extractor: hands back the node it was given, unchanged.
#
# @param node the node to return
# @return [Object] the very same +node+
def xml(node)
  node
end