Class: ExtraLoop::DomExtractor

Inherits:
ExtractorBase show all
Defined in:
lib/extraloop/dom_extractor.rb

Instance Attribute Summary

Attributes inherited from ExtractorBase

#field_name

Instance Method Summary collapse

Methods inherited from ExtractorBase

#initialize

Constructor Details

This class inherits a constructor from ExtraLoop::ExtractorBase

Instance Method Details

#extract_field(node, record = nil) ⇒ Object

Public: Runs the extractor against a document fragment (dom node or object).

node   - The document fragment (a DOM node, or raw input to be parsed).
record - The extracted record (optional).

Returns the text content of the element, or the output of the extractor’s callback.



12
13
14
15
16
17
18
19
20
21
# File 'lib/extraloop/dom_extractor.rb', line 12

# Public: Runs the extractor against a document fragment.
#
# node   - The document fragment. Anything that does not respond to
#          #document is first passed through #parse.
# record - The extracted record, forwarded to the callback (default: nil).
#
# Returns the callback's output or the selected value; if that value
# responds to #text, its text content is returned instead.
def extract_field(node, record = nil)
  node = parse(node) unless node.respond_to?(:document)

  # Narrow down to the first match when a selector is configured.
  target = @selector ? node.at(@selector) : node
  target = target.attr(@attribute) if target.respond_to?(:attr) && @attribute
  target = @environment.run(target, record, &@callback) if @callback

  # Anything that still looks like a DOM node is reduced to its text.
  target.respond_to?(:text) ? target.text : target
end

#extract_list(input) ⇒ Object

Public: Extracts a list of document fragments matching the provided selector/callback

input - a document (either as a string or as a parsed Nokogiri document)

Returns an array of elements matching the specified selector or function



32
33
34
35
36
37
# File 'lib/extraloop/dom_extractor.rb', line 32

# Public: Extracts a list of document fragments matching the configured
# selector and/or callback.
#
# input - A document. Anything that does not respond to #document is
#         first passed through #parse.
#
# Returns the callback's result coerced to an Array when a callback is set;
# otherwise the nodes matched by the selector, or every element ("*") when
# neither a selector nor a callback is configured.
def extract_list(input)
  nodes = input.respond_to?(:document) ? input : parse(input)

  if @selector
    nodes = nodes.search(@selector)
  elsif !@callback
    # Nothing to filter with: fall back to all elements.
    nodes = nodes.css("*")
  end

  return nodes unless @callback

  Array(@environment.run(nodes, &@callback))
end

#is_xml(input) ⇒ Object



44
45
46
# File 'lib/extraloop/dom_extractor.rb', line 44

# Public: Checks whether the input starts with an XML declaration.
#
# The declaration may use single or double quotes around the version and
# may carry further pseudo-attributes (e.g. encoding, standalone), as in
# `<?xml version="1.0" encoding="UTF-8"?>`. Only leading whitespace may
# precede it; a declaration appearing later in the text is not counted.
#
# input - The raw document String.
#
# Returns 0 (truthy) when a leading XML declaration is found, nil otherwise.
def is_xml(input)
  input =~ /\A\s*<\?xml\s+version\s*=\s*(["'])\d\.\d\1[^>]*\?>/
end

#parse(input) ⇒ Object



39
40
41
42
# File 'lib/extraloop/dom_extractor.rb', line 39

# Public: Parses raw input into a Nokogiri document and stores it on the
# environment.
#
# The superclass implementation is invoked first (its behaviour is not
# visible here). The input is then parsed as XML when #is_xml matches,
# and as HTML otherwise.
#
# input - The raw document String.
#
# Returns the parsed Nokogiri document (also assigned to
# @environment.document).
def parse(input)
  super(input)

  document =
    if is_xml(input)
      Nokogiri::XML(input)
    else
      Nokogiri::HTML(input)
    end

  @environment.document = document
end