Class: Scrubyt::ResultDumper

Inherits:
Object
  • Object
show all
Defined in:
lib/scrubyt/output/result_dumper.rb

Overview

Dumps the extraction results in various formats and provides statistics on them.

Class Method Summary collapse

Class Method Details

Prints some simple statistics on the extracted results, such as the number of instances extracted by each pattern.



79
80
81
82
83
# File 'lib/scrubyt/output/result_dumper.rb', line 79

# Print statistics about the extracted results to standard output.
#
# Emits two newlines as a leading gap, delegates the actual report to
# print_statistics_recursive (starting at depth 0), and finishes with a
# trailing newline.
#
# pattern - the pattern whose statistics are reported; it is handed to
#           print_statistics_recursive unchanged.
#
# Returns nil (the result of the final puts).
def self.print_statistics(pattern)
  leading_gap = "\n" * 2
  puts(leading_gap)
  print_statistics_recursive(pattern, 0)
  puts
end

.remove_empty_leaves(node) ⇒ Object



24
25
26
27
# File 'lib/scrubyt/output/result_dumper.rb', line 24

# Detach nodes that have neither child elements nor text, walking the tree
# from +node+ downwards.
#
# The emptiness check on a node happens before its children are visited, so
# a node that only becomes childless because its children were removed is
# left in place.
#
# node - the REXML node (document or element) to start from.
def self.remove_empty_leaves(node)
  empty_leaf = node.elements.empty? && node.text.nil?
  node.remove if empty_leaf
  node.elements.each do |child|
    remove_empty_leaves(child)
  end
end

.to_csv(pattern) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
# File 'lib/scrubyt/output/result_dumper.rb', line 44

# Serialize the extracted results as CSV text.
#
# The XML document built by to_xml is walked: every child element of its
# <root> element becomes one CSV line, and every non-empty element text
# found beneath it (depth first, in document order) becomes one field.
#
# Fields containing a comma, a double quote or a line break are wrapped in
# double quotes with embedded quotes doubled (RFC 4180), so such values no
# longer split into extra columns or lines. Fields without those characters
# are emitted unchanged.
#
# pattern - the pattern whose results are dumped; passed on to to_xml.
#
# Returns a String with the lines joined by "\n" (no trailing newline).
def self.to_csv(pattern)
  # RFC 4180 quoting, applied only when the field actually needs it.
  quote_field = lambda do |field|
    if field =~ /[",\r\n]/
      '"' + field.gsub('"', '""') + '"'
    else
      field
    end
  end

  # Depth-first collection of the non-empty texts of an element subtree.
  collect_fields = lambda do |element, fields|
    text = element.text || ''
    fields << text unless text == ''
    element.children.each do |child|
      collect_fields.call(child, fields) if child.is_a?(REXML::Element)
    end
    fields
  end

  rows = []
  to_xml(pattern).root.elements['/root'].each do |record|
    # Only elements carry fields; skip any other node type before calling
    # element-only methods on it.
    next unless record.is_a?(REXML::Element)
    fields = collect_fields.call(record, [])
    rows << fields.map { |field| quote_field.call(field) }.join(',')
  end
  rows.join("\n")
end

.to_hash(pattern) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/scrubyt/output/result_dumper.rb', line 56

# Convert the extracted results into an array of hashes.
#
# Each child of the <root> element produced by to_xml yields one hash. Within
# that subtree every element with non-empty text contributes an entry keyed
# by the element's local name; when the same name occurs more than once the
# texts are concatenated with a comma.
#
# pattern - the pattern whose results are dumped; passed on to to_xml.
#
# Returns an Array of Hashes mapping element names to text.
def self.to_hash(pattern)
  gather = lambda do |element, fields|
    raw_text = element.text
    value = raw_text ? REXML::Text.unnormalize(raw_text) : ''
    if element.is_a?(REXML::Element) && value != ''
      key = element.local_name
      previous = fields[key]
      # Repeated element names are merged into one comma-separated value.
      fields[key] = previous ? previous + "," + value : value
    end
    element.children.each do |child|
      gather.call(child, fields) if child.is_a?(REXML::Element)
    end
    fields
  end

  records = []
  to_xml(pattern).root.elements['/root'].each do |record|
    records << gather.call(record, {})
  end
  records
end

.to_text(pattern) ⇒ Object

Outputs the text of the pattern. If the pattern is a tree, the text is collected from its result instance node; otherwise the pattern's last_result is returned as is. TODO: throw this away!



33
34
35
36
37
38
39
40
41
42
# File 'lib/scrubyt/output/result_dumper.rb', line 33

# Produce the textual form of a pattern's last result.
#
# For a pattern whose type is :tree, every text node yielded by the last
# result's traverse_text is converted with to_s and concatenated. For any
# other type the last result is returned untouched.
#
# pattern - an object responding to last_result and type.
#
# Returns a String for :tree patterns, otherwise whatever last_result holds.
def self.to_text(pattern)
  extracted = pattern.last_result
  return extracted unless pattern.type == :tree

  text = ""
  extracted.traverse_text { |node| text += node.to_s }
  text
end

.to_xml(pattern) ⇒ Object

Output the results as XML



11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/scrubyt/output/result_dumper.rb', line 11

# Output the results as XML.
#
# Builds a new REXML document with a single <root> element. The pattern's
# last_result is treated as one or more extracted documents (it is wrapped
# in an array and flattened); for each of them, pattern.last_result is
# temporarily pointed at that document and to_xml_recursive is called with
# the shared root (to_xml_recursive is defined elsewhere; presumably it
# appends that document's results under root). Finally, empty leaves are
# pruned with remove_empty_leaves.
#
# pattern.last_result is put back to its original value afterwards, even if
# an error is raised while building, so that dumping the same pattern again
# still sees every extracted document instead of only the last one.
#
# pattern - the pattern whose results are dumped; must respond to
#           last_result and last_result=.
#
# Returns the REXML::Document, which is also remembered in @@last_doc.
def self.to_xml(pattern)
  doc = REXML::Document.new
  root = REXML::Element.new('root')
  doc.add_element(root)
  all_extracted_docs = pattern.last_result
  begin
    [all_extracted_docs].flatten.each do |lr|
      pattern.last_result = lr
      to_xml_recursive(pattern, root)
    end
  ensure
    # Undo the temporary reassignment made inside the loop.
    pattern.last_result = all_extracted_docs
  end
  remove_empty_leaves(doc)
  @@last_doc = doc
end