Module: PertinentParser

Defined in:
lib/pertinent_parser.rb

Class Method Summary collapse

Class Method Details

.html(html) ⇒ Object

Parses the HTML with a custom traversal, rather than relying on a stock one, so that we can manipulate the HTML representation however we like.



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/pertinent_parser.rb', line 17

# Builds a text object from an HTML string, attaching one wrap rule per
# non-text node encountered while walking the parsed document.
#
# The running offset counts the characters of the text nodes visited so far,
# so each non-text node is given the half-open range that starts at the
# current offset and spans the length of that node's inner text.
#
# @param html the HTML source handed to Hpricot
# @return the object produced by `text` for the document's inner text
def html(html)
  document = Hpricot(html)
  offset = 0
  result = text(document.inner_text)
  document.traverse_all_element do |node|
    if node.text?
      # Text nodes only advance the character offset.
      offset += node.inner_text.size
    else
      span = offset...(offset + node.inner_text.size)
      # NOTE(review): the value of `+` is discarded, so this presumably
      # registers the rule on `result` in place -- confirm against Text#+.
      result + wrap_(span, node.stag)
    end
  end
  result
end

.new_replace(context, target, number, replacement) ⇒ Object



72
73
74
75
76
# File 'lib/pertinent_parser.rb', line 72

# Creates a replacement rule for one occurrence of `target` inside `context`.
#
# @param context the text searched by range_from_specification
# @param target the text to locate
# @param number which occurrence to use, as interpreted by range_from_specification
# @param replacement the payload given to the :replacement transform
# @return [Rule] a rule pairing the located range with the replacement transform
def new_replace(context, target, number, replacement)
  span = range_from_specification(context, target, number)
  Rule.new(span, Transform.new(:replacement, replacement))
end

.new_wrap(context, target, number, tag) ⇒ Object



57
58
59
60
# File 'lib/pertinent_parser.rb', line 57

# Creates a wrap rule for one occurrence of `target` inside `context`.
#
# @param context the text searched by range_from_specification
# @param target the text to locate
# @param number which occurrence to use, as interpreted by range_from_specification
# @param tag the opening tag forwarded to wrap_
# @return whatever wrap_ returns for the located range and tag
def new_wrap(context, target, number, tag)
  wrap_(range_from_specification(context, target, number), tag)
end

.offset_to_r(o) ⇒ Object



41
42
43
# File 'lib/pertinent_parser.rb', line 41

# Converts a [start, end) offset pair into an inclusive Range.
#
# @param o an indexable pair whose element 0 is the start offset and whose
#   element 1 is the exclusive end offset
# @return [Range] the inclusive range from o[0] through o[1] - 1
def offset_to_r(o)
  first_index = o[0]
  last_index = o[1] - 1
  first_index..last_index
end

.range_from_specification(context, target, number) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
# File 'lib/pertinent_parser.rb', line 45

# Locates the `number`-th occurrence (1-based) of the literal string `target`
# in `context` and returns its character range.
#
# Occurrences may overlap: after each hit the search resumes one character
# past the start of that hit, not past its end.
#
# @param context the text to search; must respond to `match(regexp, position)`
# @param target [String] literal text to find (regexp metacharacters are escaped)
# @param number [Integer] which occurrence to return, counting from 1
# @return [Range, nil] the range built by offset_to_r for the occurrence, or
#   nil when `context` holds fewer than `number` occurrences
def range_from_specification(context, target, number)
  pattern = Regexp.new(Regexp.escape(target))
  position = 0
  count = 0
  while (match = context.match(pattern, position))
    offsets = match.offset(0)
    count += 1
    return offset_to_r(offsets) if count == number

    # Jump directly to one character past this match's start; every earlier
    # position would only rediscover the same match.
    position = offsets[0] + 1
  end
  nil
end

.rule(range, transform) ⇒ Object



62
63
64
# File 'lib/pertinent_parser.rb', line 62

# Convenience constructor: builds a Rule from the given range and transform.
#
# @param range the range handed to Rule.new
# @param transform the transform handed to Rule.new
# @return [Rule] the newly constructed rule
def rule(range, transform)
  Rule.new(range, transform)
end

.text(s) ⇒ Object



34
35
36
37
38
39
# File 'lib/pertinent_parser.rb', line 34

# Wraps a string in a Text object whose rule is an identity transform
# covering the whole string.
#
# @param s the source string; its `size` determines the rule's range
# @return [Text] a Text built from `s` with its `rule` set to the identity rule
def text(s)
  whole_span = 0..(s.size - 1)
  identity_rule = Rule.new(whole_span, Transform.new(:identity, ["id"]))
  wrapped = Text.new(s)
  wrapped.rule = identity_rule
  wrapped
end

.wrap_(range, tag) ⇒ Object



67
68
69
70
# File 'lib/pertinent_parser.rb', line 67

# Builds a wrap rule that surrounds `range` with `tag` and its closing tag.
#
# The closing tag is derived from the opening tag: the run of non-whitespace
# characters after "<", up to the first whitespace or ">", is taken as the
# element name.
#
# @param range the range the rule applies to
# @param tag [String] the opening tag, e.g. "<b>"; a tag that does not match
#   the pattern makes `match` return nil, so the `[1]` lookup raises
# @return [Rule] a rule pairing `range` with the :wrap transform
def wrap_(range, tag)
  element_name = tag.match(/<(\S*)(\s|>)/)[1]
  closing_tag = "</" + element_name + ">"
  Rule.new(range, Transform.new(:wrap, [tag, closing_tag]))
end