11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
# File 'lib/treat/workers/processors/chunkers/html.rb', line 11
# Recursively walks the children of +html_node+ and appends matching
# Treat entities to +node+.
#
# Mapping applied to each non-text child element:
# * +h0+..+h9+  -> opens a new Section (tagged with +:level+) and adds
#   a Title carrying the element's inner text to it.
# * +p+ shorter than 45 characters, when the current node's parent is
#   a section -> added as a Title (no new section is opened).
# * any other +p+ -> Zone built with +Zone.from_string+.
# * +ul+ / +ol+ -> a List; only the element's own descendants are
#   placed inside it.
# * +li+ -> zone built with +Entity.zone_from_string+.
# Every other element is only descended into.
#
# NOTE(review): relies on <tt>node << entity</tt> returning the entity
# that was appended (the result is used as the new insertion point) --
# confirm against the entity tree implementation.
#
# @param node      entity that receives the created entities
# @param html_node parsed HTML node whose children are visited
# @param level     heading level of the section currently being filled
# @return whatever <tt>html_node.children.each</tt> returns
def self.recurse(node, html_node, level = 1)
  html_node.children.each do |child|
    next if child.name == 'text'

    txt = child.inner_text
    # Explicit MatchData instead of the implicit $1 global; it is nil
    # when the element is not a heading.
    heading = /\Ah([0-9])\z/.match(child.name)
    # Insertion point for this child's descendants when it differs from
    # +node+ (only lists). Kept separate so that siblings following a
    # list are not swallowed by the List entity.
    container = nil

    if heading ||
       (child.name == 'p' && txt.length < 45 &&
        node.parent && node.parent.type == :section)
      if heading
        lvl = heading[1].to_i
        if lvl <= level
          # Same or shallower heading: climb to the enclosing section
          # one level above the new heading, if there is one. Sections
          # without an explicit level count as level 1.
          # NOTE(review): when no such ancestor exists (e.g. a second
          # h1) the new section nests under the current node.
          node.ancestors_with_type(:section).each do |s|
            l = s.has?(:level) ? s.level : 1
            node = s if l == lvl - 1
          end
        end
        # The new section becomes the insertion point for the title and
        # for all following siblings.
        node = node << Treat::Entities::Section.new
        level = lvl
        node.set :level, level
      end
      node << Treat::Entities::Title.new(txt)
    elsif child.name == 'p'
      node << Treat::Entities::Zone.from_string(txt)
    elsif %w[ul ol].include?(child.name)
      container = node << Treat::Entities::List.new
    elsif child.name == 'li'
      node << Treat::Entities::Entity.zone_from_string(txt)
    end

    recurse(container || node, child, level) unless child.children.empty?
  end
end
|