Class: Konjak::HtmlSegmentor

Inherits:
Segmentor show all
Defined in:
lib/konjak/html_segmentor.rb

Constant Summary collapse

# Patterns used to cut HTML content into segments. Each pattern matches one
# complete element (opening tag, body, closing tag); the `m` flag lets the
# body span several lines. The order matters to callers that apply the
# patterns in sequence: text-level elements come before the generic <div>
# container, so nested paragraphs/headings are split out of a <div> first.
#
# The array is frozen so the shared constant cannot be mutated at runtime.
SEGMENTS_PATTERNS =
[
  # Text-level element without attributes, e.g. <p>...</p>.
  %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td)>(.*?)</\k<start>>}m,
  # Same elements when the opening tag carries attributes.
  %r{<(?<start>p|h1|h2|h3|h4|h5|h6|li|title|td) [^>]*?>(.*?)</\k<start>>}m,
  # <div> without attributes.
  %r{<div>(.*?)</div>}m,
  # <div> with attributes.
  %r{<div [^>]*?>(.*?)</div>}m
].freeze

Instance Attribute Summary

Attributes inherited from Segmentor

#content, #options

Instance Method Summary collapse

Methods inherited from Segmentor

#initialize

Constructor Details

This class inherits a constructor from Konjak::Segmentor

Instance Method Details

#segments ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/konjak/html_segmentor.rb', line 10

# Splits the content into an ordered list of non-empty string pieces.
#
# Starting from a copy of +content+, every piece is partitioned around the
# first match of each pattern in SEGMENTS_PATTERNS (text before the match,
# the match itself, text after the match). Passes over the pattern list are
# repeated until a full pass no longer increases the number of pieces.
# Joining the returned pieces yields the original content.
#
# @return [Array<String>] the pieces in document order, empty strings removed
def segments
  pieces = [content.dup]

  loop do
    count_before = pieces.size

    SEGMENTS_PATTERNS.each do |regex|
      # partition yields [before, match, after]; unused slots are "".
      pieces = pieces.flat_map { |piece| piece.partition(regex) }
                     .reject(&:empty?)
    end

    # A pass that produced no new pieces means nothing is left to split.
    break if pieces.size == count_before
  end

  pieces
end