Class: Splitta::Doc

Inherits:
Object
  • Object
show all
Defined in:
lib/splitta/doc.rb

Constant Summary collapse

# Delimiter pattern used to cut raw text into candidate fragments.
#
# A delimiter is one sentence-final punctuation mark, optionally followed by
# any mix of markup tags and closing quotes or brackets, and then mandatory
# whitespace. The whole delimiter is wrapped in a capture group so that
# String#split returns it between the surrounding text pieces instead of
# discarding it.
#
# The tag branch matches exactly one tag at a time. A greedy "any characters"
# body would run on to the last closing angle bracket on the line and hide
# every sentence boundary in between inside a single delimiter. Unlike the
# dot, the negated class also lets a tag span a line break.
FRAG_SPLITTER =
/
  (
    [.!?]         # sentence end punctuation
    (?:
      (?:<[^>]*>) # extra tag, one at a time
      |
      [”"')\]}]   # right-handed punctuation to retain
    )*
    \s+           # must have whitespace
  )
/ux
# Cut-off handed to Frag#over? in #segments: a fragment that is over this
# value closes the segment being accumulated.
# NOTE(review): presumably a probability in 0..1 -- confirm against Frag#over?.
SEGMENT_THRESHOLD =
0.5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, model:) ⇒ Doc

Returns a new instance of Doc.



23
24
25
26
27
28
29
30
# File 'lib/splitta/doc.rb', line 23

# Builds the fragment list for +text+ and hands the document to +model+.
#
# The text is split on FRAG_SPLITTER, which yields alternating
# [text, delimiter] pieces; each pair is glued back together so a fragment
# keeps its trailing punctuation and whitespace. Every Frag is given the
# fragment created before it (nil for the first one).
#
# @param text [String] raw text to break into fragments
# @param model [#classify] object whose +classify+ is called with this
#   document once all fragments exist
def initialize(text, model:)
  @frags = []
  text.split(FRAG_SPLITTER).each_slice(2).map(&:join).each do |chunk|
    @frags.push(Frag.new(chunk, previous_frag: @frags.last))
  end
  model.classify(self)
end

Instance Attribute Details

#frags ⇒ Object (readonly)

Returns the value of attribute frags.



21
22
23
# File 'lib/splitta/doc.rb', line 21

# Reader for the @frags array, which #initialize fills with one Frag per
# text piece produced by the split, in document order.
def frags
  @frags
end

Instance Method Details

#segments ⇒ Object

Outputs all the text, split according to predictions.



35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/splitta/doc.rb', line 35

# Lazily yields the original text, cut into segments.
#
# Fragment text is accumulated in order; whenever a fragment reports
# +over?(SEGMENT_THRESHOLD)+ the text gathered so far is emitted as one
# segment and accumulation restarts. Whatever is left after the last
# fragment is emitted too, unless it is empty.
#
# The pieces are gathered in an Array and joined, rather than written to a
# StringIO whose backing string is swapped for a literal: that reset makes
# the StringIO read-only when string literals are frozen, and gave the first
# segment a different encoding source from the later ones.
#
# @return [Enumerator] enumerator over the segment strings
def segments
  Enumerator.new do |y|
    pending = []
    frags.each do |frag|
      pending << frag.orig
      next unless frag.over?(SEGMENT_THRESHOLD)

      y << pending.join
      pending.clear
    end
    # Compare the joined text, not the array, so trailing fragments that
    # carry only empty text do not produce an empty segment.
    tail = pending.join
    y << tail unless tail.empty?
  end
end