Class: Roseflow::Text::SentenceSplitter

Inherits:
Splitter
  • Object
show all
Defined in:
lib/roseflow/text/sentence_splitter.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(language: "en", **kwargs) ⇒ SentenceSplitter

Returns a new instance of SentenceSplitter.



9
10
11
12
# File 'lib/roseflow/text/sentence_splitter.rb', line 9

# Builds a sentence splitter for the given language.
#
# @param language [String] language code kept in @language and later handed
#   to PragmaticSegmenter::Segmenter by #segmenter (defaults to "en")
# @param kwargs [Hash] every other keyword option, forwarded unchanged to the
#   superclass initializer
def initialize(language: "en", **kwargs)
  # NOTE(review): presumably the superclass stores chunk_size / chunk_overlap
  # from these options — confirm in Splitter.
  super(**kwargs)
  @language = language
end

Instance Attribute Details

#chunk_overlap ⇒ Object (readonly)

Returns the value of attribute chunk_overlap.



14
15
16
# File 'lib/roseflow/text/sentence_splitter.rb', line 14

# Reader for @chunk_overlap. #split passes this value to Array#last to choose
# how many trailing segments of the previous chunk are repeated at the start
# of the next chunk.
# NOTE(review): the assignment is not visible in this file — presumably done
# by the superclass initializer; confirm.
#
# @return [Object] the current value of @chunk_overlap
def chunk_overlap
  @chunk_overlap
end

#chunk_size ⇒ Object (readonly)

Returns the value of attribute chunk_size.



14
15
16
# File 'lib/roseflow/text/sentence_splitter.rb', line 14

# Reader for @chunk_size. #split compares its running size (built from
# segment.size values) against this limit and starts a new chunk once the
# limit would be exceeded.
# NOTE(review): the assignment is not visible in this file — presumably done
# by the superclass initializer; confirm.
#
# @return [Object] the current value of @chunk_size
def chunk_size
  @chunk_size
end

Instance Method Details

#segmenter(text) ⇒ Object



16
17
18
# File 'lib/roseflow/text/sentence_splitter.rb', line 16

# Builds a PragmaticSegmenter segmenter for +text+ in this splitter's language.
#
# A new segmenter is created on every call. The segmenter is constructed with
# the text it will segment, so caching the first instance would make every
# later call (with different text) return a segmenter for the stale input.
#
# @param text [String] the text to be segmented
# @return [PragmaticSegmenter::Segmenter] segmenter bound to +text+ and @language
def segmenter(text)
  PragmaticSegmenter::Segmenter.new(text: text, language: @language)
end

#split(text) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/roseflow/text/sentence_splitter.rb', line 20

# Splits +text+ into chunks made of whole segments (as produced by
# segmenter(text).segment) joined with single spaces.
#
# Segments are appended to the current chunk while the joined chunk stays
# within chunk_size. When the next segment would not fit, the chunk is closed
# and a new one is started from the last chunk_overlap segments of the closed
# chunk followed by the new segment.
#
# A segment that is longer than chunk_size on its own is never cut; it is
# emitted as (part of) an oversized chunk. No empty chunks are produced, so
# text that yields no segments returns an empty array.
#
# @param text [String] the text to split
# @return [Array<String>] the chunks, in order
def split(text)
  chunks = []
  current = []
  current_size = 0 # always equals current.join(" ").size

  segmenter(text).segment.each do |segment|
    # Length of the chunk if this segment were appended: joining costs exactly
    # one space for every segment after the first.
    projected = current.empty? ? segment.size : current_size + 1 + segment.size

    if !current.empty? && projected > chunk_size
      chunks << current
      # Array#last returns a new array, so the closed chunk is left untouched.
      current = [*current.last(chunk_overlap), segment]
      current_size = current.sum(&:size) + current.size - 1
    else
      current << segment
      current_size = projected
    end
  end

  chunks << current unless current.empty?
  chunks.map { |chunk| chunk.join(" ") }
end