Class: Roseflow::Text::SentenceSplitter
- Defined in:
- lib/roseflow/text/sentence_splitter.rb
Instance Attribute Summary collapse
-
#chunk_overlap ⇒ Object
readonly
Returns the value of attribute chunk_overlap.
-
#chunk_size ⇒ Object
readonly
Returns the value of attribute chunk_size.
Instance Method Summary collapse
-
#initialize(language: "en", **kwargs) ⇒ SentenceSplitter
constructor
A new instance of SentenceSplitter.
- #segmenter(text) ⇒ Object
- #split(text) ⇒ Object
Constructor Details
#initialize(language: "en", **kwargs) ⇒ SentenceSplitter
Returns a new instance of SentenceSplitter.
9 10 11 12 |
# File 'lib/roseflow/text/sentence_splitter.rb', line 9

# Builds a sentence splitter for the given language.
#
# @param language [String] language code later handed to the sentence
#   segmenter (defaults to "en")
# @param kwargs [Hash] every other keyword option, forwarded unchanged to
#   the superclass constructor
def initialize(language: "en", **kwargs)
  # The superclass runs first so its own setup happens before this class
  # records its language.
  super(**kwargs)

  @language = language
end
Instance Attribute Details
#chunk_overlap ⇒ Object (readonly)
Returns the value of attribute chunk_overlap.
14 15 16 |
# File 'lib/roseflow/text/sentence_splitter.rb', line 14

# Read-only accessor for the chunk overlap setting.
#
# @return [Object] the current value of @chunk_overlap (nil when unset)
def chunk_overlap
  @chunk_overlap
end
#chunk_size ⇒ Object (readonly)
Returns the value of attribute chunk_size.
14 15 16 |
# File 'lib/roseflow/text/sentence_splitter.rb', line 14

# Read-only accessor for the chunk size setting.
#
# @return [Object] the current value of @chunk_size (nil when unset)
def chunk_size
  @chunk_size
end
Instance Method Details
#segmenter(text) ⇒ Object
16 17 18 |
# File 'lib/roseflow/text/sentence_splitter.rb', line 16

# Builds a sentence segmenter for the given text in this splitter's language.
#
# A segmenter is constructed around one specific text, so a new instance is
# created on every call. Caching it in an instance variable would make later
# calls with a different text silently return the segmenter for the first one.
#
# @param text [String] the text to be segmented
# @return [PragmaticSegmenter::Segmenter] a segmenter bound to +text+
def segmenter(text)
  PragmaticSegmenter::Segmenter.new(text: text, language: @language)
end
#split(text) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/roseflow/text/sentence_splitter.rb', line 20

# Splits text into chunks made of whole sentences.
#
# Sentences are appended to the current chunk while the chunk, joined with
# single spaces, stays within +chunk_size+ (measured with String#size). When
# the next sentence would not fit, the chunk is closed and a new one is
# started from the last +chunk_overlap+ sentences of the closed chunk plus
# the new sentence.
#
# A chunk can still exceed +chunk_size+ when a single sentence, or the
# overlap plus one sentence, is longer than the limit; sentences are never
# cut. Empty chunks are never emitted, so empty input yields an empty array.
#
# @param text [String] the text to split
# @return [Array<String>] the chunks, each a space-joined run of sentences
def split(text)
  chunks = []
  current = []
  current_size = 0 # exact length of current.join(" ")

  segmenter(text).segment.each do |sentence|
    if current.empty?
      # A chunk always accepts its first sentence, even an oversized one;
      # otherwise an empty chunk would be emitted ahead of it.
      current << sentence
      current_size = sentence.size
    elsif current_size + 1 + sentence.size > chunk_size
      # +1 is the joining space. Close the chunk and seed the next one with
      # the trailing overlap sentences.
      chunks << current
      current = current.last(chunk_overlap) + [sentence]
      current_size = current.sum(&:size) + current.size - 1
    else
      current << sentence
      current_size += 1 + sentence.size
    end
  end

  chunks << current unless current.empty?
  chunks.map { |chunk| chunk.join(" ") }
end