Class: Ragdoll::TextChunker
- Inherits:
-
Object
- Object
- Ragdoll::TextChunker
- Defined in:
- app/services/ragdoll/text_chunker.rb
Constant Summary collapse
- DEFAULT_CHUNK_SIZE =
1000
- DEFAULT_CHUNK_OVERLAP =
200
Class Method Summary collapse
Instance Method Summary collapse
- #chunk ⇒ Object
-
#initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP) ⇒ TextChunker
constructor
A new instance of TextChunker.
Constructor Details
#initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP) ⇒ TextChunker
Returns a new instance of TextChunker.
12 13 14 15 16 |
# File 'app/services/ragdoll/text_chunker.rb', line 12
# Builds a chunker for the given text.
#
# @param text [#to_s] content to split; stored as its +to_s+ form, so +nil+ becomes ""
# @param chunk_size [Object] requested chunk length, stored as given
# @param chunk_overlap [Object] requested overlap between chunks, stored as given
def initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
  @text, @chunk_size, @chunk_overlap = text.to_s, chunk_size, chunk_overlap
end
Class Method Details
.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP) ⇒ Object
8 9 10 |
# File 'app/services/ragdoll/text_chunker.rb', line 8
# Convenience entry point: builds a chunker with the given settings and
# returns the result of calling #chunk on it.
#
# @param text [Object] content to split, passed through to the constructor
# @param chunk_size [Object] passed through to the constructor
# @param chunk_overlap [Object] passed through to the constructor
# @return [Object] whatever the instance-level #chunk returns
def self.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
  chunker = new(text, chunk_size: chunk_size, chunk_overlap: chunk_overlap)
  chunker.chunk
end
Instance Method Details
#chunk ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'app/services/ragdoll/text_chunker.rb', line 18
# Splits @text into stripped, possibly overlapping chunks.
#
# The chunk size and overlap are normalized first (and written back to the
# instance variables):
# * a nil or non-positive chunk size falls back to DEFAULT_CHUNK_SIZE, so an
#   invalid size can no longer make the loop discard the whole text;
# * the overlap is clamped to 0..(chunk_size - 1), so a negative overlap can
#   no longer skip text and an oversized one cannot stall the loop.
#
# @return [Array<String>] the chunks in document order; [] for empty text,
#   and the text itself (unstripped) when it fits in a single chunk
def chunk
  return [] if @text.empty?

  # Ensure chunk_size is a usable positive integer.
  @chunk_size = (@chunk_size || DEFAULT_CHUNK_SIZE).to_i
  @chunk_size = DEFAULT_CHUNK_SIZE unless @chunk_size.positive?

  # Overlap must stay below the chunk size (to keep advancing) and must not
  # be negative (which would jump past text that was never emitted).
  @chunk_overlap = (@chunk_overlap || DEFAULT_CHUNK_OVERLAP).to_i.clamp(0, @chunk_size - 1)

  text_length = @text.length
  return [@text] if text_length <= @chunk_size

  chunks = []
  start_pos = 0

  while start_pos < text_length
    end_pos = start_pos + @chunk_size

    # Last window: take everything that remains.
    if end_pos >= text_length
      chunks << @text[start_pos..].strip
      break
    end

    # Try to find a good breaking point (sentence, paragraph, or word boundary).
    # The returned position is used as an offset relative to start_pos.
    window = @text[start_pos...end_pos]
    break_pos = find_break_position(window, @text, start_pos, end_pos)

    actual_end_pos = start_pos + break_pos
    chunk_content = @text[start_pos...actual_end_pos].strip
    chunks << chunk_content unless chunk_content.empty?

    # Step back by the overlap, but always advance by at least one character
    # so the loop terminates even when the break point is very early.
    start_pos = [actual_end_pos - @chunk_overlap, start_pos + 1].max
  end

  chunks.reject(&:empty?)
end