Class: Ragdoll::TextChunker

Inherits:
Object
  • Object
show all
Defined in:
app/services/ragdoll/text_chunker.rb

Constant Summary collapse

DEFAULT_CHUNK_SIZE =
1000
DEFAULT_CHUNK_OVERLAP =
200

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP) ⇒ TextChunker

Returns a new instance of TextChunker.



12
13
14
15
16
# File 'app/services/ragdoll/text_chunker.rb', line 12

# Builds a chunker around the given text and chunking settings.
#
# @param text [#to_s] content to split; stored as its +to_s+ form (so +nil+ becomes "")
# @param chunk_size [Integer] stored as given; defaults to DEFAULT_CHUNK_SIZE
# @param chunk_overlap [Integer] stored as given; defaults to DEFAULT_CHUNK_OVERLAP
def initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
  # Settings are kept raw here; no validation or coercion happens in the constructor.
  @chunk_size = chunk_size
  @text = text.to_s
  @chunk_overlap = chunk_overlap
end

Class Method Details

.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP) ⇒ Object



8
9
10
# File 'app/services/ragdoll/text_chunker.rb', line 8

# Convenience entry point: builds a chunker for +text+ and returns the
# result of its +chunk+ instance method.
#
# @param text [#to_s] content to split
# @param chunk_size [Integer] forwarded to the constructor
# @param chunk_overlap [Integer] forwarded to the constructor
# @return [Object] whatever the instance-level +chunk+ returns
def self.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
  chunker = new(text, chunk_size: chunk_size, chunk_overlap: chunk_overlap)
  chunker.chunk
end

Instance Method Details

#chunk ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'app/services/ragdoll/text_chunker.rb', line 18

# Splits @text into overlapping chunks of at most roughly @chunk_size characters.
#
# Settings are normalized first (and written back to the instance variables):
# * nil values fall back to the defaults and everything is coerced with +to_i+;
# * a zero or negative chunk size falls back to DEFAULT_CHUNK_SIZE, because an
#   empty window can never cover any text and would drop the whole input;
# * the overlap is clamped to 0..(chunk_size - 1): a negative overlap would skip
#   characters between chunks, and an overlap >= chunk_size would never advance.
#
# @return [Array<String>] an empty array for empty text; the original text as a
#   single (unstripped) element when it fits in one chunk; otherwise the
#   stripped, non-empty chunks in order of appearance
def chunk
  return [] if @text.empty?

  @chunk_size = (@chunk_size || DEFAULT_CHUNK_SIZE).to_i
  @chunk_overlap = (@chunk_overlap || DEFAULT_CHUNK_OVERLAP).to_i

  @chunk_size = DEFAULT_CHUNK_SIZE unless @chunk_size.positive?
  @chunk_overlap = @chunk_overlap.clamp(0, @chunk_size - 1)

  return [@text] if @text.length <= @chunk_size

  chunks = []
  start_pos = 0

  while start_pos < @text.length
    end_pos = start_pos + @chunk_size

    # The window reaches the end of the text: take everything that remains.
    if end_pos >= @text.length
      chunks << @text[start_pos..].strip
      break
    end

    # find_break_position is defined elsewhere in the class; its result is used
    # here as an offset relative to start_pos at which to cut the window.
    chunk_text = @text[start_pos...end_pos]
    break_pos = find_break_position(chunk_text, @text, start_pos, end_pos)

    actual_end_pos = start_pos + break_pos
    chunks << @text[start_pos...actual_end_pos].strip

    # Step back by the overlap, but always advance by at least one character so
    # the loop terminates even when the break point is within the overlap.
    start_pos = [actual_end_pos - @chunk_overlap, start_pos + 1].max
  end

  chunks.reject(&:empty?)
end