Module: DSPy::Propose::DatasetSummaryGenerator

Extended by:
T::Sig
Defined in:
lib/dspy/propose/dataset_summary_generator.rb

Overview

Dataset Summary Generator for creating concise dataset descriptions. Used by GroundedProposer for data-aware instruction generation.

Defined Under Namespace

Classes: DatasetDescriptor, DatasetDescriptorWithPriorObservations, ObservationSummarizer

Class Method Summary collapse

Class Method Details

.create_dataset_summary(trainset, view_data_batch_size, prompt_model, verbose: false) ⇒ Object

Raises:

  • (ArgumentError)


112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/dspy/propose/dataset_summary_generator.rb', line 112

# Produces a short textual summary of +trainset+ by asking a language model to
# describe the first batch of examples, refine those observations with later
# batches, and finally condense everything into one summary.
#
# @param trainset [Array] examples to summarise; sliced by index range and
#   measured with +length+
# @param view_data_batch_size [Integer] number of examples shown per call
# @param prompt_model [Object, nil] language model to use; when nil the
#   globally configured +DSPy.lm+ is used instead
# @param verbose [Boolean] print progress and the final summary to stdout
# @return [String] the summary text after passing through +strip_prefix+
# @raise [ArgumentError] when neither +prompt_model+ nor +DSPy.lm+ is set
def self.create_dataset_summary(trainset, view_data_batch_size, prompt_model, verbose: false)
  puts "\nBootstrapping dataset summary (this will be used to generate instructions)..." if verbose

  # An explicit model wins; otherwise fall back to the global one.
  lm = prompt_model || DSPy.lm
  raise ArgumentError, "No language model configured. Set prompt_model or DSPy.lm" unless lm

  DSPy.with_lm(lm) do
    # Seed the observations from the leading batch of examples.
    seed_end = [trainset.length, view_data_batch_size].min
    seed_batch = trainset[0...seed_end]
    observations = DSPy::Predict
                   .new(DatasetDescriptor)
                   .call(examples: format_examples_for_prompt(seed_batch))
                   .observations

    complete_replies = 0
    max_calls = 10
    calls = 0

    begin
      (view_data_batch_size...trainset.length).step(view_data_batch_size) do |offset|
        # The counter is bumped before the check, so at most max_calls - 1
        # refinement requests are actually issued.
        calls += 1
        break if calls >= max_calls

        puts "Processing batch starting at index #{offset}" if verbose

        batch_end = [trainset.length, offset + view_data_batch_size].min
        refiner = DSPy::Predict.new(DatasetDescriptorWithPriorObservations)
        refinement = refiner.call(
          prior_observations: observations,
          examples: format_examples_for_prompt(trainset[offset...batch_end])
        )

        reply = refinement.observations
        says_complete = reply.length >= 8 && reply[0...8].upcase == "COMPLETE"

        unless says_complete
          # New observations are appended directly to the running text.
          observations += reply
          next
        end

        # The model reported nothing new; give up after five such replies.
        complete_replies += 1
        break if complete_replies >= 5
      end
    rescue => e
      # Refinement is best-effort: keep whatever was gathered so far.
      if verbose
        puts "Error during observation refinement: #{e.message}. Using observations from past round for summary."
      end
    end

    # Condense the accumulated observations into the final summary.
    summarizer = DSPy::Predict.new(ObservationSummarizer)
    cleaned_summary = strip_prefix(summarizer.call(observations: observations).summary)

    puts "\nGenerated summary: #{cleaned_summary}\n" if verbose

    cleaned_summary
  end
end

.format_examples_for_prompt(examples) ⇒ Object



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/dspy/propose/dataset_summary_generator.rb', line 179

# Converts a list of examples into plain, string-keyed structures suitable for
# embedding in a prompt.
#
# Each element is handled according to its type:
# * +DSPy::Example+ - signature name plus serialized input and expected values
# * +DSPy::FewShotExample+ - input, output, and reasoning when it is present
# * +Hash+ - keys stringified recursively
# * anything else - converted with +to_h+ when available, otherwise wrapped
#   under a "value" key
#
# @param examples [Array] the examples to format
# @return [Array<Hash>] one string-keyed hash per example
def self.format_examples_for_prompt(examples)
  examples.map do |item|
    case item
    when DSPy::Example
      signature = item.signature_class
      {
        # Anonymous classes have no name, so fall back to their to_s form.
        'signature' => signature.name || signature.to_s,
        'input' => stringify_keys(DSPy::TypeSerializer.serialize(item.input)),
        'expected' => stringify_keys(DSPy::TypeSerializer.serialize(item.expected))
      }
    when DSPy::FewShotExample
      payload = {
        'input' => stringify_keys(item.input),
        'output' => stringify_keys(item.output)
      }
      rationale = item.reasoning
      payload['reasoning'] = rationale if rationale
      payload
    when Hash
      stringify_keys(item)
    else
      hash_form = item.respond_to?(:to_h) ? item.to_h : { value: item }
      stringify_keys(hash_form)
    end
  end
end

.order_input_keys_in_string(unordered_repr) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/dspy/propose/dataset_summary_generator.rb', line 65

# Rewrites every "input_keys={...}" fragment in the given string so that the
# comma-separated keys inside the braces appear in sorted order, separated by
# ", ". Text outside those fragments is left untouched.
#
# @param unordered_repr [String] text that may contain input_keys fragments
# @return [String] a copy with each fragment's keys sorted
def self.order_input_keys_in_string(unordered_repr)
  unordered_repr.gsub(/input_keys=\{([^}]+)\}/) do
    # The first capture holds the raw key list between the braces.
    sorted_keys = Regexp.last_match(1).split(',').map(&:strip).sort
    "input_keys={#{sorted_keys.join(', ')}}"
  end
end

.stringify_keys(value) ⇒ Object



206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/dspy/propose/dataset_summary_generator.rb', line 206

# Recursively converts every hash key in +value+ to a String.
#
# Hashes are rebuilt with +to_s+ applied to each key, arrays are mapped
# element by element, and any other object is returned unchanged.
#
# @param value [Object] a hash, array, or arbitrary value
# @return [Object] the same structure with string keys throughout
def self.stringify_keys(value)
  return value.map { |element| stringify_keys(element) } if value.is_a?(Array)
  return value unless value.is_a?(Hash)

  value.to_h { |key, nested| [key.to_s, stringify_keys(nested)] }
end

.strip_prefix(text) ⇒ Object



81
82
83
84
85
86
# File 'lib/dspy/propose/dataset_summary_generator.rb', line 81

# Removes a leading label such as "Summary:" from +text+ and trims one
# surrounding quote character from each end of the result.
#
# The label is optional leading asterisks/whitespace followed by one to five
# words (letters, digits, underscores, apostrophes, hyphens) and a colon.
#
# Only the very start and very end of the whole text are touched. The patterns
# are anchored with \A and \z rather than ^ and $, because ^ and $ match at
# every line boundary in Ruby and would also strip labels and quotes from the
# interior lines of a multi-line summary.
#
# @param text [String] raw text, typically a model-generated summary
# @return [String] the text without the leading label or enclosing quotes
def self.strip_prefix(text)
  leading_label = /\A[\*\s]*(?:[\w'\-]+\s+){0,4}[\w'\-]+:\s*/
  without_label = text.sub(leading_label, '').strip
  without_label.gsub(/\A["']|["']\z/, '')
end