Class: DSPy::Teleprompt::ModuleEvaluator

Inherits:
Object
Extended by:
T::Sig
Defined in:
lib/dspy/teleprompt/gepa.rb

Overview

Module Evaluator - Evaluates DSPy modules with metrics and feedback

Instance Method Summary collapse

Constructor Details

#initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil) ⇒ ModuleEvaluator

Returns a new instance of ModuleEvaluator.



3192
3193
3194
3195
3196
3197
3198
# File 'lib/dspy/teleprompt/gepa.rb', line 3192

# Builds an evaluator around a student module and a scoring metric.
#
# @param student [Object] the DSPy module under evaluation
# @param metric [#call] scoring callable invoked per example; may accept
#   (example, prediction) or (example, prediction, traces)
# @param feedback_map [Hash] optional per-component feedback configuration
# @param custom_instruction_proposer [#call, nil] optional callable used in
#   place of the built-in instruction-proposal logic
def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
  @student = student
  @metric = metric
  @feedback_map = feedback_map
  @custom_instruction_proposer = custom_instruction_proposer
  # Collects execution traces passively via event subscriptions.
  @trace_collector = GEPA::TraceCollector.new
end

Instance Method Details

#build_program(candidate_instruction) ⇒ Object



3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
# File 'lib/dspy/teleprompt/gepa.rb', line 3202

# Builds a program variant whose signature description is replaced by
# +candidate_instruction+.
#
# NOTE(review): the fresh instance appears to share its signature class with
# the original student, so assigning +description=+ likely mutates
# class-level state visible to other instances — confirm against
# DSPy::Signature before relying on isolation between candidates.
#
# @param candidate_instruction [String] instruction text to install
# @return [Object] a fresh module instance carrying the new instruction, or
#   the original student when it exposes no writable signature description
def build_program(candidate_instruction)
  rewritable =
    @student.respond_to?(:signature_class) &&
    @student.signature_class.respond_to?(:description=)

  # Fallback: non-standard modules are returned untouched.
  return @student unless rewritable

  variant = @student.class.new
  variant.signature_class.description = candidate_instruction
  variant
end

#evaluate_batch(batch, candidate_instruction, capture_traces: true) ⇒ Object



3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
# File 'lib/dspy/teleprompt/gepa.rb', line 3224

# Runs the candidate instruction against every example in +batch+ and scores
# each prediction with the configured metric.
#
# @param batch [Array] examples exposing +input_values+
# @param candidate_instruction [String] instruction installed on the program
# @param capture_traces [Boolean] whether collected traces are handed to the metric
# @return [Array<ScoreWithFeedback>] one entry per example; failed
#   evaluations yield a zero score with the error message as feedback
def evaluate_batch(batch, candidate_instruction, capture_traces: true)
  program = build_program(candidate_instruction)
  results = []

  batch.each do |example|
    begin
      # Execute the program on this example's inputs.
      prediction =
        if program.respond_to?(:call)
          program.call(**example.input_values)
        elsif program.respond_to?(:forward)
          program.forward(**example.input_values)
        else
          raise "Program must respond to :call or :forward"
        end

      # TraceCollector gathers traces passively via event subscriptions;
      # here we only decide whether to expose them to the metric.
      traces = capture_traces ? @trace_collector.traces : []

      # Prefer the 3-arg form (example, prediction, traces) used by
      # GEPAFeedbackMetric; fall back to the standard 2-arg metric.
      begin
        score_result =
          if @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)
            @metric.call(example, prediction, traces)
          else
            @metric.call(example, prediction)
          end
      rescue ArgumentError => arg_error
        # Arity introspection can mispredict (wrapped/curried callables),
        # so retry with the 2-arg form on a genuine arity mismatch only.
        raise arg_error unless arg_error.message.include?('wrong number of arguments')
        score_result = @metric.call(example, prediction)
      end

      # Normalize: callers always receive ScoreWithFeedback objects.
      results << if score_result.is_a?(ScoreWithFeedback)
                   score_result
                 else
                   ScoreWithFeedback.new(
                     score: score_result.to_f,
                     prediction: prediction,
                     feedback: nil
                   )
                 end
    rescue => e
      DSPy.logger.error("Evaluation error: #{e.message}")
      # Bug fix: previously a bare 0.0 Float was pushed here, breaking the
      # "always ScoreWithFeedback" invariant the success path guarantees.
      results << ScoreWithFeedback.new(
        score: 0.0,
        prediction: nil,
        feedback: "Evaluation error: #{e.message}"
      )
    end
  end

  results
end

#make_reflective_dataset(examples, predictions, scores, threshold: 0.5) ⇒ Object



3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
# File 'lib/dspy/teleprompt/gepa.rb', line 3293

# Collects failing examples (score below +threshold+) into a reflective
# dataset of input/expected/prediction/score/feedback records.
#
# @param examples [Array] examples exposing +input_values+ / +expected_values+
# @param predictions [Array] predictions aligned with +examples+
# @param scores [Array] floats or ScoreWithFeedback objects aligned with +examples+
# @param threshold [Float] scores at or above this value are skipped
# @return [Array<Hash>] one string-keyed record per failing example
def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
  examples.zip(predictions, scores).each_with_object([]) do |(example, prediction, score), dataset|
    # ScoreWithFeedback carries its numeric value in #score.
    value = score.is_a?(ScoreWithFeedback) ? score.score : score

    # Only below-threshold (failed) predictions are reflective material.
    next if value >= threshold

    # Prefer metric-provided feedback; otherwise synthesize a generic note.
    feedback =
      if score.is_a?(ScoreWithFeedback) && score.feedback
        score.feedback
      else
        "Low performance (score: #{value.round(2)})"
      end

    dataset << {
      'input' => example.input_values,
      'expected' => example.expected_values,
      'prediction' => extract_prediction_values(prediction),
      'score' => value,
      'feedback' => feedback
    }
  end
end

#propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction']) ⇒ Object



3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
# File 'lib/dspy/teleprompt/gepa.rb', line 3331

# Proposes replacement instruction texts based on the reflective dataset.
#
# @param current_instruction [String] the instruction being improved
# @param reflective_dataset [Array<Hash>] failure records from
#   #make_reflective_dataset
# @param components_to_update [Array<String>] component names to rewrite;
#   defaults to the instruction component
# @return [Array<String>] candidate instruction texts (may be empty)
def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
  # Without a custom proposer, defer to the built-in failure analysis.
  unless @custom_instruction_proposer
    return analyze_failures_and_propose(current_instruction, reflective_dataset)
  end

  # A custom proposer may return nil; compact keeps the result well-formed.
  proposal = @custom_instruction_proposer.call(current_instruction, reflective_dataset)
  [proposal].compact
end