Class: DSPy::Teleprompt::ModuleEvaluator

Inherits:

Object

Object
DSPy::Teleprompt::ModuleEvaluator

show all

Extended by: T::Sig

Defined in: lib/dspy/teleprompt/gepa.rb

Overview

ModuleEvaluator evaluates DSPy modules against a metric, producing per-example scores with optional feedback.

Instance Method Summary collapse

Constructor Details

#initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil) ⇒ `ModuleEvaluator`

Returns a new instance of ModuleEvaluator.

# File 'lib/dspy/teleprompt/gepa.rb', line 2749

# Stores the evaluation collaborators and creates a new trace collector.
#
# @param student the module to evaluate
# @param metric callable used to score a prediction against an example
# @param feedback_map stored as given; no method shown in this section reads it
# @param custom_instruction_proposer optional callable; when present it is used
#   instead of the built-in proposal logic
def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
  # Optional collaborators first, then the required ones.
  @custom_instruction_proposer = custom_instruction_proposer
  @feedback_map = feedback_map
  @metric = metric
  @student = student

  # One collector per evaluator instance.
  @trace_collector = GEPA::TraceCollector.new
end

Instance Method Details

#build_program(candidate_instruction) ⇒ `Object`

# File 'lib/dspy/teleprompt/gepa.rb', line 2759

# Builds the program to run for a candidate instruction.
#
# When the student exposes a +signature_class+ that accepts +description=+,
# a new instance of the student's class is created (with no constructor
# arguments) and the candidate instruction is assigned as the signature
# description. Otherwise the student itself is returned unchanged.
#
# NOTE(review): the description is assigned on whatever +signature_class+
# returns for the new instance; if that object is shared between instances
# the assignment would be visible to all of them — confirm.
#
# @param candidate_instruction the instruction text to install
# @return the freshly built instance, or +@student+ for non-standard modules
def build_program(candidate_instruction)
  supports_description = @student.respond_to?(:signature_class) &&
                         @student.signature_class.respond_to?(:description=)

  # Non-standard modules are evaluated as-is.
  return @student unless supports_description

  @student.class.new.tap do |fresh_student|
    fresh_student.signature_class.description = candidate_instruction
  end
end

#evaluate_batch(batch, candidate_instruction, capture_traces: true) ⇒ `Object`

# File 'lib/dspy/teleprompt/gepa.rb', line 2781

# Runs the candidate program on every example in +batch+ and scores each
# prediction with +@metric+.
#
# For each example the program is invoked via +call+ (preferred) or
# +forward+ with the example's +input_values+ splatted as keyword arguments.
# The metric is called with (example, prediction, traces) when it reports an
# arity of 3 or a negative (variable) arity, and with (example, prediction)
# otherwise. If the three-argument attempt raises an ArgumentError whose
# message contains 'wrong number of arguments', the metric is retried once
# with two arguments. A two-argument call is never retried.
#
# @param batch enumerable of examples responding to +input_values+
# @param candidate_instruction instruction text passed to +build_program+
# @param capture_traces [Boolean] when true, the collector's current traces
#   are handed to trace-aware metrics; when false an empty array is passed
# @return [Array] one entry per example: a ScoreWithFeedback on success, or
#   the bare Float 0.0 when running or scoring the example raised (the error
#   message is logged through DSPy.logger)
def evaluate_batch(batch, candidate_instruction, capture_traces: true)
  program = build_program(candidate_instruction)

  batch.map do |example|
    begin
      # Execute program on example
      prediction = if program.respond_to?(:call)
                     program.call(**example.input_values)
                   elsif program.respond_to?(:forward)
                     program.forward(**example.input_values)
                   else
                     raise "Program must respond to :call or :forward"
                   end

      # The collector is read as-is; it is not reset between examples here.
      traces = capture_traces ? @trace_collector.traces : []

      # Metrics that can take a third argument receive the traces
      # (e.g. GEPAFeedbackMetric); plain metrics get the standard two.
      accepts_traces = @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)

      score_result =
        if accepts_traces
          begin
            @metric.call(example, prediction, traces)
          rescue ArgumentError => arg_error
            # Only an arity mismatch justifies the two-argument retry;
            # anything else is a genuine metric failure.
            raise unless arg_error.message.include?('wrong number of arguments')

            @metric.call(example, prediction)
          end
        else
          # No retry here: repeating the identical call would only run the
          # metric (and its side effects) a second time.
          @metric.call(example, prediction)
        end

      # Wrap plain numeric scores so successful results share one type.
      if score_result.is_a?(ScoreWithFeedback)
        score_result
      else
        ScoreWithFeedback.new(
          score: score_result.to_f,
          prediction: prediction,
          feedback: nil
        )
      end
    rescue => e
      DSPy.logger.error("Evaluation error: #{e.message}")
      # Zero score on failure (kept as a bare Float for compatibility).
      0.0
    end
  end
end

#make_reflective_dataset(examples, predictions, scores, threshold: 0.5) ⇒ `Object`

# File 'lib/dspy/teleprompt/gepa.rb', line 2850

# Collects the below-threshold cases into a list of hashes for reflection.
#
# Examples, predictions and scores are matched positionally. A score may be a
# ScoreWithFeedback (its +score+ and, when present, +feedback+ are used) or a
# plain number (a generic "Low performance" message is generated).
#
# @param examples list of examples responding to +input_values+ and +expected_values+
# @param predictions list of predictions, passed through +extract_prediction_values+
# @param scores list of ScoreWithFeedback objects and/or plain numbers
# @param threshold scores at or above this value are left out
# @return [Array<Hash>] entries with the string keys 'input', 'expected',
#   'prediction', 'score' and 'feedback'
def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
  examples.zip(predictions, scores).each_with_object([]) do |(example, prediction, score), rows|
    wrapped = score.is_a?(ScoreWithFeedback)
    score_value = wrapped ? score.score : score

    # Only failures (strictly below the threshold) are kept.
    next if score_value >= threshold

    # Prefer metric-supplied feedback; otherwise describe the low score.
    feedback = (wrapped && score.feedback) || "Low performance (score: #{score_value.round(2)})"

    rows << {
      'input' => example.input_values,
      'expected' => example.expected_values,
      'prediction' => extract_prediction_values(prediction),
      'score' => score_value,
      'feedback' => feedback
    }
  end
end

#propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction']) ⇒ `Object`

# File 'lib/dspy/teleprompt/gepa.rb', line 2888

# Proposes replacement instruction text based on the reflective dataset.
#
# With a custom proposer configured, its result is returned wrapped in a
# one-element array (or an empty array when it returns nil). Without one,
# the built-in +analyze_failures_and_propose+ result is returned directly.
#
# @param current_instruction the instruction being improved
# @param reflective_dataset failure records handed to the proposer
# @param components_to_update accepted for interface compatibility; not used
#   by this method
# @return the proposals as described above
def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
  # Built-in proposal logic when no custom proposer was supplied.
  return analyze_failures_and_propose(current_instruction, reflective_dataset) unless @custom_instruction_proposer

  [@custom_instruction_proposer.call(current_instruction, reflective_dataset)].compact
end

Class: DSPy::Teleprompt::ModuleEvaluator

Overview

Instance Method Summary collapse

Constructor Details

#initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil) ⇒ ModuleEvaluator

Instance Method Details

#build_program(candidate_instruction) ⇒ Object

#evaluate_batch(batch, candidate_instruction, capture_traces: true) ⇒ Object

#make_reflective_dataset(examples, predictions, scores, threshold: 0.5) ⇒ Object

#propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction']) ⇒ Object

#initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil) ⇒ `ModuleEvaluator`

#build_program(candidate_instruction) ⇒ `Object`

#evaluate_batch(batch, candidate_instruction, capture_traces: true) ⇒ `Object`

#make_reflective_dataset(examples, predictions, scores, threshold: 0.5) ⇒ `Object`

#propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction']) ⇒ `Object`