Class: DSPy::Teleprompt::ModuleEvaluator

Inherits:
Object
  • Object
show all
Extended by:
T::Sig
Defined in:
lib/dspy/teleprompt/gepa.rb

Overview

Module Evaluator - Evaluates DSPy modules with metrics and feedback

Instance Method Summary collapse

Constructor Details

#initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil) ⇒ ModuleEvaluator

Returns a new instance of ModuleEvaluator.



2749
2750
2751
2752
2753
2754
2755
# File 'lib/dspy/teleprompt/gepa.rb', line 2749

# Builds an evaluator around a student module and a scoring metric.
#
# @param student [Object] the module to evaluate; stored as-is
# @param metric [#call] callable used to score predictions
# @param feedback_map [Hash] stored as-is (not read in this constructor)
# @param custom_instruction_proposer [#call, nil] optional proposer; when nil
#   the built-in proposal logic is used by #propose_new_texts
def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
  # Collaborators supplied by the caller.
  @student, @metric = student, metric
  @feedback_map = feedback_map
  @custom_instruction_proposer = custom_instruction_proposer

  # Each evaluator owns a fresh trace collector.
  @trace_collector = GEPA::TraceCollector.new
end

Instance Method Details

#build_program(candidate_instruction) ⇒ Object



2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
# File 'lib/dspy/teleprompt/gepa.rb', line 2759

# Produces the program to run for a candidate instruction.
#
# When the student exposes a signature class with a writable description, a
# new instance of the student's class is created (no constructor arguments)
# and the candidate instruction is written to its signature description.
# Any other student is returned unchanged.
#
# @param candidate_instruction [String] instruction text to install
# @return [Object] a new student instance, or @student itself as a fallback
def build_program(candidate_instruction)
  describable = @student.respond_to?(:signature_class) &&
                @student.signature_class.respond_to?(:description=)

  # Non-standard modules cannot carry an instruction; hand back the original.
  return @student unless describable

  # NOTE(review): if signature_class returns an object shared between
  # instances, this assignment is also visible through @student — confirm.
  @student.class.new.tap do |fresh_student|
    fresh_student.signature_class.description = candidate_instruction
  end
end

#evaluate_batch(batch, candidate_instruction, capture_traces: true) ⇒ Object



2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
# File 'lib/dspy/teleprompt/gepa.rb', line 2781

# Runs the candidate program over every example in +batch+ and scores each
# prediction with the configured metric.
#
# The metric is called with (example, prediction, traces) when it reports an
# arity of 3 or a negative arity; if that attempt raises a "wrong number of
# arguments" ArgumentError it is retried once with (example, prediction).
# Metrics that do not qualify for the 3-argument form are called exactly
# once with two arguments.
#
# @param batch [Enumerable] examples; each must respond to +input_values+
#   with a Hash that is splatted into the program as keyword arguments
# @param candidate_instruction [String] instruction passed to #build_program
# @param capture_traces [Boolean] when true the collector's current traces
#   are offered to the metric, otherwise an empty array is
# @return [Array] one entry per example: a ScoreWithFeedback on success, or
#   the plain Float 0.0 when anything raised for that example (the error
#   message is logged through DSPy.logger)
def evaluate_batch(batch, candidate_instruction, capture_traces: true)
  program = build_program(candidate_instruction)

  batch.map do |example|
    begin
      # Execute program on example
      prediction = if program.respond_to?(:call)
                     program.call(**example.input_values)
                   elsif program.respond_to?(:forward)
                     program.forward(**example.input_values)
                   else
                     raise "Program must respond to :call or :forward"
                   end

      # The collector is read as-is; this method does not reset it between
      # examples.
      traces = capture_traces ? @trace_collector.traces : []

      metric_arity = @metric.respond_to?(:arity) ? @metric.arity : nil
      accepts_traces = !metric_arity.nil? && (metric_arity == 3 || metric_arity.negative?)

      score_result =
        if accepts_traces
          begin
            @metric.call(example, prediction, traces)
          rescue ArgumentError => arg_error
            # Only an arity mismatch justifies the 2-argument retry; any other
            # ArgumentError is a real failure and goes to the outer rescue.
            raise unless arg_error.message.include?('wrong number of arguments')

            @metric.call(example, prediction)
          end
        else
          # No retry here: repeating the same 2-argument call could only
          # duplicate the metric's side effects.
          @metric.call(example, prediction)
        end

      # Ensure successful evaluations always yield a ScoreWithFeedback
      if score_result.is_a?(ScoreWithFeedback)
        score_result
      else
        ScoreWithFeedback.new(
          score: score_result.to_f,
          prediction: prediction,
          feedback: nil
        )
      end
    rescue => e
      DSPy.logger.error("Evaluation error: #{e.message}")
      # Zero score on failure
      0.0
    end
  end
end

#make_reflective_dataset(examples, predictions, scores, threshold: 0.5) ⇒ Object



2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
# File 'lib/dspy/teleprompt/gepa.rb', line 2850

# Collects the under-performing examples into a list of plain hashes that
# can be used for reflection.
#
# Entries are built only for examples whose score is below +threshold+; a
# score greater than or equal to the threshold is skipped.
#
# @param examples [Array] objects responding to +input_values+ and
#   +expected_values+
# @param predictions [Array] predictions aligned with +examples+
# @param scores [Array] aligned scores, each a ScoreWithFeedback or a number
# @param threshold [Numeric] cut-off below which an example is included
# @return [Array<Hash>] hashes with the string keys 'input', 'expected',
#   'prediction', 'score' and 'feedback'
def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
  examples.zip(predictions, scores).each_with_object([]) do |(example, prediction, score), rows|
    rich_score = score.is_a?(ScoreWithFeedback)
    numeric_score = rich_score ? score.score : score

    # Only predictions that fell short of the threshold are kept.
    next if numeric_score >= threshold

    # Prefer the metric's own feedback; otherwise synthesize a generic note.
    feedback_text = (rich_score && score.feedback) ||
                    "Low performance (score: #{numeric_score.round(2)})"

    rows << {
      'input' => example.input_values,
      'expected' => example.expected_values,
      'prediction' => extract_prediction_values(prediction),
      'score' => numeric_score,
      'feedback' => feedback_text
    }
  end
end

#propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction']) ⇒ Object



2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
# File 'lib/dspy/teleprompt/gepa.rb', line 2888

# Proposes replacement instruction texts from a reflective dataset.
#
# @param current_instruction [String] the instruction being improved
# @param reflective_dataset [Array] data handed to the proposer
# @param components_to_update [Array<String>] accepted for interface
#   compatibility; not read by this method
# @return [Array] with a custom proposer: its result wrapped in an array, or
#   an empty array when it returned nil; otherwise whatever
#   analyze_failures_and_propose returns
def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
  proposer = @custom_instruction_proposer

  # No custom proposer configured: defer to the built-in proposal logic.
  return analyze_failures_and_propose(current_instruction, reflective_dataset) unless proposer

  suggestion = proposer.call(current_instruction, reflective_dataset)
  [suggestion].compact
end