Class: DSPy::Teleprompt::GEPA::FitnessEvaluator

Inherits:
Object
  • Object
show all
Extended by:
T::Sig
Defined in:
lib/dspy/teleprompt/gepa.rb

Overview

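FitnessEvaluator provides multi-dimensional evaluation of prompt candidates: it averages a primary metric over the training examples and combines it with secondary scores (token efficiency, response consistency, latency) into a single overall score.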

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(primary_metric:, config:, secondary_metrics: nil) ⇒ FitnessEvaluator

Returns a new instance of FitnessEvaluator.



1653
1654
1655
1656
1657
1658
# File 'lib/dspy/teleprompt/gepa.rb', line 1653

# Builds an evaluator around a primary metric and a configuration object.
#
# @param primary_metric [#call] metric invoked as call(example, prediction)
# @param config [Object] stored as-is and exposed through #config
# @param secondary_metrics [Object, nil] when nil (or false) the result of
#   default_secondary_metrics is used instead
def initialize(primary_metric:, config:, secondary_metrics: nil)
  @primary_metric, @config = primary_metric, config

  @secondary_metrics = secondary_metrics
  @secondary_metrics ||= default_secondary_metrics

  @trace_collector = TraceCollector.new
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



1641
1642
1643
# File 'lib/dspy/teleprompt/gepa.rb', line 1641

# Reader for the configuration object stored by the constructor.
#
# @return [Object] the current value of @config
def config = @config

#primary_metric ⇒ Object (readonly)

Returns the value of attribute primary_metric.



1638
1639
1640
# File 'lib/dspy/teleprompt/gepa.rb', line 1638

# Reader for the primary metric stored by the constructor.
#
# @return [Object] the current value of @primary_metric
def primary_metric = @primary_metric

#secondary_metrics ⇒ Object (readonly)

Returns the value of attribute secondary_metrics.



1644
1645
1646
# File 'lib/dspy/teleprompt/gepa.rb', line 1644

# Reader for the secondary metrics stored by the constructor.
#
# @return [Object] the current value of @secondary_metrics
def secondary_metrics = @secondary_metrics

Instance Method Details

#batch_evaluate(programs, trainset) ⇒ Object



1736
1737
1738
# File 'lib/dspy/teleprompt/gepa.rb', line 1736

# Evaluates several candidate programs against the same training set.
#
# @param programs [Enumerable] candidates, each passed to #evaluate_candidate
# @param trainset [Object] forwarded unchanged to every #evaluate_candidate call
# @return [Array] one #evaluate_candidate result per program, in input order
def batch_evaluate(programs, trainset)
  programs.map do |candidate|
    evaluate_candidate(candidate, trainset)
  end
end

#compare_candidates(score1, score2) ⇒ Object



1742
1743
1744
# File 'lib/dspy/teleprompt/gepa.rb', line 1742

# Compares two fitness scores by their overall score.
#
# @param score1 [#overall_score] score of the first candidate
# @param score2 [#overall_score] score of the second candidate
# @return [Numeric] score1's overall score minus score2's: positive when
#   score1 is higher, negative when score2 is higher, zero when equal
def compare_candidates(score1, score2)
  first_overall = score1.overall_score
  second_overall = score2.overall_score
  first_overall - second_overall
end

#evaluate_candidate(program, trainset) ⇒ Object



1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
# File 'lib/dspy/teleprompt/gepa.rb', line 1662

# Runs +program+ over every example in +trainset+ and folds the primary
# metric plus secondary scores into a FitnessScore.
#
# Exactly one record is kept per example. If either the program call or the
# primary metric raises a StandardError, the example scores 0.0 and the
# record carries the exception message under :error; whatever prediction and
# latency were captured before the failure stay on that same record.
#
# An empty trainset yields a primary score of 0.0 instead of dividing by
# zero. The secondary-score helpers are still called with empty collections
# in that case; whether they tolerate that is up to them.
#
# @param program [#call] candidate, invoked as program.call(**example.input_values)
# @param trainset [#map, #size] examples, each responding to #input_values
# @return [FitnessScore] built with primary_score, secondary_scores
#   (:token_efficiency, :consistency, :latency), overall_score and metadata
#   (:evaluation_time in seconds, :examples_count, :errors_count)
def evaluate_candidate(program, trainset)
  # Monotonic clock: elapsed-time measurements must not jump when the wall
  # clock is adjusted.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  predictions = []

  # Collect primary metric scores and execution data
  primary_scores = trainset.map do |example|
    # Register the record up front so a failure at any later step annotates
    # this record instead of appending a second one for the same example.
    record = { prediction: nil, latency: 0.0, example: example }
    predictions << record

    begin
      call_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      record[:prediction] = program.call(**example.input_values)
      record[:latency] = Process.clock_gettime(Process::CLOCK_MONOTONIC) - call_started_at

      @primary_metric.call(example, record[:prediction]).to_f
    rescue StandardError => e
      record[:error] = e.message
      0.0
    end
  end

  primary_score = primary_scores.empty? ? 0.0 : primary_scores.sum / primary_scores.size

  # Calculate secondary metrics
  secondary_scores = {}

  # Token efficiency (mock data for now - will be replaced with real trace collection)
  mock_traces = predictions.map { OpenStruct.new(token_usage: 50 + rand(100)) }
  secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)

  # Response consistency - use first output field for any signature
  response_texts = predictions.map do |record|
    output = record[:prediction]
    next '' unless output && output.class.respond_to?(:props)

    first_field = output.class.props.keys.first
    first_field ? output.send(first_field).to_s : ''
  end
  secondary_scores[:consistency] = calculate_consistency(response_texts)

  # Latency performance
  latencies = predictions.map { |record| record[:latency] }
  secondary_scores[:latency] = calculate_latency_score(latencies)

  # Calculate weighted overall score
  overall_score = calculate_overall_score(primary_score, secondary_scores)

  FitnessScore.new(
    primary_score: primary_score,
    secondary_scores: secondary_scores,
    overall_score: overall_score,
    metadata: {
      evaluation_time: Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at,
      examples_count: trainset.size,
      errors_count: predictions.count { |record| record[:error] }
    }
  )
end

#rank_candidates(scores) ⇒ Object



1748
1749
1750
# File 'lib/dspy/teleprompt/gepa.rb', line 1748

# Ranks candidates from best to worst overall score.
#
# Ties are broken by original position so the ranking is deterministic:
# Enumerable#sort_by alone does not guarantee a stable order for equal keys.
#
# @param scores [Enumerable<#overall_score>] fitness scores, one per candidate
# @return [Array<Integer>] indices into +scores+, highest overall score first
def rank_candidates(scores)
  scores
    .each_with_index
    .sort_by { |score, index| [-score.overall_score, index] }
    .map(&:last)
end