Class: DSPy::Teleprompt::GEPA::FitnessEvaluator

Inherits:
Object
  • Object
show all
Extended by:
T::Sig
Defined in:
lib/dspy/teleprompt/gepa.rb

Overview

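FitnessEvaluator provides multi-dimensional evaluation of prompt candidates: it scores each candidate program on a primary metric plus secondary metrics (token efficiency, response consistency, and latency) and combines them into a single overall score.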

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(primary_metric:, config:, secondary_metrics: nil) ⇒ FitnessEvaluator

Returns a new instance of FitnessEvaluator.



1393
1394
1395
1396
1397
1398
# File 'lib/dspy/teleprompt/gepa.rb', line 1393

# Builds an evaluator around a primary metric and a configuration object.
#
# @param primary_metric [#call] callable invoked as +call(example, prediction)+
#   during candidate evaluation
# @param config [Object] configuration stored as-is and exposed via +config+
# @param secondary_metrics [Object, nil] optional secondary metrics; when
#   omitted (or otherwise falsy) the result of +default_secondary_metrics+ is used
def initialize(primary_metric:, config:, secondary_metrics: nil)
  @primary_metric = primary_metric
  @config = config

  # Resolve the fallback only after the primary metric and config are stored,
  # matching the original assignment order.
  resolved_metrics = secondary_metrics || default_secondary_metrics
  @secondary_metrics = resolved_metrics

  @trace_collector = TraceCollector.new
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



1381
1382
1383
# File 'lib/dspy/teleprompt/gepa.rb', line 1381

# Read-only accessor for the configuration object that was passed to the
# constructor as +config:+.
#
# @return [Object] the value stored in +@config+
def config
  @config
end

#primary_metric ⇒ Object (readonly)

Returns the value of attribute primary_metric.



1378
1379
1380
# File 'lib/dspy/teleprompt/gepa.rb', line 1378

# Read-only accessor for the primary metric that was passed to the
# constructor as +primary_metric:+. +evaluate_candidate+ invokes it as
# +call(example, prediction)+ and converts the result with +to_f+.
#
# @return [Object] the value stored in +@primary_metric+
def primary_metric
  @primary_metric
end

#secondary_metrics ⇒ Object (readonly)

Returns the value of attribute secondary_metrics.



1384
1385
1386
# File 'lib/dspy/teleprompt/gepa.rb', line 1384

# Read-only accessor for the secondary metrics: either the value passed to the
# constructor as +secondary_metrics:+ or, when that was nil/falsy, whatever
# +default_secondary_metrics+ returned at construction time.
#
# @return [Object] the value stored in +@secondary_metrics+
def secondary_metrics
  @secondary_metrics
end

Instance Method Details

#batch_evaluate(programs, trainset) ⇒ Object



1476
1477
1478
# File 'lib/dspy/teleprompt/gepa.rb', line 1476

# Evaluates several candidate programs against the same training set.
#
# @param programs [Enumerable] candidate programs, each handed to
#   +evaluate_candidate+ in iteration order
# @param trainset [Object] training examples forwarded unchanged to
#   +evaluate_candidate+
# @return [Array] one +evaluate_candidate+ result per program, in input order
def batch_evaluate(programs, trainset)
  programs.map do |candidate|
    evaluate_candidate(candidate, trainset)
  end
end

#compare_candidates(score1, score2) ⇒ Object



1482
1483
1484
# File 'lib/dspy/teleprompt/gepa.rb', line 1482

# Compares two fitness scores by their overall score.
#
# @param score1 [#overall_score] left-hand score
# @param score2 [#overall_score] right-hand score
# @return [Numeric] the signed difference +score1 - score2+: positive when
#   +score1+ is better, negative when +score2+ is better, zero on a tie
def compare_candidates(score1, score2)
  left_overall = score1.overall_score
  right_overall = score2.overall_score
  left_overall - right_overall
end

#evaluate_candidate(program, trainset) ⇒ Object



1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
# File 'lib/dspy/teleprompt/gepa.rb', line 1402

# Runs +program+ on every example in +trainset+ and condenses the results into
# a FitnessScore made of the mean primary-metric score, a set of secondary
# scores (token efficiency, response consistency, latency) and a weighted
# overall score.
#
# Failures are tolerated per example: if either the program call or the
# primary metric raises a StandardError, that example scores 0.0 and the error
# message is stored on its record. Each example always contributes exactly one
# record, so the secondary metrics and +errors_count+ are never skewed by
# duplicate entries.
#
# @param program [#call] candidate invoked as +call(**example.input_values)+
# @param trainset [Array] examples responding to +input_values+; may be empty,
#   in which case the primary score is 0.0
# @return [FitnessScore] scores plus metadata (+evaluation_time+ in seconds,
#   +examples_count+, +errors_count+)
def evaluate_candidate(program, trainset)
  # Monotonic clock: durations must not be affected by wall-clock adjustments.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  predictions = []

  # Collect primary metric scores and execution data
  primary_scores = trainset.map do |example|
    # Register the record before any fallible call so an exception raised by
    # the metric (after a successful prediction) cannot produce a second entry.
    record = { prediction: nil, latency: 0.0, example: example }
    predictions << record

    begin
      call_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      record[:prediction] = program.call(**example.input_values)
      record[:latency] = Process.clock_gettime(Process::CLOCK_MONOTONIC) - call_started_at

      @primary_metric.call(example, record[:prediction]).to_f
    rescue StandardError => e
      # Best-effort evaluation: a failing example counts as a zero score.
      record[:error] = e.message
      0.0
    end
  end

  # Guard the mean against an empty training set (Integer 0 / 0 would raise).
  primary_score = primary_scores.empty? ? 0.0 : primary_scores.sum / primary_scores.size

  # Calculate secondary metrics
  secondary_scores = {}

  # Token efficiency (mock data for now - will be replaced with real trace collection).
  # NOTE: the mock usage is random, so this component is not reproducible between runs.
  mock_traces = predictions.map { OpenStruct.new(token_usage: 50 + rand(100)) }
  secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)

  # Response consistency - compare the first declared prop of each prediction;
  # predictions that are missing or expose no +props+ contribute an empty string.
  response_texts = predictions.map do |entry|
    prediction = entry[:prediction]
    next '' unless prediction && prediction.class.respond_to?(:props)

    first_field = prediction.class.props.keys.first
    first_field ? prediction.send(first_field).to_s : ''
  end
  secondary_scores[:consistency] = calculate_consistency(response_texts)

  # Latency performance
  latencies = predictions.map { |entry| entry[:latency] }
  secondary_scores[:latency] = calculate_latency_score(latencies)

  # Calculate weighted overall score
  overall_score = calculate_overall_score(primary_score, secondary_scores)

  FitnessScore.new(
    primary_score: primary_score,
    secondary_scores: secondary_scores,
    overall_score: overall_score,
    metadata: {
      evaluation_time: Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at,
      examples_count: trainset.size,
      errors_count: predictions.count { |entry| entry[:error] }
    }
  )
end

#rank_candidates(scores) ⇒ Object



1488
1489
1490
# File 'lib/dspy/teleprompt/gepa.rb', line 1488

# Ranks candidates from best to worst by overall score.
#
# +sort_by+ is not guaranteed to be stable, so the original position is used
# as a secondary sort key: candidates with equal overall scores keep their
# input order, which makes the ranking deterministic.
#
# @param scores [Enumerable<#overall_score>] fitness scores, one per candidate
# @return [Array<Integer>] indices into +scores+, highest overall score first
def rank_candidates(scores)
  scores.each_with_index
        .sort_by { |score, index| [-score.overall_score, index] }
        .map(&:last)
end