Class: EvalRuby::Comparison

Inherits:
Object
  • Object
show all
Defined in:
lib/eval_ruby/comparison.rb

Overview

Statistical comparison of two evaluation reports using paired t-tests.

Examples:

comparison = EvalRuby.compare(report_a, report_b)
puts comparison.summary
comparison.significant_improvements # => [:faithfulness]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(report_a, report_b) ⇒ Comparison

Returns a new instance of Comparison.

Parameters:

  • report_a (Report)

    baseline

  • report_b (Report)

    comparison



19
20
21
22
# File 'lib/eval_ruby/comparison.rb', line 19

# Stores the two reports that the comparison methods read from.
#
# @param report_a [Report] baseline
# @param report_b [Report] comparison
def initialize(report_a, report_b)
  @report_a, @report_b = report_a, report_b
end

Instance Attribute Details

#report_a ⇒ Report (readonly)

Returns baseline report.

Returns:

  • (Report)

    baseline report



12
13
14
# File 'lib/eval_ruby/comparison.rb', line 12

# Reader for the report stored in @report_a.
#
# @return [Report] baseline report
def report_a
  @report_a
end

#report_b ⇒ Report (readonly)

Returns comparison report.

Returns:

  • (Report)

    comparison report



15
16
17
# File 'lib/eval_ruby/comparison.rb', line 15

# Reader for the report stored in @report_b.
#
# @return [Report] comparison report
def report_b
  @report_b
end

Instance Method Details

#significant_improvements(alpha: 0.05) ⇒ Array<Symbol>

Returns metrics where report_b is significantly better than report_a.

Parameters:

  • alpha (Float) (defaults to: 0.05)

    significance level (default 0.05)

Returns:

  • (Array<Symbol>)

    metric names with significant improvements



55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/eval_ruby/comparison.rb', line 55

# Selects the metrics for which report_b scores significantly higher than
# report_a.
#
# For every metric from +all_metrics+, the non-nil per-result scores of both
# reports are collected. A metric is kept when the p-value returned by
# +paired_t_test+ is below +alpha+ and the mean of report_b's scores exceeds
# the mean of report_a's scores. Metrics with no scores in either report are
# never selected.
#
# @param alpha [Float] significance level (default 0.05)
# @return [Array<Symbol>] metric names with significant improvements
def significant_improvements(alpha: 0.05)
  average = ->(values) { values.sum / values.size.to_f }

  all_metrics.select do |name|
    baseline = @report_a.results.filter_map { |res| res.scores[name] }
    candidate = @report_b.results.filter_map { |res| res.scores[name] }

    if baseline.empty? || candidate.empty?
      # Nothing to compare on at least one side.
      false
    else
      verdict = paired_t_test(baseline, candidate)
      candidate_mean = average.call(candidate)
      baseline_mean = average.call(baseline)
      verdict[:p_value] < alpha && candidate_mean > baseline_mean
    end
  end
end

#summary ⇒ String

Returns formatted comparison table with deltas and p-values.

Returns:

  • (String)

    formatted comparison table with deltas and p-values



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/eval_ruby/comparison.rb', line 25

# Renders a plain-text comparison table.
#
# The table starts with a header line and a 70-character dashed rule, then
# has one row per metric from +all_metrics+ that has stats in both reports.
# Each row shows the two means, their difference (report_b minus report_a),
# the p-value from +paired_t_test+ and the marker returned by
# +significance_marker+. Metrics missing stats in either report are skipped.
#
# @return [String] formatted comparison table with deltas and p-values
def summary
  header = format("%-20s | %-10s | %-10s | %-8s | %s", "Metric", "A", "B", "Delta", "p-value")
  rule = "-" * 70

  rows = all_metrics.filter_map do |name|
    baseline = @report_a.metric_stats[name]
    candidate = @report_b.metric_stats[name]
    next if !baseline || !candidate

    change = candidate[:mean] - baseline[:mean]
    before = @report_a.results.filter_map { |res| res.scores[name] }
    after = @report_b.results.filter_map { |res| res.scores[name] }
    verdict = paired_t_test(before, after)
    marker = significance_marker(verdict[:p_value])

    format(
      "%-20s | %-10.4f | %-10.4f | %+.4f  | %.4f %s",
      name, baseline[:mean], candidate[:mean], change, verdict[:p_value], marker
    )
  end

  [header, rule, *rows].join("\n")
end