Class: LangsmithrbRails::Evaluation::Evaluator

Inherits: Object
Defined in:
lib/langsmithrb_rails/evaluation/evaluator.rb

Overview

Base evaluator class

Direct Known Subclasses

LLMEvaluator, StringEvaluator

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(client: nil, project_name: nil, tags: []) ⇒ Evaluator

Initialize a new evaluator



13
14
15
16
17
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 13

# Build a new evaluator.
#
# @param client [LangsmithrbRails::Client, nil] API client to use; when nil,
#   a default client is constructed
# @param project_name [String, nil] project the evaluations are recorded under
# @param tags [Array<String>] tags attached to evaluation runs
def initialize(client: nil, project_name: nil, tags: [])
  @project_name = project_name
  @tags = tags
  # Fall back to a freshly constructed default client when none is injected.
  @client = client || LangsmithrbRails::Client.new
end

Instance Attribute Details

#clientObject (readonly)

Returns the value of attribute client.



7
8
9
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 7

# @return [LangsmithrbRails::Client] the API client used for LangSmith requests
def client
  @client
end

#project_nameObject (readonly)

Returns the value of attribute project_name.



7
8
9
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 7

# @return [String, nil] project name the evaluations belong to, when set
def project_name
  @project_name
end

#tagsObject (readonly)

Returns the value of attribute tags.



7
8
9
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 7

# @return [Array<String>] tags attached to this evaluator's runs
def tags
  @tags
end

Instance Method Details

#evaluate(prediction, reference = nil, input = nil) ⇒ Hash

Evaluate a prediction against a reference

Raises:

  • (NotImplementedError)


24
25
26
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 24

# Evaluate a prediction against a reference answer.
#
# Abstract hook: subclasses (LLMEvaluator, StringEvaluator) must override
# this with a concrete scoring implementation.
#
# @param prediction [Object] the model output to score
# @param reference [Object, nil] expected answer, when available
# @param input [Object, nil] original input that produced the prediction
# @return [Hash] expected to carry :score and :metadata keys (callers
#   evaluate_run / evaluate_dataset read both)
# @raise [NotImplementedError] always, on this base class
def evaluate(prediction, reference = nil, input = nil)
  raise NotImplementedError, "Subclasses must implement evaluate method"
end

#evaluate_dataset(dataset_id, experiment_name, target_llm = nil) ⇒ Hash

Evaluate a dataset



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 77

# Run this evaluator over every example in a dataset.
#
# @param dataset_id [String] identifier of the dataset to evaluate
# @param experiment_name [String] label recorded on the result set
# @param target_llm [Object, nil] optional model used to generate fresh
#   predictions; when nil each example's stored outputs act as the prediction
# @return [Hash] report with :experiment_name, :dataset_id, :evaluator_name
#   and a :results array of per-example {example_id, score, metadata} entries
# @raise [RuntimeError] when the dataset examples cannot be fetched
def evaluate_dataset(dataset_id, experiment_name, target_llm = nil)
  response = client.list_examples(dataset_id)

  # Anything outside the 2xx range is treated as a fetch failure.
  unless (200...300).cover?(response[:status])
    raise "Failed to get dataset examples: #{response[:error] || response[:body]}"
  end

  report = {
    experiment_name: experiment_name,
    dataset_id: dataset_id,
    evaluator_name: self.class.name,
    results: []
  }

  response[:body].each do |example|
    # With a target model, generate a fresh prediction from the example's
    # inputs; otherwise score the example's recorded outputs directly.
    prediction =
      if target_llm
        generate_prediction(target_llm, example["inputs"])
      else
        example["outputs"]
      end

    outcome = evaluate(prediction, example["outputs"], example["inputs"])

    report[:results] << {
      example_id: example["id"],
      score: outcome[:score],
      metadata: outcome[:metadata]
    }
  end

  report
end

#evaluate_run(run_id, reference = nil) ⇒ Hash

Evaluate a run



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 32

# Evaluate a single run and attach the outcome to it as feedback.
#
# @param run_id [String] identifier of the run to evaluate
# @param reference [Object, nil] optional reference answer to compare against
# @return [Hash] the evaluation result, as produced by #evaluate
# @raise [RuntimeError] when the run cannot be fetched
def evaluate_run(run_id, reference = nil)
  response = client.get_run(run_id)

  # Anything outside the 2xx range is treated as a fetch failure.
  unless (200...300).cover?(response[:status])
    raise "Failed to get run: #{response[:error] || response[:body]}"
  end

  run = response[:body]
  prediction = extract_prediction_from_run(run)

  # Score the run's outputs against the reference, then record the
  # outcome on the run itself before returning it.
  result = evaluate(prediction, reference, run["inputs"])
  create_feedback(run_id, result)
  result
end

#evaluate_runs(run_ids, references = {}) ⇒ Hash<String, Hash>

Evaluate multiple runs



61
62
63
64
65
66
67
68
69
70
# File 'lib/langsmithrb_rails/evaluation/evaluator.rb', line 61

# Evaluate several runs in one pass.
#
# @param run_ids [Array<String>] identifiers of the runs to evaluate
# @param references [Hash{String => Object}] optional per-run reference
#   answers, keyed by run id (missing keys yield a nil reference)
# @return [Hash{String => Hash}] evaluation result per run id
def evaluate_runs(run_ids, references = {})
  run_ids.each_with_object({}) do |run_id, outcomes|
    outcomes[run_id] = evaluate_run(run_id, references[run_id])
  end
end