Class: Evals::PromptEvaluator

Inherits:

Object

Object
Evals::PromptEvaluator

show all

Defined in: lib/evals/prompt_evaluator.rb

Instance Method Summary collapse

Constructor Details

#initialize(max_concurrent_tasks: 3) ⇒ `PromptEvaluator`

Returns a new instance of PromptEvaluator.

# File 'lib/evals/prompt_evaluator.rb', line 7

# Builds an evaluator with an Anthropic API client and a fixed model name.
#
# The API key is read from the ANTHROPIC_API_KEY environment variable
# (nil is passed through to the client if the variable is unset).
#
# @param max_concurrent_tasks [Integer] stored in @max_concurrent_tasks
def initialize(max_concurrent_tasks: 3)
  api_key = ENV["ANTHROPIC_API_KEY"]

  @max_concurrent_tasks = max_concurrent_tasks
  @client = Anthropic::Client.new(api_key: api_key)
  @model = "claude-3-5-haiku-latest"
end

Instance Method Details

#add_assistant_message(messages, text) ⇒ `Object`



30
31
32

# File 'lib/evals/prompt_evaluator.rb', line 30

# Appends an assistant-role entry to a conversation.
#
# @param messages [#<<] conversation being built; it is mutated in place
# @param text [Object] content stored under :content
# @return [Object] whatever `messages << entry` returns (the array itself for an Array)
def add_assistant_message(messages, text)
  entry = {role: "assistant", content: text}
  messages << entry
end

#add_user_message(messages, text) ⇒ `Object`



26
27
28

# File 'lib/evals/prompt_evaluator.rb', line 26

# Appends a user-role entry to a conversation.
#
# @param messages [#<<] conversation being built; it is mutated in place
# @param text [Object] content stored under :content
# @return [Object] whatever `messages << entry` returns (the array itself for an Array)
def add_user_message(messages, text)
  entry = {role: "user", content: text}
  messages << entry
end

#chat(messages, system: nil, temperature: 1.0, stop_sequences: []) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 34

# Sends a conversation to the client and returns the text of the first
# content block of the response.
#
# @param messages [Array<Hash>] conversation entries passed through as :messages
# @param system [String, nil] system prompt; the :system key is only sent when truthy
# @param temperature [Float] passed through as :temperature
# @param stop_sequences [Array<String>] passed through as :stop_sequences
# @return [Object] `text` of the first element of the response's `content`
def chat(messages, system: nil, temperature: 1.0, stop_sequences: [])
  request = {
    model: @model,
    max_tokens: 1000,
    messages: messages,
    temperature: temperature,
    stop_sequences: stop_sequences
  }
  # Only include :system when one was supplied.
  request = request.merge(system: system) if system

  reply = @client.messages.create(request)
  first_block = reply.content[0]
  first_block.text
end

#generate_dataset(task_description, prompt_inputs_spec: {}, num_cases: 1, output_file: "dataset.json") ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 195

# Generates a dataset of test cases and writes it to disk as pretty-printed JSON.
#
# Ideas come from #generate_unique_ideas; each idea is expanded with
# #generate_test_case on its own thread. Threads are started in batches of
# at most @max_concurrent_tasks (minimum 1) so the number of simultaneous
# requests stays bounded; a batch is fully collected before the next starts.
# A test case whose generation raises is reported with `puts` and skipped.
#
# @param task_description [String] forwarded to the idea/test-case generators
# @param prompt_inputs_spec [Hash] forwarded to the idea/test-case generators
# @param num_cases [Integer] number of ideas requested
# @param output_file [String] path the JSON dataset is written to
# @return [Array] the successfully generated test cases, in idea order
def generate_dataset(task_description, prompt_inputs_spec: {}, num_cases: 1, output_file: "dataset.json")
  ideas = generate_unique_ideas(task_description, prompt_inputs_spec, num_cases)

  dataset = []
  completed = 0
  total = ideas.length
  last_reported_percentage = 0
  # each_slice rejects 0, so fall back to one task at a time when the limit is unset or invalid.
  batch_size = [@max_concurrent_tasks.to_i, 1].max

  ideas.each_slice(batch_size) do |batch|
    threads = batch.map do |idea|
      Thread.new do
        generate_test_case(task_description, idea, prompt_inputs_spec)
      end
    end

    threads.each do |thread|
      # Thread#value joins the thread and re-raises anything it raised.
      result = thread.value
      completed += 1
      current_percentage = ((completed.to_f / total) * 100).to_i
      # Progress is only printed when a new 20% step is reached.
      milestone_percentage = (current_percentage / 20) * 20

      if milestone_percentage > last_reported_percentage
        puts "Generated #{completed}/#{total} test cases"
        last_reported_percentage = milestone_percentage
      end

      dataset << result
    rescue => e
      puts "Error generating test case: #{e}"
    end
  end

  File.write(output_file, JSON.pretty_generate(dataset))
  dataset
end

#generate_test_case(task_description, idea, prompt_inputs_spec = {}) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 117

# Asks the model for one test case tailored to a specific idea and returns
# it as a parsed Hash, tagged with the task description and the idea.
#
# The assistant turn is prefilled with "```json" and generation stops at the
# next "```", so the reply is expected to be bare JSON.
#
# NOTE(review): the heredoc already interpolates its values with #{...};
# #render is then applied to the interpolated text — confirm that the
# second pass is intended.
#
# @param task_description [String] task the prompt under test should accomplish
# @param idea [String] scenario the test case should be built around
# @param prompt_inputs_spec [Hash] input key => description; values must respond to `gsub`
# @return [Hash] parsed JSON plus "task_description" and "scenario" keys
# @raise [JSON::ParserError] if the reply is not valid JSON
def generate_test_case(task_description, idea, prompt_inputs_spec = {})
  example_prompt_inputs = ""
  prompt_inputs_spec.each do |key, value|
    # Keep each description on one line by escaping embedded newlines.
    val = value.gsub("\n", "\\n")
    example_prompt_inputs += "\"#{key}\": \"EXAMPLE_VALUE\", // #{val}\n"
  end

  allowed_keys = prompt_inputs_spec.keys.map { |key| "\"#{key}\"" }.join(", ")

  prompt = <<~TEXT
    Generate a single detailed test case for a prompt evaluation based on:

    <task_description>
    #{task_description}
    </task_description>

    <specific_idea>
    #{idea}
    </specific_idea>

    <allowed_input_keys>
    #{allowed_keys}
    </allowed_input_keys>

    Output Format:
    ```json
    {
        "prompt_inputs": {
        #{example_prompt_inputs}
        },
        "solution_criteria": ["criterion 1", "criterion 2", ...] // Concise list of criteria for evaluating the solution, 1 to 4 items
    }
    ```

    IMPORTANT REQUIREMENTS:
    - You MUST ONLY use these exact input keys in your prompt_inputs: #{allowed_keys}
    - Do NOT add any additional keys to prompt_inputs
    - All keys listed in allowed_input_keys must be included in your response
    - Make the test case realistic and practically useful
    - Include measurable, concise solution criteria
    - The solution criteria should ONLY address the direct requirements of the task description and the generated prompt_inputs
    - Avoid over-specifying criteria with requirements that go beyond the core task
    - Keep solution criteria simple, focused, and directly tied to the fundamental task
    - The test case should be tailored to the specific idea provided
    - Quick to solve without requiring extensive computation or multi-step processing
    - Solvable with no more than 400 tokens of output
    - DO NOT include any fields beyond those specified in the output format
  TEXT

  system_prompt = "You are a test case creator specializing in designing evaluation scenarios."

  rendered_prompt = render(
    prompt.strip,
    {
      "allowed_keys" => allowed_keys,
      "task_description" => task_description,
      "idea" => idea,
      "example_prompt_inputs" => example_prompt_inputs
    }
  )

  messages = []
  add_user_message(messages, rendered_prompt)
  add_assistant_message(messages, "```json")
  text = chat(
    messages,
    stop_sequences: ["```"],
    system: system_prompt,
    temperature: 0.7
  )

  test_case = JSON.parse(text)
  test_case["task_description"] = task_description
  test_case["scenario"] = idea

  test_case
end

#generate_unique_ideas(task_description, prompt_inputs_spec, num_cases) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 49

# Asks the model for a list of distinct test-scenario ideas and returns the
# parsed JSON reply.
#
# The assistant turn is prefilled with "```json" and generation stops at the
# next "```", so the reply is expected to be a bare JSON array.
#
# NOTE(review): the heredoc interpolates `prompt_inputs_spec` directly, and
# the `example_prompt_inputs` string is only handed to #render under the
# "prompt_inputs" key — the template text itself contains no `{prompt_inputs}`
# placeholder, so that string looks unused. Confirm the intent.
#
# @param task_description [String] task the prompt under test should accomplish
# @param prompt_inputs_spec [Hash] input key => description; values must respond to `gsub`
# @param num_cases [Integer] number of ideas requested from the model
# @return [Object] the parsed JSON reply (an Array of idea strings is requested)
# @raise [JSON::ParserError] if the reply is not valid JSON
def generate_unique_ideas(task_description, prompt_inputs_spec, num_cases)
  prompt = <<~TEXT
    Generate #{num_cases} unique, diverse ideas for testing a prompt that accomplishes this task:

    <task_description>
    #{task_description}
    </task_description>

    The prompt will receive the following inputs
    <prompt_inputs>
    #{prompt_inputs_spec}
    </prompt_inputs>

    Each idea should represent a distinct scenario or example that tests different aspects of the task.

    Output Format:
    Provide your response as a structured JSON array where each item is a brief description of the idea.

    Example:
    ```json
    [
        "Testing with technical computer science terminology",
        "Testing with medical research findings",
        "Testing with complex mathematical concepts",
        ...
    ]
    ```

    Ensure each idea is:
    - Clearly distinct from the others
    - Relevant to the task description
    - Specific enough to guide generation of a full test case
    - Quick to solve without requiring extensive computation or multi-step processing
    - Solvable with no more than 400 tokens of output

    Remember, only generate #{num_cases} unique ideas
  TEXT

  system_prompt = "You are a test scenario designer specialized in creating diverse, unique testing scenarios."

  example_prompt_inputs = ""
  prompt_inputs_spec.each do |key, value|
    # Keep each description on one line by escaping embedded newlines.
    val = value.gsub("\n", "\\n")
    example_prompt_inputs += "\"#{key}\": str # #{val},"
  end

  rendered_prompt = render(
    prompt.strip,
    {
      "task_description" => task_description,
      "num_cases" => num_cases,
      "prompt_inputs" => example_prompt_inputs
    }
  )

  messages = []
  add_user_message(messages, rendered_prompt)
  add_assistant_message(messages, "```json")
  text = chat(
    messages,
    stop_sequences: ["```"],
    system: system_prompt,
    temperature: 1.0
  )

  JSON.parse(text)
end

#grade_output(test_case, output, extra_criteria) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 229

# Asks the model to grade a solution against a test case's criteria and
# returns the parsed JSON evaluation.
#
# The assistant turn is prefilled with "```json" and generation stops at the
# next "```", so the reply is expected to be bare JSON. Grading runs at
# temperature 0.0.
#
# NOTE(review): both heredocs already interpolate their values with #{...};
# #render is then applied to the interpolated text — confirm that the
# second pass is intended.
#
# @param test_case [Hash] must provide "prompt_inputs" (Hash whose values respond
#   to `gsub`), "task_description" and "solution_criteria" (responds to `join`)
# @param output [Object] the solution text to grade
# @param extra_criteria [String, nil] when truthy, added as a mandatory-requirements section
# @return [Object] parsed JSON reply (fields "strengths", "weaknesses",
#   "reasoning" and "score" are requested in the prompt)
# @raise [JSON::ParserError] if the reply is not valid JSON
def grade_output(test_case, output, extra_criteria)
  prompt_inputs = ""
  test_case["prompt_inputs"].each do |key, value|
    # Keep each input on one line by escaping embedded newlines.
    val = value.gsub("\n", "\\n")
    prompt_inputs += "\"#{key}\":\"#{val}\",\n"
  end

  extra_criteria_section = ""
  if extra_criteria
    extra_criteria_template = <<~TEXT
      Mandatory Requirements - ANY VIOLATION MEANS AUTOMATIC FAILURE (score of 3 or lower):
      <extra_important_criteria>
      #{extra_criteria}
      </extra_important_criteria>
    TEXT
    extra_criteria_section = render(
      extra_criteria_template.strip,
      {"extra_criteria" => extra_criteria}
    )
  end

  eval_template = <<~TEXT
    Your task is to evaluate the following AI-generated solution with EXTREME RIGOR.

    Original task description:
    <task_description>
    #{test_case["task_description"]}
    </task_description>

    Original task inputs:
    <task_inputs>
    { #{prompt_inputs} }
    </task_inputs>

    Solution to Evaluate:
    <solution>
    #{output}
    </solution>

    Criteria you should use to evaluate the solution:
    <criteria>
    #{test_case["solution_criteria"].join("\n")}
    </criteria>

    #{extra_criteria_section}

    Scoring Guidelines:
    * Score 1-3: Solution fails to meet one or more MANDATORY requirements
    * Score 4-6: Solution meets all mandatory requirements but has significant deficiencies in secondary criteria
    * Score 7-8: Solution meets all mandatory requirements and most secondary criteria, with minor issues
    * Score 9-10: Solution meets all mandatory and secondary criteria

    IMPORTANT SCORING INSTRUCTIONS:
    * Grade the output based ONLY on the listed criteria. Do not add your own extra requirements.
    * If a solution meets all of the mandatory and secondary criteria give it a 10
    * Don't complain that the solution "only" meets the mandatory and secondary criteria. Solutions shouldn't go above and beyond - they should meet the exact listed criteria.
    * ANY violation of a mandatory requirement MUST result in a score of 3 or lower
    * The full 1-10 scale should be utilized - don't hesitate to give low scores when warranted

    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your overall assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
  TEXT

  eval_prompt = render(
    eval_template.strip,
    {
      "task_description" => test_case["task_description"],
      "prompt_inputs" => prompt_inputs,
      "output" => output,
      "solution_criteria" => test_case["solution_criteria"].join("\n"),
      "extra_criteria_section" => extra_criteria_section
    }
  )

  messages = []
  add_user_message(messages, eval_prompt)
  add_assistant_message(messages, "```json")
  eval_text = chat(
    messages,
    stop_sequences: ["```"],
    temperature: 0.0
  )

  JSON.parse(eval_text)
end

#render(template_string, variables) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 13

# Fills `{name}` placeholders in a template, in the style of Python's
# `str.format`.
#
# The template is processed in a single left-to-right pass:
# * `{{` and `}}` become literal `{` and `}`;
# * `{name}` is replaced by `variables[name].to_s` when `variables` has that key;
# * any other `{...}` text is left untouched.
#
# Substituted values are inserted verbatim: they are not re-scanned for
# placeholders, their braces are not unescaped, and backslash sequences in
# them are not interpreted.
#
# @param template_string [String] template text
# @param variables [Hash{String => Object}] placeholder name => value
# @return [String] the rendered text
def render(template_string, variables)
  # The block form of gsub inserts the block's result as-is, unlike a
  # replacement string, which would expand sequences such as "\\0" or "\\\\".
  template_string.gsub(/\{\{|\}\}|\{([^{}]+)\}/) do |token|
    case token
    when "{{" then "{"
    when "}}" then "}"
    else
      name = Regexp.last_match(1)
      variables.key?(name) ? variables[name].to_s : token
    end
  end
end

#run_evaluation(run_prompt_function, dataset_file, extra_criteria: nil, json_output_file: "output.json", html_output_file: "output.html") ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 336

# Runs every test case of a dataset through the prompt under test, grades
# the outputs, and writes a JSON results file and an HTML report.
#
# Each test case is handled by #run_test_case on its own thread. Threads are
# started in batches of at most @max_concurrent_tasks (minimum 1) so the
# number of simultaneous requests stays bounded; a batch is fully collected
# before the next starts. An exception raised while running a test case
# propagates out of this method.
#
# @param run_prompt_function [#call] forwarded to #run_test_case
# @param dataset_file [String] path of a JSON file containing the test cases
# @param extra_criteria [String, nil] forwarded to #run_test_case
# @param json_output_file [String] path the results are written to as pretty JSON
# @param html_output_file [String] path the report from
#   `generate_prompt_evaluation_report` is written to
# @return [Array] one result per test case, in dataset order
def run_evaluation(run_prompt_function, dataset_file, extra_criteria: nil, json_output_file: "output.json", html_output_file: "output.html")
  dataset = JSON.parse(File.read(dataset_file))

  results = []
  completed = 0
  total = dataset.length
  last_reported_percentage = 0
  # each_slice rejects 0, so fall back to one task at a time when the limit is unset or invalid.
  batch_size = [@max_concurrent_tasks.to_i, 1].max

  dataset.each_slice(batch_size) do |batch|
    threads = batch.map do |test_case|
      Thread.new do
        run_test_case(test_case, run_prompt_function, extra_criteria)
      end
    end

    threads.each do |thread|
      # Thread#value joins the thread and re-raises anything it raised.
      result = thread.value
      completed += 1
      current_percentage = ((completed.to_f / total) * 100).to_i
      # Progress is only printed when a new 20% step is reached.
      milestone_percentage = (current_percentage / 20) * 20

      if milestone_percentage > last_reported_percentage
        puts "Graded #{completed}/#{total} test cases"
        last_reported_percentage = milestone_percentage
      end

      results << result
    end
  end

  average_score = results.sum { |result| result["score"] } / results.length.to_f
  puts "Average score: #{average_score}"

  File.write(json_output_file, JSON.pretty_generate(results))

  html = generate_prompt_evaluation_report(results)
  File.write(html_output_file, html)

  results
end

#run_test_case(test_case, run_prompt_function, extra_criteria = nil) ⇒ `Object`

# File 'lib/evals/prompt_evaluator.rb', line 321

# Runs the prompt under test for one test case and grades what it produced.
#
# @param test_case [Hash] its "prompt_inputs" value is handed to the prompt function
# @param run_prompt_function [#call] called with the test case's prompt inputs
# @param extra_criteria [String, nil] forwarded to #grade_output
# @return [Hash] "output", "test_case", and the "score" and "reasoning"
#   taken from the grade returned by #grade_output
def run_test_case(test_case, run_prompt_function, extra_criteria = nil)
  produced = run_prompt_function.call(test_case["prompt_inputs"])
  grade = grade_output(test_case, produced, extra_criteria)

  {
    "output" => produced,
    "test_case" => test_case,
    "score" => grade["score"],
    "reasoning" => grade["reasoning"]
  }
end

Class: Evals::PromptEvaluator

Instance Method Summary collapse

Constructor Details

#initialize(max_concurrent_tasks: 3) ⇒ PromptEvaluator

Instance Method Details

#add_assistant_message(messages, text) ⇒ Object

#add_user_message(messages, text) ⇒ Object

#chat(messages, system: nil, temperature: 1.0, stop_sequences: []) ⇒ Object

#generate_dataset(task_description, prompt_inputs_spec: {}, num_cases: 1, output_file: "dataset.json") ⇒ Object

#generate_test_case(task_description, idea, prompt_inputs_spec = {}) ⇒ Object

#generate_unique_ideas(task_description, prompt_inputs_spec, num_cases) ⇒ Object

#grade_output(test_case, output, extra_criteria) ⇒ Object

#render(template_string, variables) ⇒ Object

#run_evaluation(run_prompt_function, dataset_file, extra_criteria: nil, json_output_file: "output.json", html_output_file: "output.html") ⇒ Object

#run_test_case(test_case, run_prompt_function, extra_criteria = nil) ⇒ Object

#initialize(max_concurrent_tasks: 3) ⇒ `PromptEvaluator`

#add_assistant_message(messages, text) ⇒ `Object`

#add_user_message(messages, text) ⇒ `Object`

#chat(messages, system: nil, temperature: 1.0, stop_sequences: []) ⇒ `Object`

#generate_dataset(task_description, prompt_inputs_spec: {}, num_cases: 1, output_file: "dataset.json") ⇒ `Object`

#generate_test_case(task_description, idea, prompt_inputs_spec = {}) ⇒ `Object`

#generate_unique_ideas(task_description, prompt_inputs_spec, num_cases) ⇒ `Object`

#grade_output(test_case, output, extra_criteria) ⇒ `Object`

#render(template_string, variables) ⇒ `Object`

#run_evaluation(run_prompt_function, dataset_file, extra_criteria: nil, json_output_file: "output.json", html_output_file: "output.html") ⇒ `Object`

#run_test_case(test_case, run_prompt_function, extra_criteria = nil) ⇒ `Object`