Class: Aidp::Execute::PromptEvaluator

Inherits: Object

Defined in: lib/aidp/execute/prompt_evaluator.rb

Overview

Evaluates prompt effectiveness using ZFC after multiple iterations

FIX for issue #391: When the work loop reaches 10+ iterations without completion, this evaluator assesses prompt quality and suggests improvements.

Uses Zero Framework Cognition (ZFC) to analyze:

  • Whether the prompt clearly defines completion criteria

  • If task breakdown instructions are adequate

  • Whether the agent has sufficient context

  • If there are blockers preventing progress

Examples:

evaluator = PromptEvaluator.new(config)
result = evaluator.evaluate(
  prompt_content: prompt_manager.read,
  iteration_count: 12,
  task_summary: persistent_tasklist.summary,
  recent_failures: all_results
)
# => { effective: false, issues: [...], suggestions: [...] }

Constant Summary

EVALUATION_ITERATION_THRESHOLD = 10
  Threshold for triggering evaluation.

EVALUATION_INTERVAL = 5
  Re-evaluate periodically after the threshold is reached.

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(config, ai_decision_engine: nil) ⇒ PromptEvaluator



# File 'lib/aidp/execute/prompt_evaluator.rb', line 38

def initialize(config, ai_decision_engine: nil)
  @config = config
  @ai_decision_engine = ai_decision_engine || safely_build_ai_decision_engine
end
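
For example, construction might look like this (stub_engine is an illustrative test double, not part of this class):

# The engine is built from config when supported; otherwise it stays nil
# and #evaluate returns a neutral, skipped result.
evaluator = PromptEvaluator.new(config)

# Injecting an engine (e.g. a test double) bypasses safely_build_ai_decision_engine.
evaluator = PromptEvaluator.new(config, ai_decision_engine: stub_engine)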

Instance Attribute Details

#ai_decision_engine ⇒ Object (readonly)

Expose for testability



# File 'lib/aidp/execute/prompt_evaluator.rb', line 36

def ai_decision_engine
  @ai_decision_engine
end
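
Because the reader is exposed for testability, a spec can assert on an injected engine directly (RSpec syntax assumed here):

expect(evaluator.ai_decision_engine).to eq(stub_engine)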

Instance Method Details

#evaluate(prompt_content:, iteration_count:, task_summary:, recent_failures:, step_name: nil) ⇒ Hash

Evaluate prompt effectiveness



# File 'lib/aidp/execute/prompt_evaluator.rb', line 73

def evaluate(prompt_content:, iteration_count:, task_summary:, recent_failures:, step_name: nil)
  Aidp.log_debug("prompt_evaluator", "starting_evaluation",
    iteration: iteration_count,
    step: step_name,
    prompt_size: prompt_content&.length || 0)

  # When AI decision engine is unavailable (e.g., in tests with mock configs),
  # return a neutral result that doesn't trigger feedback appending
  unless @ai_decision_engine
    Aidp.log_debug("prompt_evaluator", "skipping_evaluation_no_ai_engine")
    return {
      effective: true,  # Assume effective to avoid unnecessary feedback
      issues: [],
      suggestions: [],
      likely_blockers: [],
      recommended_actions: [],
      confidence: 0.0,
      skipped: true,
      skip_reason: "AI decision engine not available"
    }
  end

  prompt = build_evaluation_prompt(
    prompt_content: prompt_content,
    iteration_count: iteration_count,
    task_summary: task_summary,
    recent_failures: recent_failures
  )

  schema = {
    type: "object",
    properties: {
      effective: {
        type: "boolean",
        description: "True if the prompt is likely to lead to completion within a few more iterations"
      },
      issues: {
        type: "array",
        items: {type: "string"},
        description: "Specific problems identified with the current prompt"
      },
      suggestions: {
        type: "array",
        items: {type: "string"},
        description: "Actionable suggestions to improve prompt effectiveness"
      },
      likely_blockers: {
        type: "array",
        items: {type: "string"},
        description: "Potential blockers preventing progress"
      },
      recommended_actions: {
        type: "array",
        items: {
          type: "object",
          properties: {
            action: {type: "string"},
            priority: {type: "string", enum: ["high", "medium", "low"]},
            rationale: {type: "string"}
          }
        },
        description: "Specific actions to take, prioritized"
      },
      confidence: {
        type: "number",
        minimum: 0.0,
        maximum: 1.0,
        description: "Confidence in this assessment"
      }
    },
    required: ["effective", "issues", "suggestions", "confidence"]
  }

  begin
    result = @ai_decision_engine.decide(
      :prompt_evaluation,
      context: {prompt: prompt},
      schema: schema,
      tier: :mini,
      cache_ttl: nil  # Each evaluation is context-specific
    )

    Aidp.log_info("prompt_evaluator", "evaluation_complete",
      iteration: iteration_count,
      effective: result[:effective],
      issue_count: result[:issues]&.size || 0,
      confidence: result[:confidence])

    result
  rescue => e
    Aidp.log_error("prompt_evaluator", "evaluation_failed",
      error: e.message,
      error_class: e.class.name)

    build_fallback_result("Evaluation failed: #{e.message}")
  end
end
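
A sketch of one possible call site inside a work loop, reusing the names from the Overview example; append_feedback is a hypothetical helper on the prompt manager, not part of this class:

if evaluator.should_evaluate?(iteration_count)
  result = evaluator.evaluate(
    prompt_content: prompt_manager.read,
    iteration_count: iteration_count,
    task_summary: persistent_tasklist.summary,
    recent_failures: all_results
  )

  unless result[:skipped] || result[:effective]
    # Feed issues and suggestions back into the prompt for the next iteration.
    prompt_manager.append_feedback(
      issues: result[:issues],
      suggestions: result[:suggestions]
    )
  end
end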

#generate_template_improvements(evaluation_result:, original_template:) ⇒ Hash

Generate improvement recommendations for the prompt template. Used for the AGD pattern: generating improved templates based on evaluation results.



# File 'lib/aidp/execute/prompt_evaluator.rb', line 176

def generate_template_improvements(evaluation_result:, original_template:)
  return nil unless @ai_decision_engine

  Aidp.log_debug("prompt_evaluator", "generating_template_improvements",
    issue_count: evaluation_result[:issues]&.size || 0)

  prompt = build_improvement_prompt(evaluation_result, original_template)

  schema = {
    type: "object",
    properties: {
      improved_sections: {
        type: "array",
        items: {
          type: "object",
          properties: {
            section_name: {type: "string"},
            original: {type: "string"},
            improved: {type: "string"},
            rationale: {type: "string"}
          }
        }
      },
      additional_sections: {
        type: "array",
        items: {
          type: "object",
          properties: {
            section_name: {type: "string"},
            content: {type: "string"},
            rationale: {type: "string"}
          }
        }
      },
      completion_criteria_improvements: {
        type: "array",
        items: {type: "string"},
        description: "Specific improvements to completion criteria definitions"
      }
    },
    required: ["improved_sections", "completion_criteria_improvements"]
  }

  @ai_decision_engine.decide(
    :template_improvement,
    context: {prompt: prompt},
    schema: schema,
    tier: :standard,  # Use standard tier for more thoughtful improvements
    cache_ttl: nil
  )
rescue => e
  Aidp.log_error("prompt_evaluator", "template_improvement_failed",
    error: e.message)
  nil
end
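
A minimal usage sketch, assuming result comes from #evaluate and the template path is illustrative; the method returns nil when the engine is unavailable or the AI call fails:

improvements = evaluator.generate_template_improvements(
  evaluation_result: result,
  original_template: File.read("templates/work_loop.md")
)

if improvements
  improvements[:improved_sections].each do |section|
    puts "#{section[:section_name]}: #{section[:rationale]}"
  end
end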

#safely_build_ai_decision_engine ⇒ Object

Safely build AIDecisionEngine, returning nil if the config doesn't support it. This allows tests with mock configs to work without AI calls.



# File 'lib/aidp/execute/prompt_evaluator.rb', line 45

def safely_build_ai_decision_engine
  # Check if config supports the methods AIDecisionEngine needs
  return nil unless @config.respond_to?(:default_provider)

  build_default_ai_decision_engine
rescue => e
  Aidp.log_debug("prompt_evaluator", "skipping_ai_decision_engine",
    reason: e.message)
  nil
end

#should_evaluate?(iteration_count) ⇒ Boolean

Check if evaluation should be triggered based on iteration count



# File 'lib/aidp/execute/prompt_evaluator.rb', line 59

def should_evaluate?(iteration_count)
  return false unless iteration_count >= EVALUATION_ITERATION_THRESHOLD

  # Evaluate at threshold and every EVALUATION_INTERVAL after
  (iteration_count - EVALUATION_ITERATION_THRESHOLD) % EVALUATION_INTERVAL == 0
end
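
With the defaults above (threshold 10, interval 5), evaluation fires at iterations 10, 15, 20, and so on:

evaluator = PromptEvaluator.new(config)
(1..20).select { |i| evaluator.should_evaluate?(i) }
# => [10, 15, 20]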