Class: Aidp::Harness::ErrorHandler

Inherits:
Object
Includes:
DebugMixin
Defined in:
lib/aidp/harness/error_handler.rb

Overview

Handles error recovery, retry strategies, and fallback mechanisms

Defined Under Namespace

Classes: BackoffCalculator, ErrorClassifier, RecoveryPlanner, Sleeper

Constant Summary

Constants included from DebugMixin

DebugMixin::DEBUG_BASIC, DebugMixin::DEBUG_OFF, DebugMixin::DEBUG_VERBOSE

Instance Method Summary

Methods included from DebugMixin

#debug_basic?, #debug_command, #debug_enabled?, #debug_error, #debug_execute_command, #debug_level, #debug_log, #debug_logger, #debug_provider, #debug_step, #debug_timing, #debug_verbose?, included, shared_logger

Constructor Details

#initialize(provider_manager, configuration, metrics_manager = nil, sleeper: nil) ⇒ ErrorHandler

Returns a new instance of ErrorHandler.

Parameters:

  • sleeper (#sleep) (defaults to: nil)

    object responding to sleep(seconds); injectable for tests



# File 'lib/aidp/harness/error_handler.rb', line 23

def initialize(provider_manager, configuration, metrics_manager = nil, sleeper: nil)
  @provider_manager = provider_manager
  @configuration = configuration
  @metrics_manager = metrics_manager
  @sleeper = sleeper || Sleeper.new
  @retry_strategies = {}
  @retry_counts = {}
  @error_history = []
  @circuit_breakers = {}
  @backoff_calculator = BackoffCalculator.new
  @error_classifier = ErrorClassifier.new
  @recovery_planner = RecoveryPlanner.new
  initialize_retry_strategies
end
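
For tests, the sleeper keyword lets you inject a stand-in so backoff delays do not slow the suite. The FakeSleeper class below is a hypothetical illustration; only the keyword argument itself is part of the constructor's API, and provider_manager / configuration stand for your existing harness objects.

# Hypothetical no-op sleeper: records requested delays instead of sleeping.
class FakeSleeper
  attr_reader :delays

  def initialize
    @delays = []
  end

  def sleep(seconds)
    @delays << seconds
  end
end

handler = Aidp::Harness::ErrorHandler.new(
  provider_manager,        # your ProviderManager instance
  configuration,           # your harness configuration
  nil,                     # metrics_manager is optional
  sleeper: FakeSleeper.new # no real sleeping during tests
)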

Instance Method Details

#attempt_recovery(error_info, context = {}) ⇒ Object

Attempt recovery when retries are exhausted or not applicable



# File 'lib/aidp/harness/error_handler.rb', line 225

def attempt_recovery(error_info, context = {})
  recovery_plan = @recovery_planner.create_recovery_plan(error_info, context)

  case recovery_plan[:action]
  when :switch_provider
    attempt_provider_switch(error_info, recovery_plan)
  when :switch_model
    attempt_model_switch(error_info, recovery_plan)
  when :circuit_breaker
    open_circuit_breaker(error_info, recovery_plan)
  when :escalate
    escalate_error(error_info, recovery_plan)
  when :abort
    abort_execution(error_info, recovery_plan)
  else
    {
      success: false,
      action: :unknown_recovery,
      error: "Unknown recovery action: #{recovery_plan[:action]}"
    }
  end
end
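
As a sketch, the method can be driven directly with an error_info hash shaped like the one built in #execute_with_retry (:error, :provider, :model, :error_type); the provider, model, and error-type values below are assumptions:

error_info = {
  error: RuntimeError.new("rate limited"),
  provider: "anthropic",   # assumed provider name
  model: "claude-sonnet",  # assumed model name
  error_type: :rate_limit  # assumed classification
}

result = handler.attempt_recovery(error_info)
# If the recovery plan names an unrecognized action, the fallthrough yields:
# { success: false, action: :unknown_recovery, error: "Unknown recovery action: ..." }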

#circuit_breaker_status ⇒ Object

Get circuit breaker status



# File 'lib/aidp/harness/error_handler.rb', line 321

def circuit_breaker_status
  @circuit_breakers.transform_values do |cb|
    {
      open: cb[:open],
      opened_at: cb[:opened_at],
      failure_count: cb[:failure_count],
      threshold: cb[:threshold]
    }
  end
end
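
The keys are the "provider:model" circuit-breaker identifiers (illustrative values below):

handler.circuit_breaker_status
# => { "anthropic:claude-sonnet" =>
#        { open: true, opened_at: <Time>, failure_count: 5, threshold: 5 } }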

#clear_error_history ⇒ Object

Clear error history



# File 'lib/aidp/harness/error_handler.rb', line 316

def clear_error_history
  @error_history.clear
end

#error_history(time_range = nil) ⇒ Object

Get error history, optionally filtered to a time range



# File 'lib/aidp/harness/error_handler.rb', line 307

def error_history(time_range = nil)
  if time_range
    @error_history.select { |e| time_range.include?(e[:timestamp]) }
  else
    @error_history
  end
end
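
Any object responding to include? over the stored :timestamp values can serve as the filter. A numeric epoch range sidesteps Range#include?'s iteration requirement; whether timestamps are stored as epoch numbers is an assumption here:

window = (Time.now.to_f - 3600)..Time.now.to_f # the last hour, as epoch floats
recent = handler.error_history(window)

all_errors = handler.error_history # no argument returns the full history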

#error_stats ⇒ Object

Get error statistics



# File 'lib/aidp/harness/error_handler.rb', line 39

def error_stats
  {
    total_errors: @error_history.size,
    error_types: @error_history.group_by { |e| e[:error_type] }.transform_values(&:size),
    recent_errors: @error_history.last(10),
    retry_counts: @retry_counts.dup,
    circuit_breaker_states: @circuit_breakers.transform_values { |cb| cb[:state] }
  }
end
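
The returned hash can be inspected directly, for example:

stats = handler.error_stats
puts "#{stats[:total_errors]} errors recorded"
stats[:error_types].each { |type, count| puts "  #{type}: #{count}" }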

#execute_retry(error_info, strategy, context = {}) ⇒ Object

Execute a retry with the given strategy



# File 'lib/aidp/harness/error_handler.rb', line 179

def execute_retry(error_info, strategy, context = {})
  provider = error_info[:provider]
  model = error_info[:model]
  error_type = error_info[:error_type]

  # Increment retry count
  retry_key = "#{provider}:#{model}:#{error_type}"
  @retry_counts[retry_key] ||= 0
  @retry_counts[retry_key] += 1

  # Check if we've exceeded max retries
  if @retry_counts[retry_key] > strategy[:max_retries]
    return {
      success: false,
      action: :exhausted_retries,
      error: "Max retries exceeded for #{error_type}",
      retry_count: @retry_counts[retry_key],
      next_action: :fallback
    }
  end

  # Calculate backoff delay
  delay = @backoff_calculator.calculate_delay(
    @retry_counts[retry_key],
    strategy[:backoff_strategy],
    strategy[:base_delay],
    strategy[:max_delay]
  )

  # Wait for backoff delay
  @sleeper.sleep(delay) if delay > 0

  # Execute the retry
  retry_result = execute_retry_attempt(error_info, strategy, context)

  # Update retry result with metadata
  retry_result.merge!(
    retry_count: @retry_counts[retry_key],
    delay: delay,
    strategy: strategy[:name]
  )

  retry_result
end
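
Counts are keyed by "provider:model:error_type", so different error types on the same model retry independently. A sketch of a direct call, assuming :rate_limit is a configured error type:

strategy = handler.retry_strategy(:rate_limit)
result = handler.execute_retry(error_info, strategy)
result[:retry_count] # attempts so far for this provider:model:error_type key
result[:delay]       # backoff applied before this attempt, in seconds
result[:strategy]    # name of the strategy that was used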

#execute_with_retry(&block) ⇒ Object

Execute a block with retry logic and provider fallback



# File 'lib/aidp/harness/error_handler.rb', line 101

def execute_with_retry(&block)
  providers_tried = []

  loop do
    max_attempts = @configuration.max_retries + 1
    attempt = 0

    begin
      attempt += 1
      return yield
    rescue Aidp::Errors::ConfigurationError
      # Configuration errors should crash immediately (crash-early principle)
      # Re-raise without catching
      raise
    rescue => error
      current_provider = current_provider_safely

      if attempt < max_attempts
        error_info = {
          error: error,
          provider: current_provider,
          model: current_model_safely,
          error_type: @error_classifier.classify_error(error)
        }

        strategy = retry_strategy(error_info[:error_type])
        if should_retry?(error_info, strategy)
          delay = @backoff_calculator.calculate_delay(attempt, strategy[:backoff_strategy] || :exponential, 1, 10)
          debug_log("🔁 Retry attempt #{attempt} for #{current_provider}", level: :info, data: {delay: delay, error_type: error_info[:error_type]})
          @sleeper.sleep(delay) if delay > 0
          retry
        end
      end

      # Provider exhausted – attempt recovery (may switch provider)
      debug_log("🚫 Exhausted retries for provider, attempting recovery", level: :warn, data: {provider: current_provider, attempt: attempt, max_attempts: max_attempts})
      handle_error(error, {
        provider: current_provider,
        model: current_model_safely,
        exhausted_retries: true
      })

      new_provider = current_provider_safely
      if new_provider != current_provider && !providers_tried.include?(new_provider)
        providers_tried << current_provider
        # Reset retry counts for the new provider
        begin
          reset_retry_counts(new_provider)
        rescue => e
          debug_log("⚠️ Failed to reset retry counts for new provider", level: :warn, data: {error: e.message})
        end
        debug_log("🔀 Switched provider after failure – re-executing block", level: :info, data: {from: current_provider, to: new_provider})
        # Start retry loop fresh for new provider
        next
      end

      # No new provider (or already tried) – return structured failure
      debug_log("❌ No fallback provider available or all tried", level: :error, data: {providers_tried: providers_tried})
      begin
        if @provider_manager.respond_to?(:mark_provider_failure_exhausted)
          @provider_manager.mark_provider_failure_exhausted(current_provider)
          debug_log("🛑 Marked provider #{current_provider} unhealthy due to exhausted retries", level: :warn)
        end
      rescue => e
        debug_log("⚠️ Failed to mark provider failure-exhausted", level: :warn, data: {error: e.message})
      end
      return {
        status: "failed",
        error: error,
        message: error.message,
        provider: current_provider,
        providers_tried: providers_tried.dup
      }
    end
  end
end
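
Typical usage wraps a provider call in the block; when every provider has been tried, a structured failure hash is returned instead of an exception being raised. A sketch, where call_provider is a hypothetical helper:

result = handler.execute_with_retry do
  call_provider(prompt) # hypothetical provider invocation
end

if result.is_a?(Hash) && result[:status] == "failed"
  warn "All providers failed: #{result[:message]} " \
       "(tried: #{result[:providers_tried].join(", ")})"
end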

#handle_error(error, context = {}) ⇒ Object

Main entry point for error handling



# File 'lib/aidp/harness/error_handler.rb', line 50

def handle_error(error, context = {})
  error_info = @error_classifier.classify_error(error, context)

  # Debug logging
  debug_error(error, context)
  debug_log("🔧 ErrorHandler: Processing error", level: :info, data: {
    error_type: error_info[:error_type],
    provider: error_info[:provider],
    model: error_info[:model]
  })

  # Record error in metrics if available
  @metrics_manager&.record_error(error_info[:provider], error_info[:model], error_info)

  # Add to error history
  @error_history << error_info

  # Get retry strategy for this error type
  strategy = retry_strategy(error_info[:error_type])

  # Check if we should retry
  if should_retry?(error_info, strategy)
    debug_log("🔄 ErrorHandler: Attempting retry", level: :info, data: {
      strategy: strategy[:name],
      max_retries: strategy[:max_retries]
    })
    execute_retry(error_info, strategy, context)

  else
    # No retry, attempt recovery
    debug_log("🚨 ErrorHandler: No retry, attempting recovery", level: :warn, data: {
      error_type: error_info[:error_type],
      reason: "Retry not applicable or exhausted"
    })
    if error_info[:error_type].to_sym == :auth_expired
      # Mark provider unhealthy to avoid immediate re-selection
      begin
        if @provider_manager.respond_to?(:mark_provider_auth_failure)
          @provider_manager.mark_provider_auth_failure(error_info[:provider])
          debug_log("🔐 Marked provider #{error_info[:provider]} unhealthy due to auth error", level: :warn)
        end
      rescue => e
        debug_log("⚠️ Failed to mark provider unhealthy after auth error", level: :warn, data: {error: e.message})
      end
    end
    attempt_recovery(error_info, context)

  end
end
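
A minimal sketch of the main entry point, passing the provider/model context that classification and metrics recording draw on (the context keys mirror those read from error_info above; the names are assumptions):

begin
  call_provider(prompt) # hypothetical provider invocation
rescue Aidp::Errors::ConfigurationError
  raise # configuration errors crash early, as in #execute_with_retry
rescue => e
  handler.handle_error(e, provider: "anthropic", model: "claude-sonnet")
end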

#max_attempts ⇒ Object

Get maximum retry attempts



# File 'lib/aidp/harness/error_handler.rb', line 254

def max_attempts
  @configuration.respond_to?(:max_retries) ? @configuration.max_retries : 3
end

#reset_all_circuit_breakers ⇒ Object

Reset all circuit breakers



# File 'lib/aidp/harness/error_handler.rb', line 339

def reset_all_circuit_breakers
  @circuit_breakers.clear
end

#reset_circuit_breaker(provider, model = nil) ⇒ Object

Reset circuit breaker



# File 'lib/aidp/harness/error_handler.rb', line 333

def reset_circuit_breaker(provider, model = nil)
  key = model ? "#{provider}:#{model}" : provider
  @circuit_breakers.delete(key)
end
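
Keys follow the "provider:model" format used by #circuit_breaker_status, so either granularity can be cleared:

handler.reset_circuit_breaker("anthropic", "claude-sonnet") # deletes "anthropic:claude-sonnet"
handler.reset_circuit_breaker("anthropic")                  # deletes the provider-level key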

#reset_retry_counts(provider, model = nil) ⇒ Object

Reset retry counts for a specific provider/model combination



# File 'lib/aidp/harness/error_handler.rb', line 274

def reset_retry_counts(provider, model = nil)
  keys_to_reset = if model
    # Reset specific model
    @retry_counts.keys.select { |k| k.start_with?("#{provider}:#{model}:") }
  else
    # Reset all models for provider
    @retry_counts.keys.select { |k| k.start_with?("#{provider}:") }
  end

  keys_to_reset.each { |key| @retry_counts.delete(key) }
end
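
Because matching is by key prefix, omitting the model clears every model under the provider:

handler.reset_retry_counts("anthropic", "claude-sonnet") # one model only
handler.reset_retry_counts("anthropic")                  # all models for the provider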

#retry_status(provider, model = nil) ⇒ Object

Get retry status for a provider/model



# File 'lib/aidp/harness/error_handler.rb', line 287

def retry_status(provider, model = nil)
  keys = if model
    @retry_counts.keys.select { |k| k.start_with?("#{provider}:#{model}:") }
  else
    @retry_counts.keys.select { |k| k.start_with?("#{provider}:") }
  end

  status = {}
  keys.each do |key|
    error_type = key.split(":").last
    status[error_type] = {
      retry_count: @retry_counts[key],
      max_retries: retry_strategy(error_type.to_sym)[:max_retries]
    }
  end

  status
end
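
The result is keyed by error type, one entry per "provider:model:error_type" retry key (illustrative values):

handler.retry_status("anthropic", "claude-sonnet")
# => { "rate_limit" => { retry_count: 2, max_retries: 3 } }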

#retry_strategy(error_type) ⇒ Object

Get retry strategy for error type



# File 'lib/aidp/harness/error_handler.rb', line 249

def retry_strategy(error_type)
  @retry_strategies[error_type] || @retry_strategies[:default]
end

#should_retry?(error_info, strategy) ⇒ Boolean

Check if we should retry based on error type and strategy

Returns:

  • (Boolean)


# File 'lib/aidp/harness/error_handler.rb', line 259

def should_retry?(error_info, strategy)
  return false unless strategy[:enabled]

  # Use ErrorTaxonomy to determine if error is retryable
  error_type = error_info[:error_type]
  return false unless Aidp::Providers::ErrorTaxonomy.retryable?(error_type)

  # Check circuit breaker
  circuit_breaker_key = "#{error_info[:provider]}:#{error_info[:model]}"
  return false if circuit_breaker_open?(circuit_breaker_key)

  true
end
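
The three gates can be exercised directly; here :timeout and the provider/model names are assumptions:

strategy = handler.retry_strategy(:timeout)
handler.should_retry?(
  {error_type: :timeout, provider: "anthropic", model: "claude-sonnet"},
  strategy
)
# false when the strategy is disabled, when Aidp::Providers::ErrorTaxonomy
# deems the type non-retryable, or when the provider:model circuit breaker is open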