Class: Interferon::Destinations::Datadog
- Inherits: Object
- Inheritance chain: Object → Interferon::Destinations::Datadog
- Includes:
- Logging
- Defined in:
- lib/interferon/destinations/datadog.rb
Constant Summary collapse
- ALERT_KEY =
'This alert was created via the alerts framework'.freeze
Instance Attribute Summary collapse
-
#concurrency ⇒ Object
Returns the value of attribute concurrency.
Class Method Summary collapse
- .generate_message(message, people, options = {}) ⇒ Object
- .normalize_monitor_type(monitor_type) ⇒ Object
- .same_alerts(alert, people, alert_api_json) ⇒ Object
- .same_monitor_type(monitor_type_a, monitor_type_b) ⇒ Object
Instance Method Summary collapse
- #api_errors ⇒ Object
- #create_alert(alert, people) ⇒ Object
- #create_datadog_alert(alert, datadog_query, message, alert_options) ⇒ Object
- #existing_alerts ⇒ Object
- #fetch_existing_alerts ⇒ Object
-
#initialize(options) ⇒ Datadog
constructor
A new instance of Datadog.
- #log_datadog_response_code(resp, code, action, alert = nil) ⇒ Object
- #need_update(alert_people_pair, existing_alerts_from_api) ⇒ Object
- #remove_alert(alert) ⇒ Object
- #report_stats ⇒ Object
- #update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert) ⇒ Object
Methods included from Logging
configure_logger_for, #log, #statsd
Constructor Details
#initialize(options) ⇒ Datadog
Returns a new instance of Datadog.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/interferon/destinations/datadog.rb', line 16
#
# Builds the Dogapi client and initializes configuration and counters.
#
# @param options [Hash] string-keyed settings. 'app_key' and 'api_key' are
#   required. Optional keys: 'api_timeout' (default 15), 'concurrency'
#   (default 10), 'page_size' (default 1000), 'retries' (default 3),
#   'max_mute_minutes' and 'dry_run' (both stored as given).
# @raise [ArgumentError] if 'app_key' or 'api_key' is missing or falsy
def initialize(options)
  %w(app_key api_key).each do |req|
    raise ArgumentError, "missing required argument #{req}" unless options[req]
  end

  # Set dogapi timeout explicitly
  api_timeout = options['api_timeout'] || 15

  # Default parameters of Dogapi Client initialize() can be referenced from link below:
  # (as of this writing)
  # https://github.com/DataDog/dogapi-rb/blob/master/lib/dogapi/facade.rb#L14
  args = [
    options['api_key'],
    options['app_key'],
    nil, # host to talk to
    nil, # device
    true, # silent?
    api_timeout, # API timeout
  ]
  @dog = Dogapi::Client.new(*args)

  # Lazily populated cache of monitors keyed by name (see #existing_alerts)
  @existing_alerts = nil
  @max_mute_minutes = options['max_mute_minutes']
  @dry_run = options['dry_run']

  # Datadog communication threads
  @concurrency = options['concurrency'] || 10

  # Fetch page size
  @page_size = options['page_size'] || 1000

  # configure retries
  @retries = options['retries'] || 3

  @stats = {
    alerts_created: 0,
    alerts_to_be_created: 0,
    alerts_updated: 0,
    alerts_to_be_updated: 0,
    alerts_deleted: 0,
    alerts_to_be_deleted: 0,
    alerts_silenced: 0,
    api_successes: 0,
    api_client_errors: 0,
    api_unknown_errors: 0,
    manually_created_alerts: 0,
  }
end
Instance Attribute Details
#concurrency ⇒ Object
Returns the value of attribute concurrency.
13 14 15 |
# File 'lib/interferon/destinations/datadog.rb', line 13
#
# Returns the value of attribute concurrency.
#
# @return [Object] the current value of @concurrency
def concurrency
  @concurrency
end
Class Method Details
.generate_message(message, people, options = {}) ⇒ Object
70 71 72 73 74 75 76 77 78 79 |
# File 'lib/interferon/destinations/datadog.rb', line 70
#
# Builds the monitor message: the alert text, the ALERT_KEY marker line, and
# an @-mention for every person.
#
# @param message [String] the alert's own message text
# @param people [Array<String>] handles to mention; sorted before use
# @param options [Hash] :notify_recovery — when truthy, mentions are emitted
#   one per line; otherwise they are wrapped in a
#   {{^is_recovery}}…{{/is_recovery}} section on a single line
# @return [String] the parts joined with newlines
def self.generate_message(message, people, options = {})
  mentions = people.sort.map { |p| "@#{p}" }

  unless options[:notify_recovery]
    # Only mention on alert/warning.
    # Join explicitly: interpolating the Array itself would embed its
    # #inspect form (brackets and quotes) in the message.
    mentions = "{{^is_recovery}}#{mentions.join(' ')}{{/is_recovery}}"
  end

  [message, ALERT_KEY, mentions].flatten.join("\n")
end
.normalize_monitor_type(monitor_type) ⇒ Object
334 335 336 337 338 339 340 |
# File 'lib/interferon/destinations/datadog.rb', line 334
#
# Maps 'query alert' to 'metric alert' and returns any other type unchanged.
#
# @param monitor_type [String, nil] monitor type to normalize
# @return [String, nil] 'metric alert' for 'query alert', else the input
def self.normalize_monitor_type(monitor_type)
  # Convert 'query alert' type to 'metric alert' type. They can be used interchangeably when
  # submitting monitors to Datadog. Datadog will automatically do the conversion to 'query
  # alert' for a "complex" query that includes multiple metrics/tags while using 'metric alert'
  # for monitors that include a single scope/metric.
  monitor_type == 'query alert' ? 'metric alert' : monitor_type
end
.same_alerts(alert, people, alert_api_json) ⇒ Object
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 |
# File 'lib/interferon/destinations/datadog.rb', line 346
#
# Tells whether a locally defined alert matches a monitor as returned by the
# Datadog API, by comparing a fixed set of fields.
#
# @param alert [#[]] local alert definition (string keys such as
#   'monitor_type', 'metric', 'message', 'notify', ...)
# @param people [Array<String>] handles used to rebuild the expected message
# @param alert_api_json [Hash] monitor JSON from the API; must contain
#   'type', 'query', 'message' and an 'options' hash
# @return [Boolean] true when every compared field is equal
def self.same_alerts(alert, people, alert_api_json)
  api_options = alert_api_json['options']

  prev_alert = {
    monitor_type: normalize_monitor_type(alert_api_json['type']),
    query: alert_api_json['query'].strip,
    message: alert_api_json['message'].strip,
    evaluation_delay: api_options['evaluation_delay'],
    include_tags: api_options['include_tags'],
    notify_no_data: api_options['notify_no_data'],
    notify_audit: api_options['notify_audit'],
    no_data_timeframe: api_options['no_data_timeframe'],
    silenced: api_options['silenced'],
    thresholds: api_options['thresholds'],
    timeout_h: api_options['timeout_h'],
  }

  new_alert = {
    monitor_type: normalize_monitor_type(alert['monitor_type']),
    query: alert['metric']['datadog_query'],
    message: generate_message(
      alert['message'],
      people,
      notify_recovery: alert['notify']['recovery']
    ).strip,
    evaluation_delay: alert['evaluation_delay'],
    include_tags: alert['notify']['include_tags'],
    notify_no_data: alert['notify_no_data'],
    notify_audit: alert['notify']['audit'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    thresholds: alert['thresholds'],
    timeout_h: alert['timeout_h'],
  }

  # require_full_window is only compared when the local alert sets it
  unless alert['require_full_window'].nil?
    prev_alert[:require_full_window] = api_options['require_full_window']
    new_alert[:require_full_window] = alert['require_full_window']
  end

  prev_alert == new_alert
end
.same_monitor_type(monitor_type_a, monitor_type_b) ⇒ Object
342 343 344 |
# File 'lib/interferon/destinations/datadog.rb', line 342
#
# Compares two monitor types after passing each through
# normalize_monitor_type.
#
# @param monitor_type_a [String, nil]
# @param monitor_type_b [String, nil]
# @return [Boolean] true when both normalize to the same value
def self.same_monitor_type(monitor_type_a, monitor_type_b)
  normalize_monitor_type(monitor_type_a) == normalize_monitor_type(monitor_type_b)
end
Instance Method Details
#api_errors ⇒ Object
66 67 68 |
# File 'lib/interferon/destinations/datadog.rb', line 66
#
# Accumulator for API error descriptions; created empty on first access and
# memoized in @api_errors.
#
# @return [Array] the same array on every call
def api_errors
  @api_errors ||= []
end
#create_alert(alert, people) ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/interferon/destinations/datadog.rb', line 139
#
# Creates the alert in Datadog, or updates it when a monitor with the same
# name is already present in #existing_alerts, and records the outcome in
# @stats.
#
# @param alert [#[]] local alert definition (string keys)
# @param people [Array<String>] handles to mention in the message
# @return [Array] two elements: the alert name, and either a one-element
#   array holding the id from the response body or nil when the body is nil
def create_alert(alert, people)
  # create a message which includes the notifications
  # Datadog may have a race condition where alerts created in a bad state may be triggered
  # during the dry-run creation process. Delete people from dry-run alerts to avoid this
  # NOTE(review): nothing in this method removes people for dry runs — confirm
  # whether the caller does, or whether the comment above is stale.
  message = self.class.generate_message(
    alert['message'],
    people,
    notify_recovery: alert['notify']['recovery']
  )

  # create the hash of options to send to datadog
  alert_options = {
    notify_audit: alert['notify']['audit'],
    notify_no_data: alert['notify_no_data'],
    no_data_timeframe: alert['no_data_timeframe'],
    silenced: alert['silenced'],
    timeout_h: alert['timeout_h'],
  }

  # these keys are only sent when the alert sets them
  unless alert['notify']['include_tags'].nil?
    alert_options[:include_tags] = alert['notify']['include_tags']
  end
  alert_options[:evaluation_delay] = alert['evaluation_delay'] unless alert['evaluation_delay'].nil?
  unless alert['require_full_window'].nil?
    alert_options[:require_full_window] = alert['require_full_window']
  end
  alert_options[:thresholds] = alert['thresholds'] unless alert['thresholds'].nil?

  datadog_query = alert['metric']['datadog_query']
  existing_alert = existing_alerts[alert['name']]

  if existing_alert.nil?
    # new alert, create it
    action = :creating
    resp = create_datadog_alert(alert, datadog_query, message, alert_options)
  else
    # existing alert, modify it
    action = :updating
    resp = update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  end

  # log whenever we've encountered errors
  code = resp[0].to_i
  log_datadog_response_code(resp, code, action, alert)

  # assume this was a success
  unless code >= 400 || code == -1
    @stats[:alerts_created] += 1 if action == :creating
    @stats[:alerts_updated] += 1 if action == :updating
    # a nil 'silenced' counts as not silenced instead of raising NoMethodError
    silenced = alert_options[:silenced]
    @stats[:alerts_silenced] += 1 unless silenced.nil? || silenced.empty?
  end

  id = resp[1].nil? ? nil : [resp[1]['id']]
  # lets key alerts by their name
  [alert['name'], id]
end
#create_datadog_alert(alert, datadog_query, message, alert_options) ⇒ Object
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
# File 'lib/interferon/destinations/datadog.rb', line 204
#
# Logs and submits a new monitor. In dry-run mode the monitor is only
# validated via the client's validate_monitor call.
#
# @param alert [#[]] local alert definition; 'name' and 'monitor_type' are read
# @param datadog_query [String] the monitor query
# @param message [String] the full monitor message
# @param alert_options [Hash] options hash sent under the :options key
# @return [Object] whatever the Dogapi client call returns
def create_datadog_alert(alert, datadog_query, message, alert_options)
  @stats[:alerts_to_be_created] += 1
  new_alert_text = <<-EOM
Query: #{datadog_query}
Message: #{message}
Options: #{alert_options}
EOM
  log.info("creating new alert #{alert['name']}: #{new_alert_text}")

  monitor_options = {
    name: alert['name'],
    message: message,
    options: alert_options,
  }

  if @dry_run
    @dog.validate_monitor(alert['monitor_type'], datadog_query, monitor_options)
  else
    @dog.monitor(alert['monitor_type'], datadog_query, monitor_options)
  end
end
#existing_alerts ⇒ Object
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/interferon/destinations/datadog.rb', line 110
#
# Returns the monitors currently in Datadog keyed by name, fetching them on
# first use and caching the result in @existing_alerts.
#
# Monitors that share a name are merged into the first one seen: its 'id'
# becomes an array holding every id with that name. Note that the fetched
# alert hashes are modified in place ('id' is replaced by an array).
#
# Also records in @stats how many cached monitors lack ALERT_KEY in their
# message.
#
# @return [Hash] monitor name => monitor hash with 'id' as an Array
def existing_alerts
  return @existing_alerts if @existing_alerts

  # fetch first so a failed fetch leaves the cache unset
  alerts = fetch_existing_alerts

  # key alerts by name
  @existing_alerts = {}
  alerts.each do |alert|
    known = @existing_alerts[alert['name']]
    if known.nil?
      alert['id'] = [alert['id']]
      @existing_alerts[alert['name']] = alert
    else
      known['id'] << alert['id']
    end
  end

  # count how many are manually created
  @stats[:manually_created_alerts] =
    @existing_alerts.count { |_name, a| !a['message'].include?(ALERT_KEY) }

  log.info(
    "datadog: found #{@existing_alerts.length} existing alerts; " \
    "#{@stats[:manually_created_alerts]} were manually created"
  )

  @existing_alerts
end
#fetch_existing_alerts ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/interferon/destinations/datadog.rb', line 81
#
# Retrieves every monitor from Datadog page by page, using up to
# @concurrency threads via Parallel.
#
# Each page is requested up to @retries + 1 times. A page shorter than
# @page_size marks the end of the listing.
#
# @return [Array] all monitors collected from the fetched pages
# @raise [RuntimeError] when a page never returns HTTP 200 within the
#   allowed attempts
def fetch_existing_alerts
  alerts = Queue.new
  has_more = true

  # The lambda is the work source: it keeps producing items until a short
  # page flips has_more, at which point it yields Parallel::Stop.
  Parallel.map_with_index(-> { has_more || Parallel::Stop },
                          in_threads: @concurrency) do |_, page|
    successful = false
    @retries.downto(0) do
      resp = @dog.get_all_monitors(page: page, page_size: @page_size)
      code = resp[0].to_i
      if code != 200
        log.info("Failed to retrieve existing alerts from datadog. #{code}: #{resp[1].inspect}")
      else
        alerts_page = resp[1]
        has_more = false if alerts_page.length < @page_size
        alerts_page.each { |alert| alerts.push(alert) }
        successful = true
        break
      end
    end

    # Out of retries
    raise 'Retries exceeded for fetching data from datadog.' unless successful
  end

  # drain the queue into a plain array
  Array.new(alerts.size) { alerts.pop }
end
#log_datadog_response_code(resp, code, action, alert = nil) ⇒ Object
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 |
# File 'lib/interferon/destinations/datadog.rb', line 404
#
# Classifies an API response code, bumps the matching @stats counter and,
# when an alert is given, emits statsd gauges and an error log line.
#
# Classification: exactly 400 is a client error; anything above 400, or -1,
# is an unknown error; every other code counts as a success.
#
# @param resp [Array] raw response; resp[0] and resp[1] appear in error logs
# @param code [Integer] numeric response code
# @param action [Symbol, String] verb interpolated into the error log
# @param alert [#[], nil] alert being processed; when nil only @stats changes
# @return [void]
def log_datadog_response_code(resp, code, action, alert = nil)
  # log whenever we've encountered errors
  api_errors << "#{code} on alert #{alert['name']}" if code != 200 && !alert.nil?

  outcome, stat_key =
    if code == 400
      # client error
      %i[client_error api_client_errors]
    elsif code > 400 || code == -1
      # unknown (prob. datadog) error
      %i[unknown_error api_unknown_errors]
    else
      %i[success api_successes]
    end

  @stats[stat_key] += 1
  return if alert.nil?

  # Exactly one of the three gauges is set to 1, the others to 0.
  # NOTE(review): the tag interpolates the alert object itself rather than
  # alert['name'] — confirm the alert's #to_s yields the intended tag value.
  %i[unknown_error client_error success].each do |metric|
    statsd.gauge("datadog.api.#{metric}", metric == outcome ? 1 : 0, tags: ["alert:#{alert}"])
  end

  case outcome
  when :client_error
    log.error("client error while #{action} alert '#{alert['name']}';" \
      " query was '#{alert['metric']['datadog_query']}'" \
      " response was #{resp[0]}:'#{resp[1].inspect}'")
  when :unknown_error
    log.error("unknown error while #{action} alert '#{alert['name']}':" \
      " query was '#{alert['metric']['datadog_query']}'" \
      " response was #{resp[0]}:'#{resp[1].inspect}'")
  end
end
#need_update(alert_people_pair, existing_alerts_from_api) ⇒ Object
328 329 330 331 332 |
# File 'lib/interferon/destinations/datadog.rb', line 328
#
# Decides whether an alert has to be created or updated in Datadog.
#
# @param alert_people_pair [Array] two elements: the alert and its people
# @param existing_alerts_from_api [Hash] monitors keyed by alert name
# @return [Boolean] true when no monitor with the alert's name exists, or
#   when the class-level same_alerts reports a difference
def need_update(alert_people_pair, existing_alerts_from_api)
  alert, people = alert_people_pair
  existing = existing_alerts_from_api[alert['name']]
  existing.nil? || !self.class.same_alerts(alert, people, existing)
end
#remove_alert(alert) ⇒ Object
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 |
# File 'lib/interferon/destinations/datadog.rb', line 305
#
# Deletes every monitor id held by the given alert, but only when its
# message contains ALERT_KEY; other alerts are left alone with a warning.
# In dry-run mode the deletion is counted and logged but not performed.
#
# @param alert [Hash] existing monitor with 'message', 'name' and an 'id'
#   array (the shape produced by #existing_alerts)
# @return [Object] not meaningful; the method is called for its side effects
def remove_alert(alert)
  unless alert['message'].include?(ALERT_KEY)
    return log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end

  @stats[:alerts_to_be_deleted] += 1
  log.info("deleting alert: #{alert['name']}")

  # Safety to protect against accidental dry_run deletion
  return if @dry_run

  alert['id'].each do |alert_id|
    resp = @dog.delete_monitor(alert_id)
    code = resp[0].to_i
    log_datadog_response_code(resp, code, :deleting)

    # assume this was a success
    @stats[:alerts_deleted] += 1 unless code >= 300 || code == -1
  end
end
#report_stats ⇒ Object
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 |
# File 'lib/interferon/destinations/datadog.rb', line 387
#
# Publishes every @stats counter as a "datadog.<key>" statsd gauge and logs
# a one-line summary of created/updated/deleted counts versus the number
# attempted.
#
# @return [Object] whatever log.info returns
def report_stats
  @stats.each { |key, value| statsd.gauge("datadog.#{key}", value) }

  summary = format(
    'datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts',
    @stats[:alerts_created],
    @stats[:alerts_to_be_created],
    @stats[:alerts_updated],
    @stats[:alerts_to_be_updated],
    @stats[:alerts_deleted],
    @stats[:alerts_to_be_deleted]
  )
  log.info(summary)
end
#update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert) ⇒ Object
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 |
# File 'lib/interferon/destinations/datadog.rb', line 237
#
# Logs a diff and brings an existing monitor in line with the local alert.
#
# Three paths:
# * dry run — the monitor is only validated;
# * same (normalized) monitor type — the monitor is updated in place and
#   possibly unmuted;
# * different monitor type — the monitor is deleted and, if the delete
#   returned a code below 300, created again with the new type.
#
# Only the first id in existing_alert['id'] is acted on.
#
# @param alert [#[]] local alert definition; 'name' and 'monitor_type' are read
# @param datadog_query [String] the monitor query
# @param message [String] the full monitor message
# @param alert_options [Hash] options hash; :silenced must respond to #empty?
# @param existing_alert [Hash] monitor from #existing_alerts ('id' array,
#   'type', 'query', 'message', 'options')
# @return [Object] the response of the last validate/update/delete/create call
def update_datadog_alert(alert, datadog_query, message, alert_options, existing_alert)
  @stats[:alerts_to_be_updated] += 1
  id = existing_alert['id'][0]

  new_alert_text = <<-EOM.strip
Query: #{datadog_query.strip}
Message: #{message.strip}
Options: #{alert_options}
EOM
  # NOTE(review): both texts print the new alert_options, so option changes
  # never appear in the logged diff — confirm whether that is intended.
  existing_alert_text = <<-EOM.strip
Query: #{existing_alert['query'].strip}
Message: #{existing_alert['message'].strip}
Options: #{alert_options}
EOM
  diff = Diffy::Diff.new(existing_alert_text, new_alert_text, context: 1)
  log.info("updating existing alert #{id} (#{alert['name']}):\n#{diff}")

  monitor_options = {
    name: alert['name'],
    message: message,
    options: alert_options,
  }

  if @dry_run
    resp = @dog.validate_monitor(alert['monitor_type'], datadog_query, monitor_options)
  elsif self.class.same_monitor_type(alert['monitor_type'], existing_alert['type'])
    resp = @dog.update_monitor(id, datadog_query, monitor_options)

    # Unmute existing alerts that exceed the max silenced time
    # Datadog does not allow updates to silencing via the update_alert API call.
    silenced = existing_alert['options']['silenced']
    if !@max_mute_minutes.nil?
      # keep only the mute expiries that fall within the allowed window
      silenced = silenced.values.reject do |t|
        t.nil? || t == '*' || t > Time.now.to_i + @max_mute_minutes * 60
      end
      @dog.unmute_monitor(id) if alert_options[:silenced].empty? && silenced.empty?
    elsif alert_options[:silenced].empty? && !silenced.empty?
      @dog.unmute_monitor(id)
    end
  else
    # Need to recreate alert with new monitor type
    resp = @dog.delete_monitor(id)
    code = resp[0].to_i
    unless code >= 300 || code == -1
      resp = @dog.monitor(alert['monitor_type'], datadog_query, monitor_options)
    end
  end
  resp
end