Class: Bfire::Engine

Inherits:
Object
  • Object
show all
Includes:
PubSub::Publisher
Defined in:
lib/bfire/engine.rb

Constant Summary collapse

DEBUG =
Logger::DEBUG
INFO =
Logger::INFO
WARN =
Logger::WARN
ERROR =
Logger::ERROR
UNKNOWN =
Logger::UNKNOWN

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from PubSub::Publisher

#error?, #hooks, included, #on, #trigger, #triggered_events

Constructor Details

#initialize(opts = {}) ⇒ Engine

Returns a new instance of Engine.



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/bfire/engine.rb', line 35

# Builds an engine with empty registries and default configuration.
#
# @param opts [Hash] construction options; only :root is read here. When
#   :root is missing or falsy, the current working directory is used.
def initialize(opts = {})
  @root = opts[:root] || Dir.pwd

  # Configuration hash, followed by the registries for groups, networks,
  # storages and locations. Each one gets its own empty Hash.
  @properties, @vmgroups, @networks, @storages, @locations = Array.new(5) { {} }

  # Lock used by #synchronize, and the memoized experiment (none yet).
  @mutex, @experiment = Mutex.new, nil

  # One thread group for the master threads, one for the threads related
  # to a Group.
  @tg_master, @tg_groups = ThreadGroup.new, ThreadGroup.new

  # Fill in the default configuration values.
  reset
end

Instance Attribute Details

#propertiesObject (readonly)

Engine configuration hash.



31
32
33
# File 'lib/bfire/engine.rb', line 31

# Read-only accessor for the engine configuration.
#
# @return [Hash] the hash stored in @properties (the same object #conf
#   returns, and the one #set writes into).
def properties
  @properties
end

#sessionObject (readonly)

Returns the Restfully::Session object



33
34
35
# File 'lib/bfire/engine.rb', line 33

# Read-only accessor for the API session.
#
# NOTE(review): @session is not assigned by any method visible here;
# presumably it is set elsewhere in the class -- confirm.
#
# @return [Object, nil] whatever is stored in @session.
def session
  @session
end

Instance Method Details



435
436
437
# File 'lib/bfire/engine.rb', line 435

# Prefix prepended to the engine's log and error messages.
#
# @return [String]
def banner
  "[BFIRE] "
end

#cleanup!Object

Cleanup procedure: kills the threads of all groups, then deletes the experiment unless cleanup is disabled.



402
403
404
405
406
407
408
409
410
411
412
413
414
415
# File 'lib/bfire/engine.rb', line 402

# Stops the group threads, then deletes the experiment when allowed.
#
# The experiment is deleted only if #cleanup? is true and an experiment
# has been created or found; otherwise a warning is logged and nothing is
# deleted.
#
# @return [Object] the result of the experiment deletion, or of the
#   logger call when nothing is cleaned up.
def cleanup!
  # Threads are killed while holding the engine lock.
  synchronize { @tg_groups.list.each(&:kill) } unless @tg_groups.list.empty?

  if !cleanup? || @experiment.nil?
    logger.warn "#{banner}Not cleaning up experiment."
  else
    logger.warn "#{banner}Cleaning up in 5 seconds. Hit CTRL-C now to keep your experiment running."
    # Grace period announced in the message above.
    sleep 5
    @experiment.delete
  end
end

#cleanup?Boolean

Returns:

  • (Boolean)


417
418
419
420
421
# File 'lib/bfire/engine.rb', line 417

# Tells whether the experiment should be deleted by #cleanup!.
#
# Cleanup is refused in dev mode, when :no_cancel is set, and when
# :no_cleanup is set while no error has been triggered.
#
# @return [Boolean]
def cleanup?
  !(dev? || conf[:no_cancel] || (conf[:no_cleanup] && !error?))
end

#confObject

Returns the configuration Hash.



394
395
396
# File 'lib/bfire/engine.rb', line 394

# Short alias for the configuration hash.
#
# @return [Hash] the hash stored in @properties.
def conf
  @properties
end

#dag(nodes) ⇒ Object

Returns the directed acyclic graph for the given group names, based on their declared dependencies.

Raises:



85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/bfire/engine.rb', line 85

# Builds the dependency graph of the given groups.
#
# Every group name becomes a vertex, and each declared dependency adds an
# edge from the dependency to the dependent group.
#
# @param nodes [Enumerable] names of the groups to include.
# @return [RGL::DirectedAdjacencyGraph] the dependency graph.
# @raise [Error] if the resulting graph contains a cycle.
def dag(nodes)
  graph = RGL::DirectedAdjacencyGraph.new

  nodes.each do |node|
    graph.add_vertex(node)
    # Only the dependency name is used; the second element is ignored.
    group(node).dependencies.each do |dependency, _block|
      graph.add_vertex(dependency)
      graph.add_edge(dependency, node)
    end
  end

  return graph if graph.acyclic?

  raise Error, "Your dependency graph is not acyclic!"
end

#deploy!Object



134
135
136
137
138
139
140
# File 'lib/bfire/engine.rb', line 134

# Deploys every declared group, following the topological order of their
# dependencies.
#
# @return [Boolean] the result of #launch_waiting_groups.
def deploy!
  order = dag(@vmgroups.keys).topsort_iterator
  # The iterator is cloned for logging so the real one is not consumed.
  logger.info "#{banner}Launching groups in the following topological order: #{order.clone.to_a.inspect}."
  launch_waiting_groups(order)
end

#dev?Boolean

Returns:

  • (Boolean)


439
440
441
# File 'lib/bfire/engine.rb', line 439

# Tells whether the engine runs in development mode, i.e. whether the
# :dev configuration property is truthy.
#
# @return [Boolean]
def dev?
  conf[:dev] ? true : false
end

#engineObject

Helper methods. Returns the engine itself.



427
428
429
# File 'lib/bfire/engine.rb', line 427

# Returns the engine itself.
#
# @return [Engine] self.
def engine
  self
end

#experiment(name = nil) ⇒ Object

If given a name, attempts to find an existing running experiment with the same name. If name is nil or omitted, creates a new experiment.

Returns a Restfully::Resource object, or nil.



338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# File 'lib/bfire/engine.rb', line 338

# Returns the experiment attached to this engine, resolving it on the
# first call and memoizing the result afterwards.
#
# When nothing is memoized yet: with a +name+, looks for a running
# experiment having that name; without one, submits a new experiment built
# from the :name, :description and :walltime configuration properties.
# Once an experiment is memoized, +name+ is ignored.
#
# @param name [String, nil] name of an existing running experiment.
# @return [Object, nil] the experiment resource, or nil when no running
#   experiment matches +name+.
def experiment(name = nil)
  # The session is fetched before taking the lock.
  api = session
  synchronize do
    @experiment ||= begin
      experiments = api.root.experiments
      if name.nil?
        experiments.submit(
          :name => conf[:name],
          :description => conf[:description],
          :walltime => conf[:walltime],
          :status => "waiting"
        )
      else
        experiments.find do |candidate|
          candidate['status'] == 'running' && candidate['name'] == name
        end
      end
    end
  end
end

#fetch_location(name) ⇒ Object

Raises:



307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/bfire/engine.rb', line 307

# Resolves a location by name and caches it.
#
# The special name :any picks one of the locations exposed by the API at
# random. Any other name is looked up in the local cache first, then
# through the API.
#
# @param name [#to_sym] location name, or :any.
# @return [Object, nil] the location; nil only when :any is requested and
#   the API exposes no location at all.
# @raise [Error] if a named location cannot be found.
def fetch_location(name)
  key = name.to_sym

  if key == :any
    candidates = session.root.locations
    return nil if candidates.length == 0
    found = candidates[rand(candidates.length)]
  else
    found = @locations[key] || session.root.locations[key]
  end

  raise Error, "#{banner}Can't find #{key.inspect} location" if found.nil?

  # Cache under the location's own name, which may differ from +key+
  # when :any was requested.
  synchronize { @locations[found['name'].to_sym] ||= found }
  found
end

#fetch_network(name, location) ⇒ Object



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/bfire/engine.rb', line 248

# Resolves the network called +name+ at the given location.
#
# Networks are stored per location under the key "<location name>.<name>".
# A general definition stored under the bare name (by #network or
# #resuscitate!) is copied to that key first. The entry is then resolved:
# a resource is kept as is, a Proc is called with (name, location,
# experiment), and anything else falls back to searching the networks of
# the location by name.
#
# NOTE(review): the general definition is re-cloned on every call, so a
# Proc-based definition is invoked again each time -- confirm intended.
#
# @param name [String, Symbol, Regexp] network name; a Regexp is matched
#   against the names of the location's networks.
# @param location [Object] location responding to ['name'] and #networks.
# @return [Object, nil] the resolved network, or nil if none matches.
def fetch_network(name, location)
  label = name.to_s
  site = location['name']
  key = [site, label].join(".")
  logger.debug "#{banner}Looking for network #{name.inspect} at #{site.inspect}. key=#{key.inspect}"

  # Resolved before taking the lock: #experiment synchronizes on the same
  # mutex, which is not reentrant.
  exp = experiment

  synchronize do
    # Duplicate the general definition, if there is one.
    general = @networks[label]
    @networks[key] = general.clone if general

    current = @networks[key]
    @networks[key] =
      case current
      when Restfully::Resource
        current
      when Proc
        current.call(name, location, exp)
      else
        location.networks.find do |candidate|
          name.kind_of?(Regexp) ? candidate['name'] =~ name : candidate['name'] == label
        end
      end
  end

  @networks[key]
end

#fetch_storage(name, location) ⇒ Object



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/bfire/engine.rb', line 280

# Resolves the storage called +name+ at the given location.
#
# Storages are stored per location under the key "<location name>.<name>".
# A general definition stored under the bare name (by #storage or
# #resuscitate!) is copied to that key first. The entry is then resolved:
# a resource is kept as is, a Proc is called with (name, location,
# experiment), and anything else falls back to searching the storages of
# the location by name.
#
# NOTE(review): the general definition is re-cloned on every call, so a
# Proc-based definition is invoked again each time -- confirm intended.
#
# @param name [String, Symbol, Regexp] storage name; a Regexp is matched
#   against the names of the location's storages.
# @param location [Object] location responding to ['name'] and #storages.
# @return [Object, nil] the resolved storage, or nil if none matches.
def fetch_storage(name, location)
  label = name.to_s
  site = location['name']
  key = [site, label].join(".")
  logger.debug "#{banner}Looking for storage #{name.inspect} at #{site.inspect}. key=#{key.inspect}"

  # Resolved before taking the lock: #experiment synchronizes on the same
  # mutex, which is not reentrant.
  exp = experiment

  synchronize do
    # Duplicate the general definition, if there is one.
    general = @storages[label]
    @storages[key] = general.clone if general

    current = @storages[key]
    @storages[key] =
      case current
      when Restfully::Resource
        current
      when Proc
        current.call(name, location, exp)
      else
        location.storages.find do |candidate|
          name.kind_of?(Regexp) ? candidate['name'] =~ name : candidate['name'] == label
        end
      end
  end

  @storages[key]
end

#group(name, options = {}, &block) ⇒ Object

Define a new group (if block given), or return the group corresponding to the given name.



218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/bfire/engine.rb', line 218

# Declares a group, or looks one up.
#
# With a block: creates the group on first use (later declarations reuse
# the existing one and ignore +options+), then evaluates the block in the
# context of the group. Without a block: returns the registered group.
#
# @param name [#to_sym] group name.
# @param options [Hash] group options; keys are symbolized. Only used
#   when the group is created.
# @return [Object, nil] with a block, the value of the block; without one,
#   the group or nil if it has not been declared.
def group(name, options = {}, &block)
  key = name.to_sym
  return @vmgroups[key] unless block

  @vmgroups[key] ||= Group.new(self, key, options.symbolize_keys)
  @vmgroups[key].instance_eval(&block)
end

#groupsObject



431
432
433
# File 'lib/bfire/engine.rb', line 431

# Returns the registry of declared groups.
#
# @return [Hash] the hash stored in @vmgroups; #group fills it with
#   symbol names as keys and group objects as values.
def groups
  @vmgroups
end

#keychainObject



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/bfire/engine.rb', line 70

# Looks in ~/.ssh for a key pair: a "*.pub" file whose private counterpart
# (same path without the ".pub" suffix) also exists.
#
# @return [Array(String, String), nil] [public key path, private key path]
#   of the first complete pair found, or nil when there is none.
def keychain
  Dir[File.expand_path("~/.ssh/*.pub")].each do |pub|
    priv = pub.gsub(/\.pub$/, "")
    return [pub, priv] if File.exist?(priv)
  end
  nil
end

#launch!Object

Launches a monitor for each group, and waits for their termination before saying “ready”.



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/bfire/engine.rb', line 169

# Starts one monitoring thread per declared group, waits until every group
# has triggered :ready, then triggers :ready on the engine and blocks until
# the monitoring threads have terminated.
#
# Triggers :error on the engine for each monitoring thread that ended
# abnormally (see the status check at the bottom).
def launch!
  @vmgroups.each{|name, group|
    # Monitoring threads are registered in @tg_groups, the thread group
    # that #cleanup! kills.
    @tg_groups.add(Thread.new {
      # Make an exception raised in this thread surface in the main thread.
      Thread.current.abort_on_exception = true
      group.monitor
    })
  }

  # Poll every 5 seconds until all groups have published :ready.
  # NOTE(review): nothing in this loop exits when a group fails before
  # becoming ready -- confirm an error always surfaces some other way.
  until @vmgroups.all?{|(n,g)| g.triggered_events.include?(:ready)}
    sleep 5
  end

  logger.info "#{banner}All groups are now READY: #{groups.inspect}."

  trigger :ready
  # Wait for every monitoring thread; the block runs once per thread as it
  # terminates.
  ThreadsWait.all_waits(*@tg_groups.list) do |t|
    # http://apidock.com/ruby/Thread/status
    # A nil status means the thread terminated with an exception.
    # t[:ko] is a thread-local flag, presumably set by the group's monitor
    # to report a failure -- not visible here.
    if t.status.nil? || t.status == "aborting" || t[:ko]
      trigger :error
    end
  end
end

#launch_compute(template, count = 1) ⇒ Object

Launch a number of compute resources based on the given template.



325
326
327
328
329
330
331
# File 'lib/bfire/engine.rb', line 325

# Submits +count+ compute resources built from the same template.
#
# @param template [#to_h] template describing the compute; converted to a
#   Hash once and submitted as is for every instance.
# @param count [Integer] number of computes to launch.
# @return [Array] the results of each submission, in launch order.
def launch_compute(template, count = 1)
  payload = template.to_h
  count.times.map { |index|
    position = index + 1
    logger.debug "#{banner}#{position}/#{count} - Launching compute with the following data: #{payload.inspect}"
    experiment.computes.submit(payload)
  }
end

#launch_waiting_groups(topsort_iterator) ⇒ Object

Launches the groups in topological order, and waits for the end of each group's initialization procedure.



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/bfire/engine.rb', line 144

# Launches the initial resources of every group, one batch of the
# topological order at a time, until the iterator is exhausted.
#
# Within a batch, groups are launched one after the other: each launch
# runs in its own thread, which is joined before moving on. Groups already
# in error are skipped.
#
# @param topsort_iterator [Object] topological-sort iterator over the
#   group names; it is advanced by this method.
# @return [Boolean] true once the iterator is at its end, false if the
#   engine is in error before a batch starts.
def launch_waiting_groups(topsort_iterator)
  until topsort_iterator.at_end?
    return false if error?

    # Ugly: the library does not expose the batch of vertices that are
    # ready to be visited, so it is read straight from the iterator.
    batch = topsort_iterator.instance_variable_get("@waiting")
    logger.info "#{banner}Launching #{batch.inspect}"

    # The iterator must not be touched while iterating over the batch,
    # otherwise side-effects can occur.
    batch.each do |group_name|
      vmgroup = group(group_name)
      # The group may have been put in error by the engine.
      next if vmgroup.error?

      worker = Thread.new do
        Thread.current.abort_on_exception = true
        vmgroup.launch_initial_resources
      end
      worker.join
    end

    # Consume the batch that has just been processed.
    batch.length.times { topsort_iterator.forward }
  end

  true
end

#loggerObject

Returns the logger for the engine.



444
445
446
447
448
449
450
# File 'lib/bfire/engine.rb', line 444

# Returns the engine logger.
#
# On first use, takes the logger from the :logger configuration property
# and sets its level to the :logging property; the result is memoized, so
# later changes to either property are not picked up.
#
# @return [Object] the configured logger.
def logger
  @logger ||= conf[:logger].tap { |configured| configured.level = conf[:logging] }
end

#metric(name, options = {}) ⇒ Object



356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
# File 'lib/bfire/engine.rb', line 356

# Fetches the last hour of values of a metric from Zabbix.
#
# The :hosts option is removed from +options+; only the first host is
# used to look up the metric items, and only the first two matching items
# are queried for history.
#
# @param name [#to_s] the metric key.
# @param options [Hash] options; :hosts (one host or a list of hosts
#   responding to ['name'] and ['id']) is consumed here, the remainder is
#   passed on to Metric.new.
# @return [Metric] the metric wrapping the history results.
def metric(name, options = {})
  host_list = [options.delete(:hosts) || []].flatten
  # Hosts are identified as "<name>-<id>".
  host_keys = host_list.map { |host| [host['name'], host['id']].join("-") }
  @zabbix ||= Aggregator::Zabbix.new(session, experiment)

  item_filter = {
    "host" => host_keys[0],
    "key_" => name.to_s
  }
  found_items = @zabbix.request("item.get", {
    :filter => item_filter,
    "output" => "extend"
  })
  item_ids = found_items.map { |item| item['itemid'] }

  # Most recent last
  upper_bound = Time.now.to_i
  history = @zabbix.request("history.get", {
    "itemids" => item_ids[0..1],
    # FIX once we can correctly specify metric type
    "history" => 1, # STRING
    "output" => "extend",
    "time_from" => upper_bound-3600,
    "time_till" => upper_bound
  })

  Metric.new(name, history, options)
end

#network(name, options = {}, &block) ⇒ Object

Define a network. A network is location dependent.



244
245
246
# File 'lib/bfire/engine.rb', line 244

# Registers a general network definition under the given name.
#
# The block is stored as is; #fetch_network later calls it with
# (name, location, experiment) to resolve the network at a location.
#
# @param name [#to_s] network name.
# @param options [Hash] accepted but not used by this method.
# @return [Proc, nil] the stored block (nil when no block is given).
def network(name, options = {}, &block)
  @networks[name.to_s] = block
end

#path_to(path) ⇒ Object



53
54
55
# File 'lib/bfire/engine.rb', line 53

# Expands +path+ relative to the engine root directory (@root).
#
# @param path [String] a path; an absolute path is returned expanded but
#   otherwise unchanged.
# @return [String] the absolute path.
def path_to(path)
  File.expand_path(path, @root)
end

#resetObject



57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/bfire/engine.rb', line 57

# Fills in the default value of every configuration property that is not
# already set (or is set to a falsy value).
#
# The SSH key defaults come from #keychain and are nil when no key pair
# is found.
#
# @return [Object] the resulting :authorized_keys value.
def reset
  {
    :name => "Bfire experiment",
    :description => "Anonymous description",
    :walltime => 3600
  }.each { |key, default| conf[key] ||= default }

  # Kept separate so a logger is only built when none is configured.
  conf[:logger] ||= Logger.new(STDOUT)

  {
    :logging => INFO,
    :user => ENV['USER'],
    :ssh_max_attempts => 3
  }.each { |key, default| conf[key] ||= default }

  pub, priv = keychain
  conf[:key] ||= priv
  conf[:authorized_keys] ||= pub
end

#resuscitate!Object

Reloads vmgroups, networks and storages linked to an experiment.



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/bfire/engine.rb', line 193

# Rebuilds the engine state from the resources of the current experiment.
#
# Networks and storages are registered under their own name. Each compute
# is attached to the template of the group it was launched from; both are
# recovered from the compute name, which has the form
# "<group>--<template>--<guid>". Finally every declared group merges its
# templates and is checked.
#
# @return [true]
# @raise [Error] if a compute refers to a group that is not declared.
def resuscitate!
  experiment.networks.each { |net| @networks[net['name']] = net }
  experiment.storages.each { |disk| @storages[disk['name']] = disk }

  experiment.computes.each do |compute|
    group_name, template_name, _guid = compute['name'].split("--")
    vmgroup = group(group_name)
    raise Error, "Group #{group_name} is not declared in the DSL." if vmgroup.nil?

    vmgroup.template(template_name).instances.push(compute)
  end

  @vmgroups.each_value do |vmgroup|
    vmgroup.merge_templates!
    vmgroup.check!
  end

  true
end

#run!Object

Launch procedure. Will execute each group in a separate thread, and launch a thread to monitor experiment status.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/bfire/engine.rb', line 101

# Main entry point: sets up the hooks and the monitoring thread, deploys
# (or re-attaches to) the experiment, launches the group monitors, and
# blocks until the master threads have terminated.
#
# Any exception raised along the way is logged and turned into an :error
# event, which runs #cleanup! through the hook registered below.
#
# NOTE(review): `rescue Exception` also catches Interrupt and SystemExit
# raised in this thread, so CTRL-C here is reported as an error instead of
# propagating -- confirm this is intended.
def run!
  # call #session to initiate Restfully::Session object outside of threads
  logger.info "#{banner}Using bonfire-api/#{session.root['version']}"
  
  # Both failure and termination of the experiment lead to a cleanup.
  on(:error) { cleanup! }
  on(:terminated) { cleanup! }

  # Master thread running #monitor (not visible here; presumably watches
  # the experiment status -- confirm).
  @tg_master.add(Thread.new {
    # Make an exception raised in this thread surface in the main thread.
    Thread.current.abort_on_exception = true
    monitor
  })

  # In dev mode, reuse a running experiment having the configured name if
  # there is one; in every other case deploy from scratch.
  initialized = if dev? && experiment(conf[:name])
    resuscitate!
  else
    deploy!
  end
  
  experiment.update(:status => "running")

  # A falsy result means the deployment did not complete.
  if initialized
    launch!
  else
    cleanup!
  end

  ThreadsWait.all_waits(*@tg_master.list)
rescue Exception => e
  logger.error "#{banner}#{e.class.name}: #{e.message}"
  logger.debug e.backtrace.join("; ")
  trigger :error
end

#set(property, value) ⇒ Object

Sets the given property to the given value.



389
390
391
# File 'lib/bfire/engine.rb', line 389

# Sets a configuration property.
#
# @param property [#to_sym] property name; stored as a Symbol.
# @param value [Object] the value to store.
# @return [Object] the stored value.
def set(property, value)
  @properties[property.to_sym] = value
end

#ssh(fqdn, username, options = {}) {|Net::SSH::Connection::Session| ... } ⇒ Object

Sets up an SSH connection as username to fqdn. By default, the SSH connection is retried at most ssh_max_attempts times (a configuration property) if the host is unreachable. You can override that default locally by passing a :max_attempts option. The :timeout and :keys options can be overridden locally in the same way.

If option :multi is given and true, then an instance of Net::SSH::Multi::Session is yielded. See <net-ssh.github.com/multi/v1/api/index.html> for more information.

Parameters:

  • fqdn (String)

    the fully qualified domain name of the host to connect to.

  • username (String)

    the login to use to connect to the host.

  • options (Hash) (defaults to: {})

    a hash of additional options to pass.

Yields:

  • (Net::SSH::Connection::Session)

    ssh a SSH handler.

Raises:

  • (ArgumentError)


471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
# File 'lib/bfire/engine.rb', line 471

# Opens an SSH connection as +username+ to +fqdn+ and yields it to the
# block. The connection goes through the :gateway configuration property
# when it is set.
#
# When the host is unreachable (Errno::EHOSTUNREACH) the connection is
# retried every 5 seconds, at most max-attempts times, before the error is
# re-raised.
#
# Engine-only options (removed before the hash reaches Net::SSH, which
# rejects options it does not know):
#   :log                    - log connection attempts when truthy.
#   :multi                  - when truthy, yield a Net::SSH::Multi session;
#                             +fqdn+ must then respond to #each.
#   :concurrent_connections - for :multi, defaults to 10.
#   :max_attempts           - local override of the :ssh_max_attempts
#                             configuration property (:ssh_max_attempts is
#                             accepted as an alias).
# Every other option is passed to Net::SSH. :timeout defaults to 10;
# :keys defaults to the configured key unless a :password is given, in
# which case :auth_methods defaults to keyboard-interactive.
#
# The +options+ hash given by the caller is not modified.
#
# @param fqdn [String, #each] host to connect to (a list with :multi).
# @param username [String] the login to use.
# @param options [Hash] see above.
# @yield [session] the SSH session (or multi session).
# @return [Object] whatever the underlying SSH call returns.
# @raise [ArgumentError] if no block is given.
# @raise [Errno::EHOSTUNREACH] once the retries are exhausted.
def ssh(fqdn, username, options = {}, &block)
  raise ArgumentError, "You MUST provide a block when calling #ssh" if block.nil?

  # Work on a copy: the engine-only keys removed below must not vanish
  # from a hash the caller may reuse for another call.
  options = options.dup
  log = !!options.delete(:log)
  multi = options.delete(:multi)
  concurrent_connections = options.delete(:concurrent_connections) || 10
  local_attempts = [
    options.delete(:max_attempts),
    options.delete(:ssh_max_attempts)
  ].compact.first
  max_attempts = local_attempts || conf[:ssh_max_attempts]

  options[:timeout] ||= 10
  if options.has_key?(:password)
    options[:auth_methods] ||= ['keyboard-interactive']
  else
    options[:keys] ||= [conf[:key]].compact
  end

  logger.info "#{banner}SSHing to #{username}@#{fqdn.inspect}..." if log
  attempts = 0
  begin
    attempts += 1
    if multi
      Net::SSH::Multi.start(
        :concurrent_connections => concurrent_connections
      ) do |s|
        s.via conf[:gateway], conf[:user] unless conf[:gateway].nil?
        fqdn.each {|h| s.use "#{username}@#{h}"}
        block.call(s)
      end
    elsif conf[:gateway]
      gw_handler = Net::SSH::Gateway.new(conf[:gateway], conf[:user], :forward_agent => true)
      begin
        gw_handler.ssh(fqdn, username, options, &block)
      ensure
        # Always release the gateway, including when the connection fails
        # and is about to be retried with a fresh gateway.
        gw_handler.shutdown!
      end
    else
      Net::SSH.start(fqdn, username, options, &block)
    end
  rescue Errno::EHOSTUNREACH => e
    if attempts <= max_attempts
      logger.info "#{banner}No route to host #{fqdn}. Retrying in 5 secs..." if log
      sleep 5
      retry
    else
      logger.info "#{banner}No route to host #{fqdn}. Won't retry." if log
      raise e
    end
  end
end

#storage(name, options = {}, &block) ⇒ Object

Define a storage. A storage is location dependent.



276
277
278
# File 'lib/bfire/engine.rb', line 276

# Registers a general storage definition under the given name.
#
# The block is stored as is; #fetch_storage later calls it with
# (name, location, experiment) to resolve the storage at a location.
#
# @param name [#to_s] storage name.
# @param options [Hash] accepted but not used by this method.
# @return [Proc, nil] the stored block (nil when no block is given).
def storage(name, options = {}, &block)
  @storages[name.to_s] = block
end

#synchronize(&block) ⇒ Object

Synchronization primitive



453
454
455
# File 'lib/bfire/engine.rb', line 453

# Runs the block while holding the engine mutex.
#
# Mutex is not reentrant: calling #synchronize again from inside the block
# (on the same thread) raises ThreadError.
#
# @return [Object] the value of the block.
def synchronize(&block)
  @mutex.synchronize { block.call }
end