Module: Bosh::Director::CloudcheckHelper

Included in:
ProblemHandlers::Base
Defined in:
lib/bosh/director/cloudcheck_helper.rb

Constant Summary collapse

DEFAULT_AGENT_TIMEOUT =

This timeout has been kept fairly short, mainly to avoid long cloudchecks; however, 10 seconds should still be a pretty generous interval for the agent to respond.

10

Instance Method Summary collapse

Instance Method Details

#agent_client(vm, timeout = DEFAULT_AGENT_TIMEOUT, retries = 0) ⇒ Object



33
34
35
36
37
38
39
40
# File 'lib/bosh/director/cloudcheck_helper.rb', line 33

# Returns the agent client for +vm+, building and caching it on first use.
#
# Clients are memoized in @clients keyed by the VM's agent id only, so
# +timeout+ and +retries+ take effect solely on the call that actually
# builds the client; later calls for the same agent id return the cached one.
#
# @param vm [Object] VM record; only +agent_id+ is read here
# @param timeout [Object] value passed through as the :timeout option
# @param retries [Object] value passed through as the :get_state retry setting
# @return [Object] whatever AgentClient.with_defaults returned for this agent id
def agent_client(vm, timeout = DEFAULT_AGENT_TIMEOUT, retries = 0)
  @clients ||= {}
  @clients[vm.agent_id] ||= begin
    client_options = { :timeout => timeout, :retry_methods => { :get_state => retries } }
    AgentClient.with_defaults(vm.agent_id, client_options)
  end
end

#agent_timeout_guard(vm, &block) ⇒ Object



42
43
44
45
46
# File 'lib/bosh/director/cloudcheck_helper.rb', line 42

# Yields the agent client for +vm+ to the given block and converts an
# agent RPC timeout into a problem-handler error.
#
# Only Bosh::Director::RpcTimeout is intercepted; any other exception
# raised while obtaining the client or running the block propagates as is.
#
# @param vm [Object] VM record; +cid+ is read only to build the error message
# @yield [agent] the client returned by agent_client(vm)
# @return [Object] the block's return value
def agent_timeout_guard(vm, &block)
  begin
    client = agent_client(vm)
    yield client
  rescue Bosh::Director::RpcTimeout
    handler_error("VM `#{vm.cid}' is not responding")
  end
end

#cloud ⇒ Object



16
17
18
# File 'lib/bosh/director/cloudcheck_helper.rb', line 16

# Convenience accessor for the cloud object held by the director config.
#
# @return [Object] the value of Bosh::Director::Config.cloud
def cloud
  director_config = Bosh::Director::Config
  director_config.cloud
end

#delete_vm(vm) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/bosh/director/cloudcheck_helper.rb', line 57

# Deletes +vm+ from the cloud and removes its database record.
#
# The agent is first asked for its disk list; if that list is non-empty the
# method bails out through handler_error instead of deleting anything.
# Afterwards the cloud VM is deleted and, inside one DB transaction, the
# owning instance (if any) is detached from the VM and the VM row destroyed.
#
# @param vm [Object] VM record; +cid+, +db+, +instance+ and +destroy+ are used
# @return [Object] whatever the DB transaction call returns
def delete_vm(vm)
  # Paranoia: never blindly delete a VM that still has a persistent disk
  attached_disks = agent_timeout_guard(vm) { |client| client.list_disk }
  handler_error("VM has persistent disk attached") unless attached_disks.size.zero?

  cloud.delete_vm(vm.cid)

  vm.db.transaction do
    owner = vm.instance
    owner.update(:vm => nil) if owner
    vm.destroy
  end
end

#delete_vm_reference(vm, options = {}) ⇒ Object



71
72
73
74
75
76
77
78
79
80
# File 'lib/bosh/director/cloudcheck_helper.rb', line 71

# Removes the database record of +vm+ without touching the cloud.
#
# Unless options[:skip_cid_check] is truthy, a VM that still has a CID is
# rejected through handler_error. Otherwise the owning instance (if any) is
# detached from the VM and the VM row is destroyed in a single transaction.
#
# @param vm [Object] VM record; +cid+, +db+, +instance+ and +destroy+ are used
# @param options [Hash] only the :skip_cid_check key is read
# @return [Object] whatever the DB transaction call returns
def delete_vm_reference(vm, options = {})
  cid_blocks_removal = vm.cid && !options[:skip_cid_check]
  handler_error("VM has a CID") if cid_blocks_removal

  vm.db.transaction do
    owner = vm.instance
    owner.update(:vm => nil) if owner
    vm.destroy
  end
end

#handler_error(message) ⇒ Object



20
21
22
# File 'lib/bosh/director/cloudcheck_helper.rb', line 20

# Aborts the current problem handler by raising a ProblemHandlerError.
#
# @param message [String] text carried by the raised error
# @raise [Bosh::Director::ProblemHandlerError] always
def handler_error(message)
  failure_class = Bosh::Director::ProblemHandlerError
  raise failure_class, message
end

#instance_name(vm) ⇒ Object



24
25
26
27
28
29
30
31
# File 'lib/bosh/director/cloudcheck_helper.rb', line 24

# Builds a human-readable "job/index" label for the instance that owns +vm+.
#
# A nil job or index is replaced by a placeholder; a VM without an instance
# is reported as "Unknown VM".
#
# @param vm [Object] VM record; only +instance+ is read
# @return [String] the label described above
def instance_name(vm)
  owner = vm.instance

  if owner.nil?
    "Unknown VM"
  else
    "#{owner.job || "unknown job"}/#{owner.index || "unknown index"}"
  end
end

#reboot_vm(vm) ⇒ Object



48
49
50
51
52
53
54
55
# File 'lib/bosh/director/cloudcheck_helper.rb', line 48

# Reboots +vm+ through the cloud and waits for its agent to come back.
#
# Only a timeout while obtaining the agent client or waiting for it is
# turned into a problem-handler error; errors from the cloud reboot call
# itself are not intercepted here.
#
# @param vm [Object] VM record; +cid+ is passed to the cloud
# @return [Object] the result of the agent's wait_until_ready call
def reboot_vm(vm)
  cloud.reboot_vm(vm.cid)

  begin
    rebooted_agent = agent_client(vm)
    rebooted_agent.wait_until_ready
  rescue Bosh::Director::RpcTimeout
    handler_error("Agent still unresponsive after reboot")
  end
end

#recreate_vm(vm) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/bosh/director/cloudcheck_helper.rb', line 82

# Replaces +vm+ with a newly created VM and drives the new agent back to the
# previously applied spec.
#
# Steps, in order: obtain spec and env via validate_spec / validate_env
# (defined outside this listing), look up the stemcell, delete the old VM
# from the cloud (tolerating Bosh::Clouds::VMNotFound), remove the old VM
# record, create the new VM through VmCreator, wait for its agent, push the
# trusted certs, re-attach and mount the instance's persistent disk if it has
# one, apply the spec, and start the job when the instance state is "started".
#
# @param vm [Object] VM record to replace (presumably a VM model — confirm
#   against callers)
# @raise [Bosh::Director::ProblemHandlerError] via handler_error when the VM
#   has no deployment
def recreate_vm(vm)
  # Best we can do without any feedback from the agent
  # is to use the spec persisted in the DB at the time
  # of last apply call.
  # This method is somewhat similar in its nature to what
  # InstanceUpdater is doing in case of the stemcell update,
  # however we don't need to handle some advanced scenarios
  # such as disk migration.

  spec = validate_spec(vm)
  env = validate_env(vm)

  # Missing "resource_pool" / "stemcell" keys fall back to empty hashes.
  resource_pool_spec = spec.fetch("resource_pool", {})
  stemcell = find_stemcell(resource_pool_spec.fetch("stemcell", {}))

  deployment = vm.deployment
  handler_error("VM doesn't belong to any deployment") unless deployment

  # Capture the instance and its disk CID before the old VM record is destroyed.
  instance = vm.instance
  disk_cid = instance ? instance.persistent_disk_cid : nil

  # One situation where this handler is actually useful is when
  # VM has already been deleted but something failed after that
  # and it is still referenced in DB. In that case it makes sense
  # to ignore "VM not found" errors in `delete_vm' and let the method
  # proceed creating a new VM. Other errors are not forgiven.
  begin
    cloud.delete_vm(vm.cid)
  rescue Bosh::Clouds::VMNotFound => e
    # NOTE(review): `e` is not used; only a warning is logged.
    @logger.warn("VM '#{vm.cid}' might have already been deleted from the cloud")
  end

  vm.db.transaction do
    instance.update(:vm => nil) if instance
    vm.destroy
  end

  cloud_properties = resource_pool_spec.fetch("cloud_properties", {})
  networks = spec["networks"]
  # Array(disk_cid) yields [] when there is no persistent disk, [disk_cid] otherwise.
  new_vm = VmCreator.create(deployment, stemcell, cloud_properties, networks, Array(disk_cid), env)
  new_vm.apply_spec = spec
  new_vm.save

  if instance
    instance.update(:vm => new_vm)

    # refresh metadata after new instance has been set
    VmMetadataUpdater.build.update(new_vm, {})
  end

  agent_client(new_vm).wait_until_ready

  # Push the configured trusted certs and record their SHA1 on the new VM row.
  agent_client(new_vm).update_settings(Bosh::Director::Config.trusted_certs)
  new_vm.update(:trusted_certs_sha1 => Digest::SHA1.hexdigest(Bosh::Director::Config.trusted_certs))

  # After this point agent is actually responding to
  # pings, so if the rest of this handler fails
  # bcck won't find this type of problem again
  # but regular deployment will fail with "out-of-sync"
  # error (as we now have an instance that points to
  # VM that reports empty state). This problem
  # should be handled by "out-of-sync VM" problem handler.

  if disk_cid
    # N.B. attach_disk might fail if disk image is no longer
    # there or for some other reason. Generally it means
    # the data has been lost (e.g. someone deleted VM from vCenter
    # along with the disk).
    cloud.attach_disk(new_vm.cid, disk_cid)
    agent_client(new_vm).mount_disk(disk_cid)
  end

  agent_client(new_vm).apply(spec)

  # Only restart the job if the instance was meant to be running.
  if instance && instance.state == "started"
    agent_client(new_vm).start
  end
end