Class: Pwrake::Master

Inherits:
Object
  • Object
show all
Defined in:
lib/pwrake/master/master.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Master

Returns a new instance of Master.



5
6
7
8
9
10
11
12
13
14
# File 'lib/pwrake/master/master.rb', line 5

# Builds a master with empty bookkeeping tables and then sets up the logger.
#
# The three collaborator objects are constructed in the same relative order
# as before (Runner, Option, HandlerSet); the plain containers are grouped
# after them. The return value is whatever #init_logger returns.
def initialize
  # Collaborators.
  @runner  = Runner.new
  @option  = Option.new
  @hdl_set = HandlerSet.new

  # Lookup tables filled in later by #setup_branches and
  # #send_task_to_idle_core.
  @hostid_by_taskname = {}
  @channel_by_hostid  = {}
  @hosts              = {}
  @channels           = []

  init_logger
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



18
19
20
# File 'lib/pwrake/master/master.rb', line 18

# Read-only accessor for @logger (assigned by #init_logger).
def logger; @logger; end

#option ⇒ Object (readonly)

Returns the value of attribute option.



17
18
19
# File 'lib/pwrake/master/master.rb', line 17

# Read-only accessor for @option (created in #initialize).
def option; @option; end

#task_queue ⇒ Object (readonly)

Returns the value of attribute task_queue.



16
17
18
# File 'lib/pwrake/master/master.rb', line 16

# Read-only accessor for @task_queue (assigned by #setup_branches).
def task_queue; @task_queue; end

Instance Method Details

#create_fiber(channels, &blk) ⇒ Object



162
163
164
165
166
167
# File 'lib/pwrake/master/master.rb', line 162

# Starts one fiber per channel, each running +blk+ with that channel as its
# argument.
#
# Every fiber is resumed once, immediately after creation, so it runs until
# it first yields or finishes before the next channel is handled.
#
# @param channels [#each] channels to hand to the fibers
# @param blk [Proc] fiber body; receives the channel
# @return the receiver of +each+, i.e. +channels+
def create_fiber(channels, &blk)
  channels.each { |channel| Fiber.new(&blk).resume(channel) }
end

#finish ⇒ Object



335
336
337
338
339
340
341
342
# File 'lib/pwrake/master/master.rb', line 335

# Shuts the master down: waits for the branch-setup thread, tells the
# handler set to exit (unless an "exited" message was already received, see
# @exited), and closes the task logger.
#
# @return the current value of @failed
def finish
  Log.debug("Master#finish begin")
  @branch_setup_thread.join
  unless @exited
    @hdl_set.exit
  end
  TaskWrapper.close_task_logger
  Log.debug("Master#finish end")
  @failed
end

#handle_failed_target(name) ⇒ Object



315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/pwrake/master/master.rb', line 315

# Applies the FAILED_TARGET policy to the output file of a failed task.
#
# Policy (value of @option['FAILED_TARGET']):
# * matches /rename/i, or is nil -> move the file to "<name>._fail_"
# * matches /delete/i            -> remove the file
# * matches /leave/i, or anything else -> do nothing
#
# When the file was renamed or deleted, the action is reported on $stderr
# and through Log.warn.
#
# @param name [String] path of the failed target file
# @return the result of Log.warn when an action was taken, otherwise nil
def handle_failed_target(name)
  notice =
    case @option['FAILED_TARGET']
    when /rename/i, nil
      renamed = name + "._fail_"
      ::FileUtils.mv(name, renamed)
      "Rename failed target file '#{name}' to '#{renamed}'"
    when /delete/i
      ::FileUtils.rm(name)
      "Delete failed target file '#{name}'"
    when /leave/i
      nil # keep the file untouched
    end
  return if notice.nil?

  $stderr.puts(notice)
  Log.warn(notice)
end

#init(hosts = nil) ⇒ Object



40
41
42
43
# File 'lib/pwrake/master/master.rb', line 40

# Initializes the option object and then the task logger with it.
#
# @param hosts accepted for interface compatibility; not used by this method
# @return the result of TaskWrapper.init_task_logger
def init(hosts = nil)
  opts = @option
  opts.init
  TaskWrapper.init_task_logger(opts)
end

#init_logger ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/pwrake/master/master.rb', line 20

# Creates @logger according to the current options.
#
# Destination:
# * LOG_DIR set  -> file LOG_DIR/LOG_FILE (the directory is created)
# * else DEBUG   -> $stderr
# * otherwise    -> File::NULL (output discarded)
#
# Level: Logger::DEBUG when DEBUG is set, Logger::INFO otherwise.
#
# @return [Integer] the level that was assigned
def init_logger
  log_dir = @option['LOG_DIR']
  destination =
    if log_dir
      ::FileUtils.mkdir_p(log_dir)
      File.join(log_dir, @option['LOG_FILE'])
    elsif @option['DEBUG']
      $stderr
    else
      File::NULL
    end

  @logger = Logger.new(destination)
  @logger.level = @option['DEBUG'] ? Logger::DEBUG : Logger::INFO
end

#invoke(t, args) ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/pwrake/master/master.rb', line 169

# Runs the task graph rooted at +t+ and processes results reported by the
# branches until the run is finished.
#
# Outline:
# 1. Reset @failed and let +t+ search its tasks.
# 2. If the GRAPH_PARTITION option is set, run a pre-pass over the tasks
#    yielded by @task_queue.deq_noaction_task and then call
#    MCGP.graph_partition.
# 3. Install the main postprocess pool, wait for branch setup, dispatch the
#    first tasks, and start one fiber per channel that consumes result lines.
# 4. Drive everything with @runner.run and finish the postprocess pool.
#
# Lines understood from a branch (see the +case+ below):
# * "task<status>:<shell_id>:<task_name>" - a task finished with <status>
# * "exited"                              - the branch has exited
# Anything else is logged as an error and echoed to $stderr.
#
# @param t task object; must respond to +pw_search_tasks+ and +name+
# @param args passed through to +t.pw_search_tasks+
def invoke(t, args)
  @failed = false
  t.pw_search_tasks(args)

  if @option['GRAPH_PARTITION']
    # Pre-pass with a postprocess pool whose fibers never stop on their own
    # (setup_postprocess0 passes a block that always returns false).
    setup_postprocess0
    # Tasks yielded here are marked "end" and handed straight to the
    # postprocess pool; nothing is sent to a branch for them.
    @task_queue.deq_noaction_task do |tw,hid|
      tw.preprocess
      tw.status = "end"
      @post_pool.enq(tw)
    end
    @runner.run
    @post_pool.finish
    Log.debug "@post_pool.finish"

    # Loaded lazily: only needed when graph partitioning is requested.
    require 'pwrake/misc/mcgp'
    MCGP.graph_partition(@option.host_map)
  end

  # Replaces @post_pool with the pool used for the main run.
  setup_postprocess1
  # Wait for the thread started in #setup_branches; an exception raised in
  # that thread is re-raised here by Thread#join.
  @branch_setup_thread.join
  send_task_to_idle_core
  #
  # One fiber per channel; each loops over the lines read from its branch.
  create_fiber(@channels) do |chan|
    while s = chan.get_line
      Log.debug "Master:recv #{s.inspect} from branch[#{chan.handler.host}]"
      case s
      when /^task(\w+):(\d*):(.*)$/o
        # An empty shell-id field becomes 0 through String#to_i.
        status, shell_id, task_name = $1, $2.to_i, $3
        tw = Rake.application[task_name].wrapper
        tw.shell_id = shell_id
        tw.status = status
        # Host id recorded when the task was dispatched
        # (see #send_task_to_idle_core).
        hid = @hostid_by_taskname[task_name]
        @task_queue.task_end(tw,hid) # @idle_cores.increase(..
        # check failure
        if tw.status == "fail"
          $stderr.puts %[task "#{tw.name}" failed.]
          # The termination policy is applied only for the first failure.
          if !@failed
            @failed = true
            case @option['FAILURE_TERMINATION']
            when 'kill'
              @hdl_set.kill("INT")
              @no_more_run = true
              $stderr.puts "... Kill running tasks."
            when 'continue'
              # @no_more_run is left untouched, so dispatching continues.
              $stderr.puts "... Continue runable tasks."
            else # 'wait'
              @no_more_run = true
              $stderr.puts "... Wait for running tasks."
            end
          end
          # Apply the FAILED_TARGET policy to an output file left behind.
          if tw.has_output_file? && File.exist?(tw.name)
            handle_failed_target(tw.name)
          end
        end
        # postprocess
        @post_pool.enq(tw) # must be after @no_more_run = true
        # @finished is set by the block given in #setup_postprocess1.
        break if @finished
      when /^exited$/o
        # Remembered so that #finish does not call @hdl_set.exit again.
        @exited = true
        Log.debug "receive #{s.chomp} from branch"
        break
      else
        Log.error "unknown result: #{s.inspect}"
        $stderr.puts(s)
      end
    end
    Log.debug "Master#invoke: fiber end"
  end
  @runner.run
  @post_pool.finish
  Log.debug "Master#invoke: end of task=#{t.name}"
end

#send_task_to_idle_core ⇒ Object



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# File 'lib/pwrake/master/master.rb', line 243

# Dispatches every task currently handed out by @task_queue.deq_task.
#
# For each (task wrapper, host id) pair:
# * the host id is recorded under the task name in @hostid_by_taskname;
# * the task is preprocessed;
# * a task with an action is sent to the host's channel as
#   "<hid>:<task_id>:<name>" and tagged with its execution host;
# * a task without an action is marked "end", reported to the queue as
#   finished, and passed to the postprocess pool.
#
# @raise [RuntimeError] when nothing was dispatched, the queue is not empty
#   and no task is in flight (the run could not make progress)
# @return [nil]
def send_task_to_idle_core
  dispatched = 0
  @task_queue.deq_task do |tw, hid|
    dispatched += 1
    @hostid_by_taskname[tw.name] = hid
    tw.preprocess
    if tw.has_action?
      @channel_by_hostid[hid].put_line("#{hid}:#{tw.task_id}:#{tw.name}")
      tw.exec_host = @hosts[hid]
    else
      tw.status = "end"
      @task_queue.task_end(tw, hid)
      @post_pool.enq(tw)
    end
  end

  stalled = dispatched.zero? && !@task_queue.empty? && @hostid_by_taskname.empty?
  if stalled
    reason = "No task was invoked while unexecuted tasks remain"
    Log.error(reason)
    raise reason
  end
end

#setup_branch_handler(sub_host) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/pwrake/master/master.rb', line 45

# Creates the Handler used to talk to the branch process for +sub_host+.
#
# Two modes:
# * In-process: when +sub_host+ is "localhost" and ENV['T'] does not start
#   with "n" or "f" (case-insensitive; an unset ENV['T'] also selects this
#   mode because +!~+ against nil is true), the branch is run in a Ruby
#   thread via Rake.application.run_branch_in_thread.
# * Remote: otherwise "pwrake_branch" is started on +sub_host+ through ssh,
#   the options are sent to it with Marshal, and the branch must answer with
#   the line "pwrake_branch start".
#
# @param sub_host [String] host on which the branch runs
# @return [Handler] the handler, with its +host+ set to +sub_host+
# @raise [RuntimeError] in remote mode, when the start line is missing or
#   different from "pwrake_branch start"
def setup_branch_handler(sub_host)
  if sub_host == "localhost" && /^(n|f)/i !~ ENV['T']
    hdl = Handler.new(@runner) do |w0,w1,r2|
      # The branch thread receives r2 as its first IO argument and w0 as its
      # second; w1 is not used in this mode.
      @thread = Thread.new(r2,w0,@option) do |r,w,o|
        Rake.application.run_branch_in_thread(r,w,o)
      end
    end
  else
    hdl = Handler.new(@runner) do |w0,w1,r2|
      # Directory of the running program; prepended to PATH on the remote
      # side so that "pwrake_branch" can be found there.
      dir = File.absolute_path(File.dirname($PROGRAM_NAME))
      #args = Shellwords.shelljoin(@args)
      # NOTE(review): sub_host, Dir.pwd and dir are interpolated into the
      # shell string without escaping; a path containing a quote character
      # would break the command - confirm whether this needs hardening.
      cmd = "ssh -x -T -q #{sub_host} '" +
        "cd \"#{Dir.pwd}\";"+
        "PATH=#{dir}:${PATH} exec pwrake_branch'"
      Log.debug("BranchCommunicator cmd=#{cmd}")
      #$stderr.puts "BranchCommunicator cmd=#{cmd}"
      # The child gets w0 as stdout, w1 as stderr and r2 as stdin, and runs
      # in its own process group.
      spawn(cmd,:pgroup=>true,:out=>w0,:err=>w1,:in=>r2)
      # Close this process's copies of the ends handed to the child.
      w0.close
      w1.close
      r2.close
    end
    # Handshake: send the options, then wait for the start line.
    Marshal.dump(@option,hdl.iow)
    hdl.iow.flush
    s = hdl.ior.gets
    if !s or s.chomp != "pwrake_branch start"
      raise RuntimeError,"pwrake_branch start failed: receive #{s.inspect}"
    end
  end
  hdl.host = sub_host
  return hdl
end

#setup_branches ⇒ Object



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/pwrake/master/master.rb', line 101

# Connects to every branch listed in @option.host_map, exchanges the host
# list and core counts with it, creates the task queue, and starts a
# background thread that waits for all branches to finish their setup.
#
# Protocol per branch (as written below):
#   master -> branch: "host_list_begin"
#   master -> branch: "host:<id> <name> <ncore>"   (one per worker host)
#   master -> branch: "host_list_end"
#   branch -> master: "ncore:<id>:<ncore>"         (zero or more)
#   branch -> master: "ncore:done"
# Later, read by the background thread:
#   branch -> master: "branch_setup:done"
#
# @raise [RuntimeError] on "exited" or any unrecognised line while reading
#   core counts
def setup_branches
  sum_ncore = 0

  @option.host_map.each do |sub_host, wk_hosts|
    # Register the handler and its channel, keeping local references.
    @hdl_set << hdl = setup_branch_handler(sub_host)
    @channels << chan = Channel.new(hdl)
    chan.puts "host_list_begin"
    wk_hosts.each do |host_info|
      name = host_info.name
      ncore = host_info.ncore
      host_id = host_info.id
      Log.debug "connecting #{name} ncore=#{ncore} id=#{host_id}"
      chan.puts "host:#{host_id} #{name} #{ncore}"
      # Remember which channel serves this host id and the host's name.
      @channel_by_hostid[host_id] = chan
      @hosts[host_id] = name
    end
    chan.puts "host_list_end"

    # Read the core counts reported by the branch.
    # NOTE(review): if chan.gets returns nil before "ncore:done" arrives, the
    # loop ends without raising - confirm whether that should be an error.
    while s = chan.gets
      case s
      when /^ncore:done$/
        break
      when /^ncore:(\d+):(\d+)$/
        id, ncore = $1.to_i, $2.to_i
        Log.debug "worker_id=#{id} ncore=#{ncore}"
        @option.host_map.by_id[id].set_ncore(ncore)
        sum_ncore += ncore
      when /^exited$/
        raise RuntimeError,"Unexpected branch exit"
      else
        msg = "#{hdl.host}:#{s.inspect}"
        raise RuntimeError,"invalid return: #{msg}"
      end
    end
  end

  Log.info "num_cores=#{sum_ncore}"
  @hosts.each do |id,host|
    Log.info "#{host} id=#{id} ncore=#{
      @option.host_map.by_id[id].idle_cores}"
  end
  # The queue class is looked up by name inside the Pwrake namespace.
  queue_class = Pwrake.const_get(@option.queue_class)
  @task_queue = queue_class.new(@option.host_map)

  # Waits for "branch_setup:done" from every channel, then installs the
  # TERM/INT handlers. #invoke and #finish join this thread; an exception
  # raised here is re-raised by Thread#join.
  @branch_setup_thread = Thread.new do
    @channels.each do |chan|
      s = chan.gets
      if /^branch_setup:done$/ !~ s
        raise RuntimeError,"branch_setup failed"
      end
    end
    # Counter of signals handled so far; read and incremented by #signal_trap.
    @killed = 0
    [:TERM,:INT].each do |sig|
      Signal.trap(sig) do
        signal_trap(sig)
      end
    end
  end

end

#setup_postprocess ⇒ Object



269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/pwrake/master/master.rb', line 269

# Creates the postprocess fiber pool and stores it in @post_pool.
#
# The pool is built with room for @option.max_postprocess_pool fibers and a
# factory block. Each call of the factory creates a postprocess object via
# @option.postprocess(@runner) and returns a Fiber that:
# 1. takes task wrappers from the pool with +pool.deq+ until it returns a
#    false value,
# 2. runs the postprocess on each one and passes the result to
#    +tw.postprocess+,
# 3. counts the pool down and removes the task from @hostid_by_taskname,
# 4. yields +(pool, fiber_id)+ to the block given to this method and stops
#    when that block returns a true value.
# When the loop ends the postprocess object is closed.
#
# The fiber id is fixed at the moment the fiber is created. Previously it
# was read from the shared counter when the fiber first ran, so fibers
# created before an earlier one was resumed all ended up with the same id.
#
# @yieldparam pool the fiber pool
# @yieldparam fiber_id [Integer] 1-based creation index of the fiber
# @yieldreturn true-ish to stop the calling fiber
# @return the new pool (also stored in @post_pool)
def setup_postprocess
  created = 0
  capacity = @option.max_postprocess_pool
  @post_pool = FiberPool.new(capacity) do |pool|
    postproc = @option.postprocess(@runner)
    created += 1
    # Capture the id now; `created` keeps changing as more fibers are made.
    fiber_id = created
    Log.debug "New postprocess fiber ##{fiber_id}"
    Fiber.new do
      while tw = pool.deq()
        Log.debug "postproc##{fiber_id} deq=#{tw.name}"
        loc = postproc.run(tw)
        tw.postprocess(loc)
        pool.count_down
        # The task is no longer in flight.
        @hostid_by_taskname.delete(tw.name)
        break if yield(pool, fiber_id)
      end
      postproc.close
      Log.debug "postproc##{fiber_id} end"
    end
  end
end

#setup_postprocess0 ⇒ Object



292
293
294
# File 'lib/pwrake/master/master.rb', line 292

# Installs a postprocess pool whose fibers are never stopped by the
# per-task callback (it always answers false).
def setup_postprocess0
  setup_postprocess { |_pool, _fiber_id| false }
end

#setup_postprocess1 ⇒ Object



296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/pwrake/master/master.rb', line 296

# Installs the postprocess pool used for the main run.
#
# After each postprocessed task the callback decides what happens next:
# * nothing left to run (dispatching stopped or queue empty) and no task in
#   flight -> mark the run finished, finish every channel, answer true so
#   the calling fiber stops;
# * otherwise, while dispatching is still allowed -> send more tasks and
#   answer false;
# * otherwise (dispatching stopped, tasks still in flight) -> answer nil.
def setup_postprocess1
  setup_postprocess do |pool, j|
    nothing_left = (@no_more_run || @task_queue.empty?) &&
                   @hostid_by_taskname.empty?
    if nothing_left
      Log.debug "postproc##{j} closing @channels=#{@channels.inspect}"
      @finished = true
      @channels.each { |channel| channel.finish } # exit
      true
    elsif !@no_more_run
      send_task_to_idle_core
      false
    end
  end
end

#signal_trap(sig) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/pwrake/master/master.rb', line 77

# Handles a trapped signal; the reaction depends on how many signals have
# been handled before (@killed):
# * first signal  - announce it on $stderr, stop dispatching new tasks, mark
#   the run as failed and forward the signal to the handler set;
# * second signal - only print a hint;
# * later signals - exit the process with a failure status.
# @killed is incremented afterwards (not reached when the process exits).
#
# @param sig signal name as passed to Signal.trap
# @return [Integer] the new value of @killed
def signal_trap(sig)
  case @killed
  when 0
    # Written to $stderr directly: log writing fails in trap context.
    verbose = Rake.application.options.debug
    thread_info = verbose ? " thread=#{Thread.current}" : ""
    $stderr.puts "\nSignal trapped. (sig=#{sig} pid=#{Process.pid}#{thread_info} ##{@killed})"
    $stderr.puts caller if verbose
    $stderr.puts "Exiting..."
    @no_more_run = true
    @failed = true
    @hdl_set.kill(sig)
  when 1
    $stderr.puts "\nOnce more Ctrl-C (SIGINT) for exit."
  else
    Kernel.exit(false) # must wait for normal exit
  end
  @killed += 1
end