Class: TestQueue::Runner

Inherits:
Object
  • Object
show all
Defined in:
lib/test_queue/runner.rb,
lib/test_queue/runner/rspec.rb,
lib/test_queue/runner/sample.rb,
lib/test_queue/runner/cucumber.rb,
lib/test_queue/runner/minitest.rb,
lib/test_queue/runner/minitest5.rb,
lib/test_queue/runner/puppet_lint.rb,
lib/test_queue/runner/minitest4.rb,
lib/test_queue/runner/testunit.rb

Direct Known Subclasses

Cucumber, MiniTest, PuppetLint, RSpec, Sample, TestUnit

Defined Under Namespace

Classes: Cucumber, MiniTest, PuppetLint, RSpec, Sample, TestUnit

Constant Summary collapse

TOKEN_REGEX =
/^TOKEN=(\w+)/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(test_framework, concurrency = nil, socket = nil, relay = nil) ⇒ Runner

Returns a new instance of Runner.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/test_queue/runner.rb', line 36

# Build a runner and resolve all of its settings from arguments and
# environment variables.
#
# test_framework - adapter object; `all_suite_files` is called on it here.
# concurrency    - Integer worker count. Falls back to TEST_QUEUE_WORKERS,
#                  then to a detected processor count, then to 2.
# socket         - master socket address. Falls back to TEST_QUEUE_SOCKET,
#                  then to a per-process path under /tmp.
# relay          - relay address. Falls back to TEST_QUEUE_RELAY.
#
# Raises ArgumentError if TEST_QUEUE_EARLY_FAILURE_LIMIT cannot be parsed as
# an integer, or if the resolved worker count is not greater than 0.
def initialize(test_framework, concurrency=nil, socket=nil, relay=nil)
  @test_framework = test_framework
  @stats = Stats.new(stats_file)

  @early_failure_limit = nil
  if ENV['TEST_QUEUE_EARLY_FAILURE_LIMIT']
    begin
      @early_failure_limit = Integer(ENV['TEST_QUEUE_EARLY_FAILURE_LIMIT'])
    rescue ArgumentError
      raise ArgumentError, 'TEST_QUEUE_EARLY_FAILURE_LIMIT could not be parsed as an integer'
    end
  end

  # Remember the original process title; start_master appends it to $0.
  @procline = $0

  # TEST_QUEUE_FORCE is a comma-separated list of suite names; when present
  # only those suites are queued, in the order given.
  @allowlist = if forced = ENV['TEST_QUEUE_FORCE']
                 forced.split(/\s*,\s*/)
               else
                 []
               end
  @allowlist.freeze

  # Seed the queue from recorded stats: keep only suites whose file still
  # exists, longest recorded duration first.
  all_files = @test_framework.all_suite_files.to_set
  @queue = @stats.all_suites
    .select { |suite| all_files.include?(suite.path) }
    .sort_by { |suite| -suite.duration }
    .map { |suite| [suite.name, suite.path] }

  if @allowlist.any?
    @queue.select! { |suite_name, path| @allowlist.include?(suite_name) }
    @queue.sort_by! { |suite_name, path| @allowlist.index(suite_name) }
  end

  # Every allowlisted name starts out awaited; enqueue_discovered_suite
  # removes names as discovery reports them.
  @awaited_suites = Set.new(@allowlist)
  @original_queue = Set.new(@queue).freeze

  @workers = {}
  @completed = []

  @concurrency =
    concurrency ||
    (ENV['TEST_QUEUE_WORKERS'] && ENV['TEST_QUEUE_WORKERS'].to_i) ||
    if File.exist?('/proc/cpuinfo')
      File.read('/proc/cpuinfo').split("\n").grep(/processor/).size
    elsif RUBY_PLATFORM =~ /darwin/
      `/usr/sbin/sysctl -n hw.activecpu`.to_i
    else
      2
    end
  unless @concurrency > 0
    raise ArgumentError, "Worker count (#{@concurrency}) must be greater than 0"
  end

  # Seconds connect_to_relay keeps retrying a refused connection.
  @relay_connection_timeout =
    (ENV['TEST_QUEUE_RELAY_TIMEOUT'] && ENV['TEST_QUEUE_RELAY_TIMEOUT'].to_i) ||
    30

  # Token sent with every message so the master can reject other runs.
  @run_token = ENV['TEST_QUEUE_RELAY_TOKEN'] || SecureRandom.hex(8)

  @socket =
    socket ||
    ENV['TEST_QUEUE_SOCKET'] ||
    "/tmp/test_queue_#{$$}_#{object_id}.sock"

  @relay =
    relay ||
    ENV['TEST_QUEUE_RELAY']

  @remote_master_message = if ENV.has_key?("TEST_QUEUE_REMOTE_MASTER_MESSAGE")
                             ENV["TEST_QUEUE_REMOTE_MASTER_MESSAGE"]
                           elsif ENV.has_key?("TEST_QUEUE_SLAVE_MESSAGE")
                             warn("`TEST_QUEUE_SLAVE_MESSAGE` is deprecated. Use `TEST_QUEUE_REMOTE_MASTER_MESSAGE` instead.")
                             ENV["TEST_QUEUE_SLAVE_MESSAGE"]
                           end

  if @relay == @socket
    STDERR.puts "*** Detected TEST_QUEUE_RELAY == TEST_QUEUE_SOCKET. Disabling relay mode."
    @relay = nil
  elsif @relay
    # In relay mode the local queue is emptied; workers are pointed at
    # @relay instead of @socket (see spawn_workers).
    @queue = []
  end

  @discovered_suites = Set.new
  # Maps [suite_name, path] to the [hostname, pid] the suite was handed to.
  @assignments = {}

  @exit_when_done = true

  @aborting = false
end

Instance Attribute Details

#concurrency ⇒ Object

Returns the value of attribute concurrency.



31
32
33
# File 'lib/test_queue/runner.rb', line 31

# Returns the worker count resolved in #initialize (the number of workers
# spawn_workers forks).
def concurrency
  @concurrency
end

#exit_when_done ⇒ Object

Returns the value of attribute exit_when_done.



31
32
33
# File 'lib/test_queue/runner.rb', line 31

# Returns whether #execute should call exit! with the final status instead
# of returning it. Set to true in #initialize.
def exit_when_done
  @exit_when_done
end

#stats ⇒ Object (readonly)

Returns the value of attribute stats.



32
33
34
# File 'lib/test_queue/runner.rb', line 32

# Returns the Stats object created in #initialize from #stats_file.
def stats
  @stats
end

Instance Method Details

#abort(message) ⇒ Object

Stop the test run immediately.

message - String message to print to the console when exiting.

Doesn’t return.



612
613
614
615
616
# File 'lib/test_queue/runner.rb', line 612

# Stop the test run immediately.
#
# message - String appended to "Aborting: " and printed on exit.
#
# Marks the runner as aborting (so worker_completed ignores workers reaped
# from here on), kills every subprocess, then exits through Kernel.abort.
# Does not return.
def abort(message)
  @aborting = true
  kill_subprocesses
  reason = "Aborting: #{message}"
  Kernel.abort(reason)
end

#after_fork(num) ⇒ Object

Prepare a worker for executing jobs after a fork.



416
417
# File 'lib/test_queue/runner.rb', line 416

# Hook for subclasses: called at the end of after_fork_internal, inside the
# freshly forked worker.
#
# num - Integer worker number (1-based, as assigned by spawn_workers).
#
# The default implementation does nothing.
def after_fork(num)
end

#after_fork_internal(num, iterator) ⇒ Object



388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
# File 'lib/test_queue/runner.rb', line 388

# Set up a freshly forked worker before it starts running jobs.
#
# num      - Integer worker number, used in the process title.
# iterator - object whose `sock` is printed in the startup banner.
#
# Reseeds the random number generator, redirects stdout and stderr to
# /tmp/test_queue_worker_<pid>_output (read back later by
# collect_worker_data), renames the process, prints a banner and finally
# calls the after_fork hook.
def after_fork_internal(num, iterator)
  # Reseed so each worker gets its own random sequence after the fork.
  srand

  # Reopen by path rather than through a separate File object: the extra
  # handle would otherwise stay open for the lifetime of the worker.
  $stdout.reopen("/tmp/test_queue_worker_#{$$}_output", 'w')
  $stderr.reopen($stdout)
  $stdout.sync = $stderr.sync = true

  $0 = "test-queue worker [#{num}]"
  puts
  puts "==> Starting #$0 (#{Process.pid} on #{Socket.gethostname}) - iterating over #{iterator.sock}"
  puts

  after_fork(num)
end

#around_filter(suite) ⇒ Object



411
412
413
# File 'lib/test_queue/runner.rb', line 411

# Hook for subclasses: spawn_workers hands this method to the Iterator as
# a callable. The default simply yields to the given block.
#
# suite - value supplied by the caller; presumably the suite about to run
#         (TODO confirm against Iterator, which is not shown here).
def around_filter(suite)
  yield
end

#awaiting_suites? ⇒ Boolean

Returns:

  • (Boolean)


346
347
348
349
350
351
352
353
354
355
356
357
358
359
# File 'lib/test_queue/runner.rb', line 346

# Should the master hold back queued suites for now?
#
# Returns true while allowlisted suites are still undiscovered, or while
# the queue is empty and the discovery process is still running; false
# otherwise.
def awaiting_suites?
  # All allowlisted suites must be found first so they run in order.
  return true if @awaited_suites.any?

  # Nothing is queued yet, but discovery is still working on it.
  return true if @queue.empty? && @discovering_suites_pid

  # Whatever is queued can be handed out now.
  false
end

#cleanup_worker ⇒ Object



431
432
# File 'lib/test_queue/runner.rb', line 431

# Hook for subclasses: called inside each worker after run_worker returns
# and before the worker process exits (see spawn_workers).
# The default implementation does nothing.
def cleanup_worker
end

#collect_worker_data(worker) ⇒ Object



456
457
458
459
460
461
462
463
464
465
466
# File 'lib/test_queue/runner.rb', line 456

# Pull the files a finished worker left in /tmp back into its record.
#
# worker - object exposing `pid`, `output=` and `suites`.
#
# The captured output file is read into worker.output and the marshalled
# suites file replaces the contents of worker.suites. Each file is removed
# once read; a missing file is skipped.
#
# NOTE(review): the suites file is Marshal-loaded from a predictable path
# under /tmp; confirm that directory is trusted where this runs.
def collect_worker_data(worker)
  output_path = "/tmp/test_queue_worker_#{worker.pid}_output"
  if File.exist?(output_path)
    worker.output = IO.binread(output_path)
    FileUtils.rm(output_path)
  end

  suites_path = "/tmp/test_queue_worker_#{worker.pid}_suites"
  if File.exist?(suites_path)
    reported_suites = Marshal.load(IO.binread(suites_path))
    worker.suites.replace(reported_suites)
    FileUtils.rm(suites_path)
  end
end

#connect_to_relay ⇒ Object



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
# File 'lib/test_queue/runner.rb', line 551

# Open a TCP connection to the relay address ("host:port" in @relay).
#
# A refused connection is retried every half second; once more than
# @relay_connection_timeout seconds have passed since the first attempt,
# the Errno::ECONNREFUSED is re-raised. Other errors are not rescued.
#
# Returns the connected TCPSocket.
def connect_to_relay
  started_at = Time.now
  puts "Attempting to connect for #{@relay_connection_timeout}s..."

  begin
    TCPSocket.new(*@relay.split(':'))
  rescue Errno::ECONNREFUSED => refused
    raise refused if Time.now - started_at > @relay_connection_timeout
    puts "Master not yet available, sleeping..."
    sleep 0.5
    retry
  end
end

#discover_suites ⇒ Object



316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# File 'lib/test_queue/runner.rb', line 316

# Fork a helper process that walks every suite file and reports each suite
# it finds to the master as a "NEW SUITE" message.
#
# Does nothing in relay mode, or when an allowlist is set and none of its
# suites are still awaited. Otherwise the child's pid is stored in
# @discovering_suites_pid.
#
# NOTE(review): Marshal output is binary and can contain "\n" bytes, while
# the master reads this command with `gets`; confirm that every suite
# name/path survives the round trip.
def discover_suites
  # Remote masters are fed suites by the central master instead.
  return if relay?

  # Every allowlisted suite is already queued, so there is nothing to find.
  return if @allowlist.any? && @awaited_suites.empty?

  @discovering_suites_pid = fork do
    # INT only raises a flag; the child exits at the next suite boundary.
    interrupted = false
    Signal.trap("INT") { interrupted = true }

    $0 = "test-queue suite discovery process"

    @test_framework.all_suite_files.each do |path|
      suites_in_file = @test_framework.suites_from_file(path)
      suites_in_file.each do |suite_name, _suite|
        Kernel.exit!(0) if interrupted

        @server.connect_address.connect do |sock|
          sock.puts("TOKEN=#{@run_token}")
          sock.puts("NEW SUITE #{Marshal.dump([suite_name, path])}")
        end
      end
    end

    Kernel.exit! 0
  end
end

#distribute_queue ⇒ Object



474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
# File 'lib/test_queue/runner.rb', line 474

# Serve the suite queue to workers until everything has been handed out.
#
# Runs only on the central master (returns immediately in relay mode).
# Loops until no suites are awaited, the queue is empty and no remote
# workers remain. Each pass reports queue_status, checks on the discovery
# process, then either reaps finished local workers or serves exactly one
# incoming connection.
#
# Every connection sends two lines, "TOKEN=<run token>" and a command:
#   POP <hostname> <pid>           - request the next suite
#   REMOTE MASTER <n> <host> [msg] - a remote master registering n workers
#   WORKER <bytes>                 - a marshalled, finished remote worker
#   NEW SUITE <marshal>            - a suite found by the discovery process
#   KABOOM                         - too many failures; stop distributing
#
# Aborts the run if suite discovery fails or exits without finding every
# allowlisted suite. The master is always stopped and the workers reaped
# on the way out.
#
# NOTE(review): WORKER and NEW SUITE payloads are passed to Marshal.load
# straight off the socket; the run token is the only gate, so the listening
# address must not be reachable by untrusted peers.
def distribute_queue
  return if relay?
  remote_workers = 0

  until !awaiting_suites? && @queue.empty? && remote_workers == 0
    queue_status(@start_time, @queue.size, @workers.size, remote_workers)

    if status = reap_suite_discovery_process(false)
      abort("Discovering suites failed.") unless status.success?
      abort("Failed to discover #{@awaited_suites.sort.join(", ")} specified in TEST_QUEUE_FORCE") if @awaited_suites.any?
    end

    if IO.select([@server], nil, nil, 0.1).nil?
      reap_workers(false) # check for worker deaths
      next
    end

    sock = @server.accept
    begin
      token = sock.gets.strip
      cmd = sock.gets.strip

      token = token[TOKEN_REGEX, 1]
      # If we have a remote master from a different test run, respond with "WRONG RUN", and it will consider the test run done.
      if token != @run_token
        message = token.nil? ? "Worker sent no token to master" : "Worker from run #{token} connected to master"
        STDERR.puts "*** #{message} for run #{@run_token}; ignoring."
        sock.write("WRONG RUN\n")
        next
      end

      case cmd
      when /^POP (\S+) (\d+)/
        hostname = $1
        pid = Integer($2)
        if awaiting_suites?
          sock.write(Marshal.dump("WAIT"))
        elsif obj = @queue.shift
          data = Marshal.dump(obj)
          sock.write(data)
          # Remember who got the suite so summarize_internal can verify it.
          @assignments[obj] = [hostname, pid]
        end
      when /^REMOTE MASTER (\d+) ([\w\.-]+)(?: (.+))?/
        num = $1.to_i
        remote_master = $2
        remote_master_message = $3

        sock.write("OK\n")
        remote_workers += num

        message = "*** #{num} workers connected from #{remote_master} after #{Time.now-@start_time}s"
        message << " " + remote_master_message if remote_master_message
        STDERR.puts message
      when /^WORKER (\d+)/
        data = sock.read($1.to_i)
        worker = Marshal.load(data)
        worker_completed(worker)
        remote_workers -= 1
      when /^NEW SUITE (.+)/
        suite_name, path = Marshal.load($1)
        enqueue_discovered_suite(suite_name, path)
      when /^KABOOM/
        # worker reporting an abnormal number of test failures;
        # stop everything immediately and report the results.
        break
      else
        STDERR.puts("Ignoring unrecognized command: \"#{cmd}\"")
      end
    ensure
      # Close on every path, including the `next` for a rejected token, the
      # KABOOM `break` and any exception raised while handling the command.
      sock.close
    end
  end
ensure
  stop_master
  reap_workers
end

#enqueue_discovered_suite(suite_name, path) ⇒ Object



361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# File 'lib/test_queue/runner.rb', line 361

# Register a suite reported by the discovery process.
#
# suite_name - name of the discovered suite.
# path       - file the suite was found in.
#
# Suites outside a non-empty allowlist are ignored. A suite that was
# already in the original queue is only ticked off the awaited set. Any
# other suite goes to the front of the queue; when that completes the
# allowlist, the queue is re-sorted into allowlist order and the discovery
# process is interrupted.
def enqueue_discovered_suite(suite_name, path)
  return if @allowlist.any? && !@allowlist.include?(suite_name)

  @discovered_suites << [suite_name, path]

  if @original_queue.include?([suite_name, path])
    # Already queued some other way; just stop waiting for it.
    @awaited_suites.delete(suite_name)
    return
  end

  # A new suite's duration is unknown, so it goes first: running a fast
  # suite early is better than running a slow suite late.
  @queue.unshift([suite_name, path])

  found_last_awaited = @awaited_suites.delete?(suite_name) && @awaited_suites.empty?
  return unless found_last_awaited

  # Every allowlisted suite is known now; put the queue in allowlist order.
  @queue.sort_by! { |queued_name, _queued_path| @allowlist.index(queued_name) }

  kill_suite_discovery_process("INT")
end

#execute ⇒ Object

Run the tests.

If exit_when_done is true, exit! will be called before this method completes. If exit_when_done is false, this method will return an Integer number of failures.



131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/test_queue/runner.rb', line 131

# Run the tests.
#
# Makes stdout and stderr unbuffered, records the start time, runs the
# whole master lifecycle and prints the summary.
#
# If exit_when_done is true the process exits (via exit!) with the summary
# status and this method never returns; otherwise the Integer status is
# returned.
def execute
  $stdout.sync = $stderr.sync = true
  @start_time = Time.now

  execute_internal
  final_status = summarize_internal

  exit! final_status if exit_when_done
  final_status
end

#execute_internal ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/test_queue/runner.rb', line 238

# Run the master lifecycle in order: open the master socket, call the
# prepare hook, register with the central master when relaying, start suite
# discovery, fork the workers, then serve the queue.
#
# The ensure clause always stops the master and kills any subprocess still
# alive, including when one of the steps raises.
def execute_internal
  start_master
  prepare(@concurrency)
  # Timestamp taken right after the prepare hook returns.
  @prepared_time = Time.now
  start_relay if relay?
  discover_suites
  spawn_workers
  distribute_queue
ensure
  stop_master

  kill_subprocesses
end

#kill_subprocesses ⇒ Object



579
580
581
582
# File 'lib/test_queue/runner.rb', line 579

# Kill and reap every child of this runner: first the workers, then the
# suite discovery process (a no-op when none is running).
def kill_subprocesses
  kill_workers
  kill_suite_discovery_process
end

#kill_suite_discovery_process(signal = "KILL") ⇒ Object



592
593
594
595
596
# File 'lib/test_queue/runner.rb', line 592

# Signal the suite discovery process and wait for it to exit.
#
# signal - signal name to send (default "KILL").
#
# Returns nil when no discovery process is running, otherwise whatever
# reap_suite_discovery_process returns.
def kill_suite_discovery_process(signal="KILL")
  if @discovering_suites_pid
    Process.kill(signal, @discovering_suites_pid)
    reap_suite_discovery_process
  end
end

#kill_workers ⇒ Object



584
585
586
587
588
589
590
# File 'lib/test_queue/runner.rb', line 584

# Send KILL to every worker still tracked in @workers, then reap them all
# (blocking).
#
# Returns whatever reap_workers returns.
def kill_workers
  @workers.each_key { |worker_pid| Process.kill('KILL', worker_pid) }
  reap_workers
end

#prepare(concurrency) ⇒ Object

Run in the master before the fork. Used to create concurrency copies of any databases required by the test workers.



408
409
# File 'lib/test_queue/runner.rb', line 408

# Hook for subclasses: called in the master by execute_internal, after the
# master socket is opened and before any worker is forked.
#
# concurrency - Integer number of workers that will be spawned.
#
# The default implementation does nothing.
def prepare(concurrency)
end

#queue_status(start_time, queue_size, local_worker_count, remote_worker_count) ⇒ Object

Subclasses can override to monitor the status of the queue.

For example, you may want to record metrics about how quickly remote workers connect, or abort the build if not enough connect.

This method is called very frequently during the test run, so don’t do anything expensive/blocking.

This method is not called on remote masters when using remote workers, only on the central master.

start_time - Time when the test run began
queue_size - Integer number of suites left in the queue
local_worker_count - Integer number of active local workers
remote_worker_count - Integer number of active remote workers

Returns nothing.



635
636
# File 'lib/test_queue/runner.rb', line 635

# Hook for subclasses: called at the top of every distribute_queue loop
# pass, so keep overrides cheap and non-blocking.
#
# start_time          - Time the run began.
# queue_size          - Integer number of suites left in the queue.
# local_worker_count  - Integer number of local workers still tracked.
# remote_worker_count - Integer number of remote workers still registered.
#
# The default implementation does nothing.
def queue_status(start_time, queue_size, local_worker_count, remote_worker_count)
end

#reap_suite_discovery_process(blocking = true) ⇒ Object



598
599
600
601
602
603
604
605
# File 'lib/test_queue/runner.rb', line 598

# Collect the exit status of the suite discovery process.
#
# blocking - when true (default) wait for the process to exit; when false
#            only check whether it already has.
#
# Returns the Process::Status once the process has been reaped (and clears
# @discovering_suites_pid), or nil when there is no discovery process or
# it is still running.
def reap_suite_discovery_process(blocking=true)
  discovery_pid = @discovering_suites_pid
  return unless discovery_pid

  wait_flags = blocking ? 0 : Process::WNOHANG
  # waitpid2 yields nil under WNOHANG while the child is still alive.
  status = Array(Process.waitpid2(discovery_pid, wait_flags)).last
  return unless status

  @discovering_suites_pid = nil
  status
end

#reap_workers(blocking = true) ⇒ Object



439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# File 'lib/test_queue/runner.rb', line 439

# Reap finished local workers and record their results.
#
# blocking - when true (default) wait for each worker in turn; when false
#            only pick up workers that have already exited.
#
# For every reaped worker: store its exit status and end time, collect its
# output files, forward it to the central master when relaying, and mark
# it completed. Reaped workers are removed from @workers.
#
# Returns @workers.
def reap_workers(blocking=true)
  wait_flags = blocking ? 0 : Process::WNOHANG

  @workers.delete_if do |_pid, worker|
    if Process.waitpid(worker.pid, wait_flags)
      worker.status = $?
      worker.end_time = Time.now

      collect_worker_data(worker)
      relay_to_master(worker) if relay?
      worker_completed(worker)

      true
    else
      # Still running (only possible when non-blocking): keep tracking it.
      false
    end
  end
end

#relay? ⇒ Boolean

Returns:

  • (Boolean)


547
548
549
# File 'lib/test_queue/runner.rb', line 547

# Is this runner relaying to another master?
#
# Returns true when a relay address is set in @relay, false otherwise.
def relay?
  @relay ? true : false
end

#relay_to_master(worker) ⇒ Object



567
568
569
570
571
572
573
574
575
576
577
# File 'lib/test_queue/runner.rb', line 567

# Ship a finished worker's record to the central master.
#
# worker - worker record; its host is set to this machine's hostname
#          before it is marshalled.
#
# Sends the run token line, a "WORKER <bytesize>" line and then the
# marshalled worker. The connection is closed afterwards, including when
# sending fails.
def relay_to_master(worker)
  worker.host = Socket.gethostname
  payload = Marshal.dump(worker)

  relay_sock = connect_to_relay
  relay_sock.puts("TOKEN=#{@run_token}")
  relay_sock.puts("WORKER #{payload.bytesize}")
  relay_sock.write(payload)
ensure
  relay_sock.close if relay_sock
end

#run_worker(iterator) ⇒ Object

Entry point for internal runner implementations. The iterator will yield jobs from the shared queue on the master.

Returns an Integer number of failures.



423
424
425
426
427
428
429
# File 'lib/test_queue/runner.rb', line 423

# Entry point for runner implementations; subclasses override this to run
# each job the iterator yields from the master's shared queue.
#
# iterator - enumerable of jobs.
#
# This default only prints each item's inspect output.
#
# Returns 0, which spawn_workers uses as the worker's exit status.
def run_worker(iterator)
  iterator.each { |job| puts "  #{job.inspect}" }

  0 # exit status
end

#spawn_workers ⇒ Object



298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# File 'lib/test_queue/runner.rb', line 298

# Fork @concurrency worker processes, numbered from 1.
#
# Each child closes its copy of the master socket, builds an Iterator
# pointed at the relay address (in relay mode) or the master socket, runs
# the after-fork setup, the worker loop and the cleanup hook, and exits
# with the worker loop's result (0 when it returns nil/false).
#
# The parent records every child in @workers, keyed by pid.
def spawn_workers
  @concurrency.times do |index|
    worker_num = index + 1

    child_pid = fork do
      @server.close if @server

      endpoint = relay? ? @relay : @socket
      iterator = Iterator.new(
        @test_framework,
        endpoint,
        method(:around_filter),
        early_failure_limit: @early_failure_limit,
        run_token: @run_token
      )
      after_fork_internal(worker_num, iterator)
      exit_code = run_worker(iterator) || 0
      cleanup_worker
      Kernel.exit!(exit_code)
    end

    @workers[child_pid] = Worker.new(child_pid, worker_num)
  end
end

#start_master ⇒ Object



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/test_queue/runner.rb', line 252

# Open the master's listening socket and set the process title.
#
# Outside relay mode, @socket is either "PORT" / "HOST:PORT" (a TCP server,
# bound to 0.0.0.0 when no host is given) or a filesystem path (a UNIX
# server; a stale socket file at that path is removed first). In relay mode
# no server is opened.
#
# For TCP, @socket is rewritten to the explicit "address:port" actually
# bound, so the value handed to workers always names a host. Previously a
# bare port became ":PORT" because the default address was not applied.
def start_master
  unless relay?
    if @socket =~ /^(?:(.+):)?(\d+)$/
      address = $1 || '0.0.0.0'
      port = $2.to_i
      @socket = "#{address}:#{port}"
      @server = TCPServer.new(address, port)
    else
      FileUtils.rm(@socket) if File.exist?(@socket)
      @server = UNIXServer.new(@socket)
    end
  end

  desc = "test-queue master (#{relay? ? "relaying to #{@relay}" : @socket})"
  puts "Starting #{desc}"
  # Keep the original command line visible after the descriptive title.
  $0 = "#{desc} - #{@procline}"
end

#start_relay ⇒ Object



270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/test_queue/runner.rb', line 270

# Register this remote master with the central master.
#
# Does nothing outside relay mode. Otherwise connects to the relay and
# sends the run token followed by a "REMOTE MASTER" line carrying the
# worker count, this hostname and the optional remote-master message
# (line breaks removed). Any reply other than "OK", or a refused
# connection, prints an error and exits the process with status 1.
def start_relay
  return unless relay?

  relay_sock = connect_to_relay
  # Our "protocol" is newline-separated, so the note must stay on one line.
  note = (@remote_master_message ? " #{@remote_master_message}" : "").gsub(/(\r|\n)/, "")
  relay_sock.puts("TOKEN=#{@run_token}")
  relay_sock.puts("REMOTE MASTER #{@concurrency} #{Socket.gethostname} #{note}")

  reply = relay_sock.gets.strip
  if reply == "OK"
    relay_sock.close
  else
    STDERR.puts "*** Got non-OK response from master: #{reply}"
    relay_sock.close
    exit! 1
  end
rescue Errno::ECONNREFUSED
  STDERR.puts "*** Unable to connect to relay #{@relay}. Aborting.."
  exit! 1
end

#stats_file ⇒ Object



233
234
235
236
# File 'lib/test_queue/runner.rb', line 233

# Path of the file used to persist suite statistics.
#
# Returns the value of TEST_QUEUE_STATS when that variable is set,
# otherwise '.test_queue_stats'.
def stats_file
  ENV.fetch('TEST_QUEUE_STATS', '.test_queue_stats')
end

#stop_master ⇒ Object



290
291
292
293
294
295
296
# File 'lib/test_queue/runner.rb', line 290

# Shut down the master's listening socket.
#
# Does nothing in relay mode. Otherwise removes the socket file when the
# server is a UNIX socket server, closes the server (errors raised while
# closing are ignored) and clears @socket and @server. Calling it again
# afterwards is harmless.
def stop_master
  return if relay?

  FileUtils.rm_f(@socket) if @socket && @server.is_a?(UNIXServer)

  if @server
    begin
      @server.close
    rescue StandardError
      nil # best effort: the server may already be closed
    end
  end

  @socket = @server = nil
end

#summarize ⇒ Object



230
231
# File 'lib/test_queue/runner.rb', line 230

# Hook for subclasses: called at the end of summarize_internal, after the
# per-worker summary has been printed and the stats saved.
# The default implementation does nothing.
def summarize
end

#summarize_internal ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/test_queue/runner.rb', line 145

# Print the end-of-run report and compute the process exit status.
#
# For every completed worker: add its exit status (1 when it has none),
# record its suite stats, check each suite it ran against the assignment
# the master handed out, and print a one-line summary. Collected failure
# output is printed afterwards.
#
# On the central master the status is raised by one for each of these
# bookkeeping problems: suites discovered but never run, suites run without
# having been assigned, and suites run by a worker other than the one they
# were assigned to. These penalties used to be discarded because the status
# was recomputed from the workers alone just before returning.
#
# Saves the stats and calls the summarize hook before returning.
#
# Returns the Integer exit status, capped at 255.
def summarize_internal
  puts
  puts "==> Summary (#{@completed.size} workers in %.4fs)" % (Time.now-@start_time)
  puts

  estatus = 0
  misrun_suites = []
  unassigned_suites = []
  @failures = ''
  @completed.each do |worker|
    # A worker with no exit status counts as one failure.
    estatus += (worker.status.exitstatus || 1)
    @stats.record_suites(worker.suites)
    worker.suites.each do |suite|
      assignment = @assignments.delete([suite.name, suite.path])
      host = worker.host || Socket.gethostname
      if assignment.nil?
        unassigned_suites << [suite.name, suite.path]
      elsif assignment != [host, worker.pid]
        misrun_suites << [suite.name, suite.path] + assignment + [host, worker.pid]
      end
      # Whatever is left in @discovered_suites afterwards was never run.
      @discovered_suites.delete([suite.name, suite.path])
    end

    summarize_worker(worker)

    @failures << worker.failure_output if worker.failure_output

    puts "    [%2d] %60s      %4d suites in %.4fs      (%s %s)" % [
      worker.num,
      worker.summary,
      worker.suites.size,
      worker.end_time - worker.start_time,
      worker.status.to_s,
      worker.host && " on #{worker.host.split('.').first}"
    ]
  end

  unless @failures.empty?
    puts
    puts "==> Failures"
    puts
    puts @failures
  end

  if !relay?
    unless @discovered_suites.empty?
      estatus += 1
      puts
      puts "The following suites were discovered but were not run:"
      puts

      @discovered_suites.sort.each do |suite_name, path|
        puts "#{suite_name} - #{path}"
      end
    end
    unless unassigned_suites.empty?
      estatus += 1
      puts
      puts "The following suites were not discovered but were run anyway:"
      puts
      unassigned_suites.sort.each do |suite_name, path|
        puts "#{suite_name} - #{path}"
      end
    end
    unless misrun_suites.empty?
      estatus += 1
      puts
      puts "The following suites were run on the wrong workers:"
      puts
      misrun_suites.each do |suite_name, path, target_host, target_pid, actual_host, actual_pid|
        puts "#{suite_name} - #{path}: #{actual_host} (#{actual_pid}) - assigned to #{target_host} (#{target_pid})"
      end
    end
  end

  puts

  @stats.save

  summarize

  # Exit statuses are a single byte; clamp instead of wrapping around.
  [estatus, 255].min
end

#summarize_worker(worker) ⇒ Object



434
435
436
437
# File 'lib/test_queue/runner.rb', line 434

# Hook for subclasses: called once per completed worker by
# summarize_internal, just before the worker's summary and failure output
# are read for the report.
#
# worker - worker record to fill in.
#
# The default sets both fields to empty strings.
def summarize_worker(worker)
  worker.summary = ''
  worker.failure_output = ''
end

#worker_completed(worker) ⇒ Object



468
469
470
471
472
# File 'lib/test_queue/runner.rb', line 468

# Record a finished worker.
#
# worker - worker record exposing `status` and `output`.
#
# Ignored entirely while the run is aborting. Otherwise the worker is added
# to @completed, and its captured output is printed when
# TEST_QUEUE_VERBOSE is set or the worker exited with a non-zero status.
def worker_completed(worker)
  return if @aborting

  @completed << worker
  return unless ENV['TEST_QUEUE_VERBOSE'] || worker.status.exitstatus != 0

  puts worker.output
end