Class: MyWorkerManagerFln

Inherits:
ScbiMapreduce::WorkManager show all
Defined in:
lib/full_lengther_next/my_worker_manager_fln.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ScbiMapreduce::WorkManager

#send_next_work

Instance Attribute Details

#seqs_annotation_protObject

MANAGER INITIALIZATION



89
90
91
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89

# Reader for the @seqs_annotation_prot instance variable.
def seqs_annotation_prot
  @seqs_annotation_prot
end

#seqs_some_codingObject

MANAGER INITIALIZATION



89
90
91
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89

# Reader for the @seqs_some_coding instance variable.
def seqs_some_coding
  @seqs_some_coding
end

#seqs_unknownObject

MANAGER INITIALIZATION



89
90
91
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89

# Reader for the @seqs_unknown instance variable.
def seqs_unknown
  @seqs_unknown
end

Class Method Details

.asign_coding_attributes(seq, coding) ⇒ Object



437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 437

# Re-labels +seq+ as CODING and moves it from the "unknown" counters to the
# "coding" counters of @@stats_hash, then stores the ORF chosen by select_orf
# on the sequence (status, t_code and hit).
#
# seq    - sequence object; seq_fasta is read and type, status, t_code and
#          hit are written.
# coding - collection of ORF records for this sequence, handed to select_orf.
#          The chosen record is indexed as [_, type, start, stop, strand, ...].
#
# Returns the array assigned to seq.hit: [start, stop, frame].
def self.asign_coding_attributes(seq, coding)
	seq.type = CODING
	fasta_length = seq.seq_fasta.length
	over_200 = fasta_length > 200
	over_500 = fasta_length > 500

	@@stats_hash['unknown'] -= 1
	@@stats_hash['unknown_>200'] -= 1 if over_200
	@@stats_hash['unknown_>500'] -= 1 if over_500
	@@stats_hash['coding_>200'] += 1 if over_200
	@@stats_hash['coding_>500'] += 1 if over_500
	@@stats_hash['coding'] += 1

	best_orf = select_orf(coding)
	if best_orf[1] == 'complete'
		seq.status = true
		@@stats_hash['coding_sure'] += 1
	else
		@@stats_hash['coding_putative'] += 1
	end

	seq.t_code = best_orf.last
	# On the minus strand the frame is derived from the stop coordinate and negated.
	minus_strand = best_orf[4] == '-'
	anchor = minus_strand ? best_orf[3] : best_orf[2]
	frame = (anchor % 3) + 1
	frame = -frame if minus_strand
	seq.hit = [best_orf[2], best_orf[3], frame]
end

.correct_by_selected(selected, coding_info) ⇒ Object



422
423
424
425
426
427
428
429
430
431
432
433
434
435
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 422

# Overwrites the type field (index 1) with '-' for every ORF in +coding_info+
# whose id is not listed in the +selected+ file.
#
# selected    - path to a text file with one "<seq_name>|<orf_id>" per line.
# coding_info - Hash of seq_name => { orf_id => info Array }; modified in place.
#
# Returns the same +coding_info+ object.
def self.correct_by_selected(selected, coding_info)
	# Hash used as a set: constant-time lookups instead of scanning an Array.
	selected_orfs = {}
	# File.foreach closes the file when done; File.open(...).each left it open.
	File.foreach(selected) do |line|
		_seq_name, orf_id = line.chomp.split('|', 2)
		selected_orfs[orf_id] = true
	end
	coding_info.each_value do |orfs|
		orfs.each do |orf, info|
			info[1] = '-' unless selected_orfs.key?(orf)
		end
	end
	coding_info
end

.end_work_managerObject

close files



190
191
192
193
194
195
196
197
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 190

# Final step of the run: optionally runs the TransDecoder ORF prediction,
# writes the summary statistics and the mapping report, and closes every
# output handle.
def self.end_work_manager
	# TransDecoder is only run when the 'p' database flag is set and there are
	# both training sequences and sequences to analyse.
	if @@options[:acess_db].include?('p') && !@@complete_sure.empty? && !@@seqs_to_analyze.empty?
		orf_prediction_with_transdecoder
	end
	write_summary_stats(@@stats_hash, @@stats_taxonomy, @@stats_functional_annotation_by_seqs, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@pre_fln_seq_lengths, @@seq_lengths, @@output_files['stats_txt'], @@output_files['stats_html'])
	write_mapping_report(@@fpkm, @@coverage_analysis, @@stats_functional_annotation_by_seqs)
	# Some entries of @@output_files are plain path Strings rather than open handles.
	@@output_files.each_value do |handler|
		handler.close unless handler.class == String
	end
end

.get_annotationsObject



550
551
552
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 550

# Returns the three class-level sequence lists as an Array:
# [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown].
def self.get_annotations
	[@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end

.get_coding_info(file_name) ⇒ Object



499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 499

# Reads ORF records from +file_name+ through FastaQualFile and groups them by
# sequence name.
#
# Each record name is split on '|' into sequence name and ORF id. The record
# comments are scanned for "type:<word>" and for ":<start>-<stop>(<strand>)".
#
# Returns a Hash of seq_name => { orf_id => [x_ratio, type, start, stop, strand] },
# where x_ratio is the fraction of 'X' characters in the record sequence.
# Any StandardError raised while reading is reported as a warning on stdout
# and whatever was collected so far is returned.
def self.get_coding_info(file_name)
	coding_info = {}
	begin
		FastaQualFile.new(file_name, '').each do |name, seq, comments, _qual|
			x_ratio = seq.count('X') / seq.length.to_f
			seq_name, orf_id = name.split('|')

			comments =~ /type:(\S+)/
			orf_type = Regexp.last_match(1)

			# When the coordinates are absent the captures are nil, giving 0, 0 and nil.
			comments =~ /:(\d+)-(\d+)\(([+-])\)/
			orf_start = Regexp.last_match(1).to_i
			orf_stop = Regexp.last_match(2).to_i
			orf_strand = Regexp.last_match(3)

			(coding_info[seq_name] ||= {})[orf_id] = [x_ratio, orf_type, orf_start, orf_stop, orf_strand]
		end
	rescue
		puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
	end
	coding_info
end

.get_max_score(orfs_hash) ⇒ Object



473
474
475
476
477
478
479
480
481
482
483
484
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 473

# Returns the largest value found in the last position of the info arrays of
# +orfs_hash+ (orf_id => info Array), or nil when the hash is empty.
def self.get_max_score(orfs_hash)
	orfs_hash.each_value.inject(nil) do |best, info|
		candidate = info.last
		(best.nil? || candidate > best) ? candidate : best
	end
end

.get_min_Xratio(orfs_hash) ⇒ Object



486
487
488
489
490
491
492
493
494
495
496
497
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 486

# Returns the smallest value found in the first position of the info arrays of
# +orfs_hash+ (orf_id => info Array), or nil when the hash is empty.
def self.get_min_Xratio(orfs_hash)
	orfs_hash.each_value.inject(nil) do |lowest, info|
		candidate = info.first
		(lowest.nil? || candidate < lowest) ? candidate : lowest
	end
end

.get_scores(file_name, coding_info) ⇒ Object



527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 527

# Adds the score read from +file_name+ to every ORF already present in
# +coding_info+, and drops the ORFs whose score is not positive.
#
# file_name   - path to a tab-separated file; the first column is
#               "<seq_name>|<orf_id>" and the second column is the score.
# coding_info - Hash of seq_name => { orf_id => info Array }; modified in place.
#               A positive score is appended to the info Array. Otherwise the
#               ORF is removed, and the sequence entry too once it has no ORFs.
#
# Returns the same +coding_info+ object.
def self.get_scores(file_name, coding_info)
	# File.foreach closes the file when done; File.open(...).each left it open.
	File.foreach(file_name) do |line|
		fields = line.chomp.split("\t")
		name = fields.shift
		next if name.nil? # blank line: nothing to parse

		seq, orf_id = name.split('|')
		orfs = coding_info[seq]
		next if orfs.nil?

		orf = orfs[orf_id]
		next if orf.nil?

		score = fields.first.to_f
		if score > 0
			orf << score
		else
			orfs.delete(orf_id)
			coding_info.delete(seq) if orfs.empty?
		end
	end
	coding_info
end

.get_seqs(seqs) ⇒ Object



414
415
416
417
418
419
420
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 414

# Builds a FASTA-formatted String from +seqs+: for each element a
# ">seq_name" header line followed by its seq_fasta line.
#
# Returns the String (empty when +seqs+ is empty).
def self.get_seqs(seqs)
	seqs.each_with_object('') do |record, fasta|
		fasta << ">#{record.seq_name}\n#{record.seq_fasta}\n"
	end
end

.init_work_manager(options) ⇒ Object

open files and prepare global data



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 91

# Prepares the class-level state used by the manager: statistics containers,
# functional annotation tables, the input FASTA reader and the output files
# created under fln_results/.
#
# options - Hash of run options. Keys read here: :verbose, :fasta, :tax_group,
#           :acess_db, :chimera, :files2map and :remove_unmapped. The Hash is
#           also stored in @@options.
#
# Side effects: sets $verbose, creates the fln_results directory when absent,
# opens (truncating) the output files and writes their header lines.
def self.init_work_manager(options)
	@@stats_hash = initialize_stats_hash
	@@stats_taxonomy = {}
	@@stats_functional_annotation_by_seqs = {}
	@@stats_different_prot_id = []
	@@stats_different_prot_id_complete_seqs = []
	@@pre_fln_seq_lengths = []
	@@seq_lengths = []

	@@map_object = {}

	@@options = options

	$verbose = options[:verbose]

	input_file = options[:fasta]

	# File.exists? was removed in Ruby 3.2; File.exist? works on every version.
	Dir.mkdir('fln_results') unless File.exist?('fln_results')

	# Column position of each annotation type inside a functional annotation record.
	@@func_annot_type = {
		:go_id => 5,
		:go_description => 6,
		:kegg_id => 7,
		:interpro_id => 8,
		:interpro_description => 9,
		:ec_id => 10,
		:pfam_id => 11,
		:pfam_desc => 12,
		:unipathway_id => 13
	}

	@@functional_annotations = {}
	@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'sp_'+options[:tax_group],'sp_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('s')
	@@functional_annotations.merge!(load_functional_annotations(File.join(ENV['BLASTDB'], 'tr_'+options[:tax_group],'tr_'+options[:tax_group]+'.index'))) if options[:acess_db].include?('t')

	@@fasta_file = FastaQualFile.new(input_file,'')
	file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

	@@output_files = {}
	# Seq annotation files
	if !options[:chimera].nil?
		@@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
		@@output_files[CHIMERA].puts file_head
	elsif File.exist?("fln_results/chimeric_sequences.txt")
		# Remove a chimera report left by a previous run.
		File.delete("fln_results/chimeric_sequences.txt")
	end
	@@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
	@@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
	@@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
	@@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
	@@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
	@@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

	# Complementary files
	@@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
	@@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
	@@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
	@@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
	# Stored as a path String, not as an open handle.
	@@output_files['stats_html'] = 'fln_results/summary_stats.html'
	@@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

	@@output_files[CODING].puts file_head
	@@output_files['db'].puts file_head
	@@output_files[NCRNA].puts file_head

	if !options[:files2map].empty?
		@@output_files['fpkm'] = File.open('fln_results/fpkm_per_transcript.txt', 'w')
		@@output_files['coverage'] = File.open('fln_results/coverage_per_transcript.txt', 'w')
		@@output_files['fpkm'].puts %w[Transcript_id fpkm Read_counts].join("\t")
		@@output_files['coverage'].puts %w[seq_name mean_normalized_differences mean_max mean_coverage proportion_sequence_mapped].join("\t")
		if options[:remove_unmapped]
			@@output_files[UNMAPPED] = File.open('fln_results/unmapped.txt', 'w')
			@@output_files[UNMAPPED].puts "Query_id\tfasta_length"
		end
	end

	#RepTrans module
	@@seqs_annotation_prot = []
	@@seqs_some_coding = []
	@@seqs_unknown = []

	#Transdecoder module
	@@complete_sure = []
	@@seqs_to_analyze = []

	#Mapping_info
	@@fpkm = {}
	@@coverage_analysis = {}

end

.load_functional_annotations(annotation_file) ⇒ Object

CUSTOM FUNCTIONS



309
310
311
312
313
314
315
316
317
318
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 309

# Loads a tab-separated annotation index into a Hash.
#
# annotation_file - path to the index; the first column of each line is the
#                   accession and the remaining columns are its annotations.
#
# Returns a Hash of accession => Array of the remaining columns. Blank lines
# are skipped. Note that String#split drops trailing empty columns, so the
# Arrays may be shorter than the number of columns in the file.
def self.load_functional_annotations(annotation_file)
	functional_annotations = {}
	# File.foreach closes the file when done; File.open(...).each left it open.
	File.foreach(annotation_file) do |line|
		fields = line.chomp.split("\t")
		acc = fields.shift
		next if acc.nil? # blank line: avoid storing a nil key

		functional_annotations[acc] = fields
	end
	functional_annotations
end

.orf_prediction_with_transdecoderObject



390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 390

# Runs TransDecoder on the sequences collected in @@seqs_to_analyze, using a
# representative subset of @@complete_sure as the training set, and applies the
# predicted ORFs to those sequences.
#
# Works inside the @@options[:temp] directory: writes training_set.fasta and
# analyse_set.fasta, runs the TransDecoder command and parses its output files
# from the transdecoder/ work directory. Every analysed sequence is then
# offered to repTrans_keep_seq and written to the output files.
def self.orf_prediction_with_transdecoder
	clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
	final_seqs = select_representative(clusters_seqs_annot_prot)
	coding_info = nil
	Dir.chdir(@@options[:temp]) do
		File.open('training_set.fasta', 'w') {|f| f.write(get_seqs(final_seqs))}
		File.open('analyse_set.fasta', 'w') {|f| f.write(get_seqs(@@seqs_to_analyze))}
		cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
		# Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
		cmd << ' --reuse' if Dir.exist?('transdecoder')
		system(cmd)
		coding_info = get_coding_info('transdecoder/longest_orfs.pep')
		coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
		coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
	end
	@@seqs_to_analyze.each do |seq|
		coding = coding_info[seq.seq_name]
		asign_coding_attributes(seq, coding) if !coding.nil?
		repTrans_keep_seq(seq)
		seq.write_info(@@output_files)
	end
end

.repTrans_keep_seq(seq) ⇒ Object



364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 364

# When the :reptrans option is set, stores +seq+ in the class-level list that
# matches its type: COMPLETE..INTERNAL, CODING or UNKNOWN. Sequences of any
# other type are not stored.
def self.repTrans_keep_seq(seq)
	return if @@options[:reptrans].nil?

	bucket =
		case seq.type
		when COMPLETE .. INTERNAL then @@seqs_annotation_prot
		when CODING then @@seqs_some_coding
		when UNKNOWN then @@seqs_unknown
		end
	bucket << seq if bucket
end

.select_orf(orfs_hash) ⇒ Object



461
462
463
464
465
466
467
468
469
470
471
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 461

# Picks one ORF record from +orfs_hash+ (orf_id => info Array).
#
# Selection order: keep only the ORFs with the minimum X ratio (info.first),
# prefer those whose type (info[1]) is 'complete' when there are any, and
# among the remaining ones take the first with the maximum score (info.last).
#
# NOTE: +orfs_hash+ is filtered in place.
#
# Returns the info Array of the chosen ORF.
def self.select_orf(orfs_hash)
	lowest_x = get_min_Xratio(orfs_hash)
	orfs_hash.keep_if { |_id, info| info.first == lowest_x }

	candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
	# No complete ORF: fall back to the (already filtered) input hash itself.
	candidates = orfs_hash if candidates.empty?

	top_score = get_max_score(candidates)
	candidates.keep_if { |_id, info| info.last == top_score }
	candidates.values.first
end

Instance Method Details

#error_received(worker_error, obj) ⇒ Object



285
286
287
288
289
290
291
292
293
294
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 285

# Reports on stdout that a chunk failed, including the original exception
# message and backtrace.
#
# worker_error - error object exposing original_exception.
# obj          - the data that was being processed. When its first element is
#                a Sequence, that sequence's name is reported; otherwise the
#                inspected object is reported.
def error_received(worker_error, obj)
	sample = obj.respond_to?(:first) ? obj.first : obj
	# The name has to be read from the sampled element: the previous code called
	# seq_name on the collection itself, which raised NoMethodError.
	data = sample.is_a?(Sequence) ? sample.seq_name : obj.inspect
	puts "WARNING!!!!!. CHUNK FAILED:Error while processing object #{data}\n" + worker_error.original_exception.message + ":\n" +worker_error.original_exception.backtrace.join("\n")
end

#get_functional_annotations(seq) ⇒ Object



320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 320

# Looks up the functional annotations of the protein hit of +seq+ and stores
# them on the sequence.
#
# The hit accession (with any "-<digits>" suffix removed) is looked up in
# @@functional_annotations. When found, one value per entry of
# @@func_annot_type is extracted, every ';'-separated GO description is
# registered in @@stats_functional_annotation_by_seqs under the sequence name,
# and the resulting Hash is assigned to seq.functional_annotations.
#
# Returns the annotations Hash, or nil when the accession is unknown.
def get_functional_annotations(seq)
	all_info = @@functional_annotations[seq.hit.acc.gsub(/-\d+/,'')]  #gsub removes splicing code of uniprot accesion
	return if all_info.nil?

	annotations = {}
	@@func_annot_type.each do |type, position|
		annotations[type] = all_info[position]
	end
	# A record can be shorter than expected (String#split drops trailing empty
	# columns when the index is loaded), leaving go_description nil; to_s keeps
	# that case from raising NoMethodError.
	annotations[:go_description].to_s.split(";").each do |annot|
		(@@stats_functional_annotation_by_seqs[annot] ||= []) << seq.seq_name
	end
	seq.functional_annotations = annotations
end

#next_workObject

This method is called every time a worker needs new data to work on. It is called once per object until a chunk of the configured chunk size is filled. It returns the work data, or nil if no more data is available.



205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 205

# Decides what to hand to a worker asking for work (manages the workers' input).
#
# Returns:
# * the result of send_fln_data when no files have to be mapped, or once
#   mapping results for @@options[:n_refs] references have been collected;
# * the result of send_mapping_data while reference files remain to be sent;
# * :sleep when every reference was sent but not all results have arrived.
def next_work #Manage INput's worker
	if @@options[:files2map].empty?
		send_fln_data
	elsif !@@options[:ref_files].empty? #Mapping task
		send_mapping_data
	elsif @@map_object.length < @@options[:n_refs]
		:sleep
	else
		send_fln_data
	end
end

#receive_fln_data(objs) ⇒ Object



265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 265

# Handles a batch of analysed sequences returned by a worker: classifies each
# one for the TransDecoder and RepTrans steps, collects taxonomy, functional
# annotation and mapping data, writes the sequence when appropriate and
# finally updates the summary statistics with the whole batch.
def receive_fln_data(objs)
	objs.each do |seq|
		transdecoder_keep_seq(seq)
		repTrans_keep_seq(seq)

		if seq.type > UNKNOWN && seq.type < NCRNA
			get_taxonomy(seq.hit.definition, @@stats_taxonomy)
			get_functional_annotations(seq)
		end

		@@fpkm[seq.seq_name] = seq.fpkm unless seq.fpkm.empty?
		@@coverage_analysis[seq.seq_name] = seq.coverage_analysis unless seq.coverage_analysis.empty?

		#Don't write Unknown or coding sequences when use transdecoder
		writable = @@options[:acess_db].include?('c') ||
			!@@options[:acess_db].include?('p') ||
			(seq.type != UNKNOWN && seq.type != CODING)
		write_seq(seq) if writable
	end
	@@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths = summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths)
end

#receive_mapping_data(objs) ⇒ Object



280
281
282
283
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 280

# Merges the mapping results in +objs+ into @@map_object and returns
# @@map_object.
def receive_mapping_data(objs)
	@@map_object.merge!(objs)
end

#repTrans_keep_seq(seq) ⇒ Object



351
352
353
354
355
356
357
358
359
360
361
362
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 351

# Instance-level entry point for the RepTrans classification of +seq+.
#
# Delegates to the class method of the same name so the classification rules
# live in one place instead of being duplicated line for line.
def repTrans_keep_seq(seq)
	self.class.repTrans_keep_seq(seq)
end

#send_fln_dataObject



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 226

# Reads the next record from the input FASTA and wraps it for a worker.
#
# When mapping results are available in @@map_object, the sequence receives
# its fpkm (the last two values of its mapping record) and coverage_analysis
# (the remaining values), or empty Arrays when it has no record. The sequence
# length is recorded and sequence_stats is updated.
#
# Returns { fln: Sequence }, or nil when the input has no more records.
def send_fln_data
	n, f, q = @@fasta_file.next_seq
	return nil if n.nil?

	seq = Sequence.new(n, f, q)
	begin
		unless @@map_object.empty?
			cov_analysis = @@map_object[n]
			if cov_analysis.nil?
				seq.fpkm = []
				seq.coverage_analysis = []
			else
				# pop(2) removes the last two values from the stored record.
				seq.fpkm = cov_analysis.pop(2)
				seq.coverage_analysis = cov_analysis
			end
		end
	# StandardError instead of Exception: interrupts and exit requests must not
	# be swallowed here.
	rescue StandardError => e
		puts e.message, e.backtrace.join("\n")
	end

	@@pre_fln_seq_lengths << f.length
	sequence_stats(seq, @@stats_hash)
	{fln: seq}
end

#send_mapping_dataObject



222
223
224
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 222

# Takes the next reference file off @@options[:ref_files] and wraps it as a
# mapping task: { mapping: ref_file }.
def send_mapping_data
	next_reference = @@options[:ref_files].shift
	{mapping: next_reference}
end

#too_many_errors_receivedObject



296
297
298
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 296

# Logs, through $LOG.error, that processing stops early because too many
# errors occurred, reporting @@error_count and @@count.
# NOTE(review): neither class variable is assigned in the visible part of this
# class; presumably they come from the parent WorkManager — confirm.
def too_many_errors_received
	$LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
end

#transdecoder_keep_seq(seq) ⇒ Object



377
378
379
380
381
382
383
384
385
386
387
388
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 377

# When the 'p' database flag is set, collects +seq+ for the TransDecoder step:
# COMPLETE sequences with a truthy status and a hit identity of at least
# @@options[:training_ident] go to @@complete_sure; CODING and UNKNOWN
# sequences go to @@seqs_to_analyze.
def transdecoder_keep_seq(seq)
	return unless @@options[:acess_db].include?('p')

	case seq.type
	when COMPLETE
		@@complete_sure << seq if seq.status && seq.hit.ident >= @@options[:training_ident]
	when CODING, UNKNOWN
		@@seqs_to_analyze << seq
	end
end

#work_received(objs) ⇒ Object

This method is executed each time an obj is finished.



255
256
257
258
259
260
261
262
263
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 255

# Dispatches a finished work unit to its handler (manages the workers' output).
#
# objs - Hash whose first key names the task (:fln or :mapping) and whose
#        first value carries the task data. Other tasks are ignored.
def work_received(objs) #Manage OUTput's worker
	task = objs.keys.first
	payload = objs.values.first
	case task
	when :fln then receive_fln_data(payload)
	when :mapping then receive_mapping_data(payload)
	end
end

#worker_initial_configObject

send initial config



301
302
303
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 301

# Returns the options Hash stored in @@options as the workers' initial
# configuration.
def worker_initial_config
	@@options
end

#write_seq(seq) ⇒ Object

write results to files



341
342
343
344
345
346
347
348
349
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 341

# Writes +seq+ to the output files through seq.write_info. A failure is
# reported on stdout (sequence name, message and backtrace) instead of
# aborting the run.
def write_seq(seq)
	begin
		seq.write_info(@@output_files)
	# StandardError instead of Exception: interrupts and exit requests must not
	# be swallowed here.
	rescue StandardError => e
		puts "Error printing #{seq.seq_name}"
		puts e.message, e.backtrace.join("\n")
	end
end