Class: MyWorkerManagerFln
- Inherits:
-
ScbiMapreduce::WorkManager
- Object
- ScbiMapreduce::WorkManager
- MyWorkerManagerFln
- Defined in:
- lib/full_lengther_next/my_worker_manager_fln.rb
Instance Attribute Summary collapse
-
#seqs_annotation_prot ⇒ Object
MANAGER INITIALIZATION.
-
#seqs_some_coding ⇒ Object
MANAGER INITIALIZATION.
-
#seqs_unknown ⇒ Object
MANAGER INITIALIZATION.
Class Method Summary collapse
- .asign_coding_attributes(seq, coding) ⇒ Object
- .correct_by_selected(selected, coding_info) ⇒ Object
-
.end_work_manager ⇒ Object
close files.
- .get_annotations ⇒ Object
- .get_coding_info(file_name) ⇒ Object
- .get_max_score(orfs_hash) ⇒ Object
- .get_min_Xratio(orfs_hash) ⇒ Object
- .get_scores(file_name, coding_info) ⇒ Object
- .get_seqs(seqs) ⇒ Object
-
.init_work_manager(options) ⇒ Object
open files and prepare global data.
-
.load_functional_annotations(annotation_file) ⇒ Object
CUSTOM FUNCTIONS.
- .orf_prediction_with_transdecoder ⇒ Object
- .repTrans_keep_seq(seq) ⇒ Object
- .select_orf(orfs_hash) ⇒ Object
Instance Method Summary collapse
- #error_received(worker_error, obj) ⇒ Object
- #get_functional_annotations(seq) ⇒ Object
-
#next_work ⇒ Object
this method is called every time a worker needs new data to work.
- #receive_fln_data(objs) ⇒ Object
- #receive_mapping_data(objs) ⇒ Object
- #repTrans_keep_seq(seq) ⇒ Object
- #send_fln_data ⇒ Object
- #send_mapping_data ⇒ Object
- #too_many_errors_received ⇒ Object
- #transdecoder_keep_seq(seq) ⇒ Object
-
#work_received(objs) ⇒ Object
this method is executed each time an obj is finished.
-
#worker_initial_config ⇒ Object
send initial config.
-
#write_seq(seq) ⇒ Object
write results to files.
Methods inherited from ScbiMapreduce::WorkManager
Instance Attribute Details
#seqs_annotation_prot ⇒ Object
MANAGER INITIALIZATION
89 90 91 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89 def seqs_annotation_prot @seqs_annotation_prot end |
#seqs_some_coding ⇒ Object
MANAGER INITIALIZATION
89 90 91 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89 def seqs_some_coding @seqs_some_coding end |
#seqs_unknown ⇒ Object
MANAGER INITIALIZATION
89 90 91 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 89 def seqs_unknown @seqs_unknown end |
Class Method Details
.asign_coding_attributes(seq, coding) ⇒ Object
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 437
# Re-labels +seq+ as CODING, moves its counts from the "unknown" statistics
# to the "coding" ones, and stores the ORF chosen by select_orf on it.
#
# seq    - sequence object; its type, status, t_code and hit are written.
# coding - Hash of orf_id => info array for this sequence. Each info array is
#          read as [x_ratio, type, start, stop, strand, ..., score].
def self.asign_coding_attributes(seq, coding)
  seq.type = CODING

  fasta_length = seq.seq_fasta.length
  @@stats_hash['unknown'] -= 1
  @@stats_hash['unknown_>200'] -= 1 if fasta_length > 200
  @@stats_hash['unknown_>500'] -= 1 if fasta_length > 500
  @@stats_hash['coding_>200'] += 1 if fasta_length > 200
  @@stats_hash['coding_>500'] += 1 if fasta_length > 500
  @@stats_hash['coding'] += 1

  best_orf = select_orf(coding)
  if best_orf[1] == 'complete'
    seq.status = true
    @@stats_hash['coding_sure'] += 1
  else
    @@stats_hash['coding_putative'] += 1
  end
  seq.t_code = best_orf.last

  # The frame is derived from the start coordinate on the plus strand and
  # from the stop coordinate (with a negative sign) on the minus strand.
  on_minus_strand = best_orf[4] == '-'
  anchor = on_minus_strand ? best_orf[3] : best_orf[2]
  frame = (anchor % 3) + 1
  frame = -frame if on_minus_strand
  seq.hit = [best_orf[2], best_orf[3], frame]
end
.correct_by_selected(selected, coding_info) ⇒ Object
422 423 424 425 426 427 428 429 430 431 432 433 434 435 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 422
# Marks every ORF that is not listed in the +selected+ file by overwriting
# its type field (info[1]) with '-'.
#
# selected    - path to a text file with one "seq_name|orf_id" entry per line.
# coding_info - Hash of seq_name => { orf_id => info array }; modified in place.
#
# Returns the same +coding_info+ object.
def self.correct_by_selected(selected, coding_info)
  # A Hash gives constant-time membership tests, and File.foreach closes the
  # file handle once reading is done.
  selected_ids = {}
  File.foreach(selected) do |line|
    _seq_name, orf_id = line.chomp.split('|', 2)
    selected_ids[orf_id] = true
  end

  coding_info.each_value do |orfs|
    orfs.each do |orf_id, info|
      info[1] = '-' unless selected_ids.key?(orf_id)
    end
  end
  coding_info
end
.end_work_manager ⇒ Object
close files
190 191 192 193 194 195 196 197 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 190
# Final step of the manager: optionally runs the TransDecoder ORF prediction,
# writes the summary and mapping reports, and closes the output files.
def self.end_work_manager
  # TransDecoder runs only when the 'p' flag is set and both a training set
  # and a set of sequences to analyse were collected.
  run_transdecoder = @@options[:acess_db].include?('p') &&
                     !@@complete_sure.empty? &&
                     !@@seqs_to_analyze.empty?
  orf_prediction_with_transdecoder if run_transdecoder

  write_summary_stats(
    @@stats_hash,
    @@stats_taxonomy,
    @@stats_functional_annotation_by_seqs,
    @@stats_different_prot_id,
    @@stats_different_prot_id_complete_seqs,
    @@pre_fln_seq_lengths,
    @@seq_lengths,
    @@output_files['stats_txt'],
    @@output_files['stats_html']
  )
  write_mapping_report(@@fpkm, @@coverage_analysis, @@stats_functional_annotation_by_seqs)

  # Some entries hold a path String instead of an open handle; skip those.
  @@output_files.each do |_key, handler|
    handler.close unless handler.class == String
  end
end
.get_annotations ⇒ Object
550 551 552 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 550
# Returns the three RepTrans sequence lists as an Array:
# [seqs_annotation_prot, seqs_some_coding, seqs_unknown].
def self.get_annotations
  [@@seqs_annotation_prot, @@seqs_some_coding, @@seqs_unknown]
end
.get_coding_info(file_name) ⇒ Object
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 499
# Reads the ORF records in +file_name+ and indexes them by sequence.
#
# Each record name is split on '|' into seq_name and orf_id. The ORF type and
# the "start-stop(strand)" coordinates are parsed from the record comments.
#
# Returns a Hash of seq_name => { orf_id => [x_ratio, type, start, stop, strand] },
# where x_ratio is the fraction of 'X' characters in the sequence. If reading
# fails, a warning is printed and whatever was collected so far is returned.
def self.get_coding_info(file_name)
  coding_info = {}
  begin
    FastaQualFile.new(file_name, '').each do |name, seq, comments, _qual|
      x_ratio = seq.count('X') / seq.length.to_f
      seq_name, orf_id = name.split('|')

      comments =~ /type:(\S+)/
      type = $1
      comments =~ /:(\d+)-(\d+)\(([+-])\)/
      info = [x_ratio, type, $1.to_i, $2.to_i, $3]

      (coding_info[seq_name] ||= {})[orf_id] = info
    end
  rescue
    puts "Warning!!!!!!!!!!: Transdecoder file is missing. Check if Transdecoder is installed"
  end
  coding_info
end
.get_max_score(orfs_hash) ⇒ Object
473 474 475 476 477 478 479 480 481 482 483 484 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 473
# Returns the largest last element among the info arrays in +orfs_hash+,
# or nil when the hash is empty.
def self.get_max_score(orfs_hash)
  orfs_hash.each_value.reduce(nil) do |best, info|
    candidate = info.last
    (best.nil? || candidate > best) ? candidate : best
  end
end
.get_min_Xratio(orfs_hash) ⇒ Object
486 487 488 489 490 491 492 493 494 495 496 497 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 486
# Returns the smallest first element among the info arrays in +orfs_hash+,
# or nil when the hash is empty.
def self.get_min_Xratio(orfs_hash)
  orfs_hash.each_value.reduce(nil) do |lowest, info|
    candidate = info.first
    (lowest.nil? || candidate < lowest) ? candidate : lowest
  end
end
.get_scores(file_name, coding_info) ⇒ Object
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 527
# Attaches the score read from +file_name+ to each known ORF.
#
# file_name   - tab-separated file; the first column is "seq_name|orf_id" and
#               the second column is read as the ORF score.
# coding_info - Hash of seq_name => { orf_id => info array }; modified in place.
#
# A positive score is appended to the ORF's info array. An ORF whose score is
# zero or negative is removed, and a sequence left without ORFs is removed too.
# Lines that name an unknown sequence or ORF, and blank lines, are ignored.
#
# Returns the same +coding_info+ object.
def self.get_scores(file_name, coding_info)
  # File.foreach closes the file handle once reading is done.
  File.foreach(file_name) do |line|
    fields = line.chomp.split("\t")
    name = fields.shift
    next if name.nil? # blank line

    seq, orf_id = name.split('|')
    coding = coding_info[seq]
    next if coding.nil?

    orf = coding[orf_id]
    next if orf.nil?

    score = fields.first.to_f
    if score > 0
      orf << score
    else
      coding.delete(orf_id)
      coding_info.delete(seq) if coding.empty?
    end
  end
  coding_info
end
.get_seqs(seqs) ⇒ Object
414 415 416 417 418 419 420 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 414
# Builds one FASTA-formatted String from +seqs+: a ">seq_name" header line
# followed by the seq_fasta line for every element. Returns '' for an empty list.
def self.get_seqs(seqs)
  seqs.each_with_object('') do |seq, fasta|
    fasta << ">#{seq.seq_name}\n#{seq.seq_fasta}\n"
  end
end
.init_work_manager(options) ⇒ Object
open files and prepare global data
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 91
# Opens the output files and prepares the class-level state of the manager.
#
# options - Hash of run options. Keys read here: :verbose, :fasta, :tax_group,
#           :acess_db, :chimera, :files2map and :remove_unmapped.
#
# Side effects: sets $verbose, creates the 'fln_results' directory if needed,
# loads the functional annotation indexes found under ENV['BLASTDB'], and
# opens (truncating) the result files inside 'fln_results'.
def self.init_work_manager(options)
  @@stats_hash = initialize_stats_hash
  @@stats_taxonomy = {}
  @@stats_functional_annotation_by_seqs = {}
  @@stats_different_prot_id = []
  @@stats_different_prot_id_complete_seqs = []
  @@pre_fln_seq_lengths = []
  @@seq_lengths = []
  @@map_object = {}

  @@options = options
  $verbose = options[:verbose]
  input_file = options[:fasta]

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  Dir.mkdir('fln_results') unless File.exist?('fln_results')

  # Column of each annotation type inside a functional annotation record.
  @@func_annot_type = {
    :go_id => 5,
    :go_description => 6,
    :kegg_id => 7,
    :interpro_id => 8,
    :interpro_description => 9,
    :ec_id => 10,
    :pfam_id => 11,
    :pfam_desc => 12,
    :unipathway_id => 13
  }

  # Load the 'sp_' index for flag 's' and then the 'tr_' index for flag 't'.
  @@functional_annotations = {}
  { 's' => 'sp_', 't' => 'tr_' }.each do |db_flag, prefix|
    next unless options[:acess_db].include?(db_flag)
    db_name = prefix + options[:tax_group]
    index_path = File.join(ENV['BLASTDB'], db_name, db_name + '.index')
    @@functional_annotations.merge!(load_functional_annotations(index_path))
  end

  @@fasta_file = FastaQualFile.new(input_file, '')
  file_head = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\te_value\tp_ident\ts_length\tprotein_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tgo_id\tgo_description\tkegg_id\tinterpro_id\tinterpro_description\tec_id\tpfam_id\tpfam_description\tunipathway_id"

  @@output_files = {}
  # Seq annotation files
  if !options[:chimera].nil?
    @@output_files[CHIMERA] = File.open("fln_results/chimeric_sequences.txt", 'w')
    @@output_files[CHIMERA].puts file_head
  elsif File.exist?("fln_results/chimeric_sequences.txt")
    # Remove a stale report left by a previous run with chimera detection on.
    File.delete("fln_results/chimeric_sequences.txt")
  end
  @@output_files[OTHER] = File.open('fln_results/artifact_other.txt', 'w')
  @@output_files[MISASSEMBLED] = File.open('fln_results/misassembled.txt', 'w')
  @@output_files[UNKNOWN] = File.open('fln_results/unknown.txt', 'w')
  @@output_files['db'] = File.open('fln_results/pt_seqs', 'w')
  @@output_files[CODING] = File.open('fln_results/new_coding.txt', 'w')
  @@output_files[NCRNA] = File.open('fln_results/nc_rnas.txt', 'w')

  # Complementary files
  @@output_files['align'] = File.open('fln_results/alignments.txt', 'w')
  @@output_files['prot'] = File.open('fln_results/proteins.fasta', 'w') # FASTA
  @@output_files['nts'] = File.open("fln_results/nt_seq.txt", 'w')
  @@output_files['seqs'] = File.open('fln_results/unigenes.fasta', 'w') # FASTA
  # 'stats_html' holds a path String, not an open handle.
  @@output_files['stats_html'] = 'fln_results/summary_stats.html'
  @@output_files['stats_txt'] = File.open('fln_results/summary_stats.txt', 'w')

  @@output_files[CODING].puts file_head
  @@output_files['db'].puts file_head
  @@output_files[NCRNA].puts file_head

  if !options[:files2map].empty?
    @@output_files['fpkm'] = File.open('fln_results/fpkm_per_transcript.txt', 'w')
    @@output_files['coverage'] = File.open('fln_results/coverage_per_transcript.txt', 'w')
    @@output_files['fpkm'].puts %w[Transcript_id fpkm Read_counts].join("\t")
    @@output_files['coverage'].puts %w[seq_name mean_normalized_differences mean_max mean_coverage proportion_sequence_mapped].join("\t")
    if options[:remove_unmapped]
      @@output_files[UNMAPPED] = File.open('fln_results/unmapped.txt', 'w')
      @@output_files[UNMAPPED].puts "Query_id\tfasta_length"
    end
  end

  # RepTrans module
  @@seqs_annotation_prot = []
  @@seqs_some_coding = []
  @@seqs_unknown = []

  # Transdecoder module
  @@complete_sure = []
  @@seqs_to_analyze = []

  # Mapping_info
  @@fpkm = {}
  @@coverage_analysis = {}
end
.load_functional_annotations(annotation_file) ⇒ Object
CUSTOM FUNCTIONS
309 310 311 312 313 314 315 316 317 318 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 309
# Loads a tab-separated annotation index.
#
# annotation_file - path to the index; the first column of every line is used
#                   as the key and the remaining columns as its value.
#
# Returns a Hash of first_column => Array of the remaining columns.
# Blank lines are skipped.
def self.load_functional_annotations(annotation_file)
  functional_annotations = {}
  # File.foreach closes the file handle once reading is done.
  File.foreach(annotation_file) do |line|
    fields = line.chomp.split("\t")
    acc = fields.shift
    next if acc.nil? # blank line: nothing to index

    functional_annotations[acc] = fields
  end
  functional_annotations
end
.orf_prediction_with_transdecoder ⇒ Object
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 390
# Runs TransDecoder over the sequences collected in @@seqs_to_analyze, using
# representatives of @@complete_sure as the training set, then applies the
# predicted ORFs to those sequences and writes them out.
#
# Work files (training_set.fasta, analyse_set.fasta and the 'transdecoder'
# work directory) are created inside @@options[:temp].
def self.orf_prediction_with_transdecoder
  clusters_seqs_annot_prot = clustering_by_id(@@complete_sure)
  final_seqs = select_representative(clusters_seqs_annot_prot)
  coding_info = nil
  Dir.chdir(@@options[:temp]) do
    orfs = get_seqs(final_seqs)
    File.open('training_set.fasta', 'w') { |f| f.write(orfs) }
    orfs = get_seqs(@@seqs_to_analyze)
    File.open('analyse_set.fasta', 'w') { |f| f.write(orfs) }

    cmd = "TransDecoder -t analyse_set.fasta --workdir transdecoder --train training_set.fasta"
    # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
    cmd << ' --reuse' if Dir.exist?('transdecoder')
    system(cmd)

    coding_info = get_coding_info('transdecoder/longest_orfs.pep')
    coding_info = get_scores('transdecoder/longest_orfs.cds.scores', coding_info)
    coding_info = correct_by_selected('transdecoder/longest_orfs.cds.scores.selected', coding_info)
  end

  @@seqs_to_analyze.each do |seq|
    coding = coding_info[seq.seq_name]
    asign_coding_attributes(seq, coding) if !coding.nil?
    repTrans_keep_seq(seq)
    seq.write_info(@@output_files)
  end
end
.repTrans_keep_seq(seq) ⇒ Object
364 365 366 367 368 369 370 371 372 373 374 375 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 364
# When the :reptrans option is set, appends +seq+ to the list that matches
# its type: COMPLETE..INTERNAL, CODING or UNKNOWN. Other types are ignored.
def self.repTrans_keep_seq(seq)
  return if @@options[:reptrans].nil?

  bucket = case seq.type
           when COMPLETE .. INTERNAL then @@seqs_annotation_prot
           when CODING then @@seqs_some_coding
           when UNKNOWN then @@seqs_unknown
           end
  bucket << seq unless bucket.nil?
end
.select_orf(orfs_hash) ⇒ Object
461 462 463 464 465 466 467 468 469 470 471 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 461
# Picks one ORF info array out of +orfs_hash+ (orf_id => info array).
#
# Selection order: keep the ORFs with the minimum first element (X ratio),
# prefer those whose type (info[1]) is 'complete', then take the first one
# with the maximum last element (score).
#
# NOTE: +orfs_hash+ is filtered in place.
#
# Returns the chosen info array, or nil when nothing is left.
def self.select_orf(orfs_hash)
  lowest_ratio = get_min_Xratio(orfs_hash)
  orfs_hash.keep_if { |_id, info| info.first == lowest_ratio }

  candidates = orfs_hash.select { |_id, info| info[1] == 'complete' }
  candidates = orfs_hash if candidates.empty?

  top_score = get_max_score(candidates)
  candidates.keep_if { |_id, info| info.last == top_score }
  candidates.each_value.first
end
Instance Method Details
#error_received(worker_error, obj) ⇒ Object
285 286 287 288 289 290 291 292 293 294 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 285
# Prints a warning describing a chunk that failed in a worker.
#
# worker_error - object whose #original_exception is the exception that was raised.
# obj          - the failed work object; must respond to #first.
#
# When the first element of +obj+ is a Sequence its name identifies the chunk;
# otherwise the whole object is shown with #inspect.
def error_received(worker_error, obj)
  sample = obj.first
  # The name is read from the sampled element; the container has no #seq_name.
  data = sample.class == Sequence ? sample.seq_name : obj.inspect

  cause = worker_error.original_exception
  # An exception that was never raised has a nil backtrace.
  trace = Array(cause.backtrace).join("\n")
  puts "WARNING!!!!!. CHUNK FAILED:Error while processing object #{data}\n" + cause.message + ":\n" + trace
end
#get_functional_annotations(seq) ⇒ Object
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 320
# Looks up the functional annotations of the hit of +seq+ and stores them in
# seq.functional_annotations as a Hash keyed by the @@func_annot_type symbols.
# Each GO description is also recorded in
# @@stats_functional_annotation_by_seqs (description => list of seq names).
#
# Does nothing (returns nil) when the accession has no annotation record.
def get_functional_annotations(seq)
  # gsub removes the splicing code of the UniProt accession
  all_info = @@functional_annotations[seq.hit.acc.gsub(/-\d+/, '')]
  return if all_info.nil?

  annotations = {}
  @@func_annot_type.each do |type, position|
    annotations[type] = all_info[position]
  end

  # A short record yields nil here (split drops trailing empty columns), so
  # to_s keeps a missing GO description from raising.
  annotations[:go_description].to_s.split(";").each do |annot|
    (@@stats_functional_annotation_by_seqs[annot] ||= []) << seq.seq_name
  end
  seq.functional_annotations = annotations
end
#next_work ⇒ Object
this method is called every time a worker needs new data to work on. It is invoked as many times as the chunk size dictates. It returns the work data, or nil if no more data is available
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 205
# Manages the workers' input: returns the next piece of work.
#
# - With files to map and reference files still pending: a mapping task.
# - With files to map, no pending reference files, and fewer mapping results
#   than @@options[:n_refs]: :sleep.
# - Otherwise: the next FLN task (nil when send_fln_data has nothing left).
def next_work
  return send_fln_data if @@options[:files2map].empty?
  return send_mapping_data unless @@options[:ref_files].empty?
  return :sleep if @@map_object.length < @@options[:n_refs]

  send_fln_data
end
#receive_fln_data(objs) ⇒ Object
265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 265
# Processes a finished FLN chunk: files each sequence into the TransDecoder
# and RepTrans lists, collects taxonomy, functional annotation and mapping
# data, writes the sequence, and updates the summary statistics.
#
# objs - list of sequence objects returned by a worker.
def receive_fln_data(objs)
  objs.each do |seq|
    transdecoder_keep_seq(seq)
    repTrans_keep_seq(seq)

    if seq.type > UNKNOWN && seq.type < NCRNA
      get_taxonomy(seq.hit.definition, @@stats_taxonomy)
      get_functional_annotations(seq)
    end

    @@fpkm[seq.seq_name] = seq.fpkm unless seq.fpkm.empty?
    @@coverage_analysis[seq.seq_name] = seq.coverage_analysis unless seq.coverage_analysis.empty?

    # Don't write Unknown or coding sequences when TransDecoder is used:
    # they are written after the ORF prediction instead.
    db_flags = @@options[:acess_db]
    held_for_transdecoder = !db_flags.include?('c') &&
                            db_flags.include?('p') &&
                            (seq.type == UNKNOWN || seq.type == CODING)
    write_seq(seq) unless held_for_transdecoder
  end

  @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths =
    summary_stats(objs, @@stats_hash, @@stats_different_prot_id, @@stats_different_prot_id_complete_seqs, @@seq_lengths)
end
#receive_mapping_data(objs) ⇒ Object
280 281 282 283 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 280
# Merges the mapping results in +objs+ (a Hash) into @@map_object.
# Returns @@map_object.
def receive_mapping_data(objs)
  @@map_object.update(objs)
end
#repTrans_keep_seq(seq) ⇒ Object
351 352 353 354 355 356 357 358 359 360 361 362 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 351
# Instance-level entry point for filing +seq+ into the RepTrans lists.
# It delegates to the class method of the same name, so the selection rules
# live in a single place.
def repTrans_keep_seq(seq)
  self.class.repTrans_keep_seq(seq)
end
#send_fln_data ⇒ Object
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 226
# Builds the next FLN task from the input FASTA file.
#
# When mapping results exist, the sequence receives its fpkm and coverage
# data, or empty lists if it has no mapping entry. Its length and statistics
# are recorded before it is sent.
#
# Returns { fln: sequence }, or nil when the input file is exhausted.
def send_fln_data
  n, f, q = @@fasta_file.next_seq
  return nil if n.nil?

  seq = Sequence.new(n, f, q)
  begin
    unless @@map_object.empty?
      cov_analysis = @@map_object[n]
      if cov_analysis.nil?
        seq.fpkm = []
        seq.coverage_analysis = []
      else
        # The last two values are the fpkm data; the rest is the coverage analysis.
        seq.fpkm = cov_analysis.pop(2)
        seq.coverage_analysis = cov_analysis
      end
    end
  rescue StandardError => e
    # Best effort: a problem with the mapping data must not stop the FLN task.
    # StandardError (not Exception) lets signals and exit requests through.
    puts e.message, e.backtrace.join("\n")
  end

  @@pre_fln_seq_lengths << f.length
  sequence_stats(seq, @@stats_hash)
  { fln: seq }
end
#send_mapping_data ⇒ Object
222 223 224 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 222
# Builds the next mapping task by removing the first pending reference file
# from @@options[:ref_files]. Returns { mapping: reference_file }.
def send_mapping_data
  next_reference = @@options[:ref_files].shift
  { mapping: next_reference }
end
#too_many_errors_received ⇒ Object
296 297 298 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 296
# Logs, through $LOG, that processing stops because too many errors happened.
def too_many_errors_received
  report = "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  $LOG.error(report)
end
#transdecoder_keep_seq(seq) ⇒ Object
377 378 379 380 381 382 383 384 385 386 387 388 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 377
# When the 'p' flag is set in @@options[:acess_db], files +seq+ for the
# TransDecoder step:
# - COMPLETE sequences with a true status and a hit identity of at least
#   @@options[:training_ident] go to the training set (@@complete_sure).
# - CODING and UNKNOWN sequences go to @@seqs_to_analyze.
def transdecoder_keep_seq(seq)
  return unless @@options[:acess_db].include?('p')

  case seq.type
  when COMPLETE
    @@complete_sure << seq if seq.status && seq.hit.ident >= @@options[:training_ident]
  when CODING, UNKNOWN
    @@seqs_to_analyze << seq
  end
end
#work_received(objs) ⇒ Object
this method is executed each time an obj is finished
255 256 257 258 259 260 261 262 263 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 255
# Manages the workers' output: dispatches a finished result to its handler.
#
# objs - Hash whose first entry is task => data, where task is :fln or :mapping.
#        Any other task is ignored.
def work_received(objs)
  task, data = objs.first
  case task
  when :fln then receive_fln_data(data)
  when :mapping then receive_mapping_data(data)
  end
end
#worker_initial_config ⇒ Object
send initial config
301 302 303 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 301
# Returns the initial configuration sent to the workers: the options Hash
# stored in @@options.
def worker_initial_config
  @@options
end
#write_seq(seq) ⇒ Object
write results to files
341 342 343 344 345 346 347 348 349 |
# File 'lib/full_lengther_next/my_worker_manager_fln.rb', line 341
# Writes the results of +seq+ to the output files. A failure is reported on
# standard output (sequence name, error message and backtrace) and does not
# stop the run.
def write_seq(seq)
  seq.write_info(@@output_files)
rescue StandardError => e
  # StandardError (not Exception) lets signals and exit requests through.
  puts "Error printing #{seq.seq_name}"
  puts e.message, e.backtrace.join("\n")
end