Class: ProtXMLToGFFTool

Inherits:
Tool
  • Object
show all
Defined in:
lib/protk/protxml_to_gff_tool.rb

Defined Under Namespace

Classes: CDSInfo

Instance Attribute Summary

Attributes inherited from Tool

#option_parser, #options, #options_defined_by_user

Instance Method Summary collapse

Methods inherited from Tool

#add_boolean_option, #add_value_option, #check_options, #database_info, default_output_path, extension_from_filename, #jobid_prefix, #jobid_prefix=, #method_missing, #run, #supported_options

Constructor Details

#initializeProtXMLToGFFTool

Returns a new instance of ProtXMLToGFFTool.



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/protk/protxml_to_gff_tool.rb', line 5

def initialize

	super([:explicit_output])

	@option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

	add_value_option(:database,nil,['-d filename','--database filename','Database used for ms/ms searches (Fasta Format)'])
	add_value_option(:genome,nil,['-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)'])
	add_value_option(:coords_file,nil,['-c filename','--coords-file filename.gff3', 'If genomic coordinates are not encoded in protein db entries look them up from a supplied gff file'])
	# add_value_option(:contig_regex,nil,['--contig-regex expression','Regular expression with a single capture group to get contig ids from protein ids'])
	add_value_option(:protein_find,nil,['-f term','--find term', 'Restrict output to proteins whose name matches the specified string'])
	add_value_option(:nterm_minlen,7,['-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len'])
	add_boolean_option(:skip_fasta_indexing,false,['--skip-index','Don\'t index database (Index should already exist)'])
	add_boolean_option(:stack_charge_states,false,['--stack-charge-states','Different peptide charge states get separate gff entries'])
	add_boolean_option(:collapse_redundant_proteins,false,['--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped'])
	add_value_option(:peptide_probability_threshold,0.95,['--threshold prob','Peptide Probability Threshold (Default 0.95)'])
	add_value_option(:protein_probability_threshold,0.99,['--prot-threshold prob','Protein Probability Threshold (Default 0.99)'])

end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class Tool

Instance Method Details

#add_putative_nterm_to_gff(gff_records, peptide_seq, protein_seq, protein_info, prot_id, peptide_count, dna_sequence, genomedb) ⇒ Object



449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
# File 'lib/protk/protxml_to_gff_tool.rb', line 449

def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
  pep_id = "#{prot_id}.p#{peptide_count.to_s}"
  signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
  if signal_peptide
    $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
    raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
    # Get raw signal_peptide sequence

    signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
    if signal_peptide_coords
     signal_peptide_coords.each do |spcoords|  
      signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
          gff_records += signal_peptide_gff
      end
    end
  end
end

#cds_info_from_fasta(fasta_entry) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/protk/protxml_to_gff_tool.rb', line 72

def cds_info_from_fasta(fasta_entry)
  info=CDSInfo.new
  info.fasta_id=fasta_entry
  positions = fasta_entry.identifiers.description.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
  info.coding_sequences=[]
  info.gene_id
  if ( positions.length < 1 )
    raise EncodingError
  elsif ( positions.length > 1)
    info.coding_sequences = positions[1..-1]
  end

  info.start = positions[0][0]
  info.end = positions[0][1]

  info.scaffold=fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
  info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]

  if fasta_entry.entry_id =~ /frame/
    info.frame=info.name.scan(/frame_(\d)/)[0][0]
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id=info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
    info.is_sixframe = false
  end
  info
end

#fragment_coords_from_protein_coords(pepstart, pepend, gene_start, gene_end, coding_sequences) ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/protk/protxml_to_gff_tool.rb', line 152

def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }


  # Assume positive strand
  pi_start=pepstart*3+gene_start-1
  pi_end=pepend*3+gene_start-1

  fragments=[]
  p_i = pi_start #Initially we are looking for the first fragment
  finding_start=true

  sorted_cds.each_with_index do |cds_coords, i|
    cds_start=cds_coords[0]
    cds_end = cds_coords[1]
    if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
      if sorted_cds.length <= i+1
        require 'debugger';debugger
      end

      next_coords = sorted_cds[i+1]
      intron_offset = ((next_coords[0]-cds_end)-1)
      p_i+=intron_offset
      pi_end+=intron_offset
      if !finding_start
        # This is a middle exon
        fragments << [cds_start,cds_end]
      end
    else 
      if finding_start

        if ( pi_end <= cds_end) #Whole peptide contained in a single exon
          fragments << [p_i+1,pi_end]
          break;
        end


        fragments << [p_i+1,(cds_end)]
        next_coords = sorted_cds[i+1]
        intron_offset = ((next_coords[0]-cds_end)-1)
        p_i+=intron_offset
        pi_end+=intron_offset
        p_i = pi_end
        finding_start=false
      else # A terminal exon
#        require 'debugger';debugger
        fragments << [(cds_start),(p_i)]
        break;
      end
    end
  end
  [fragments]
end

#generate_fragment_gffs_for_coords(coords, protein_info, pep_id, peptide_seq, genomedb, name = "fragment") ⇒ Object



270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# File 'lib/protk/protxml_to_gff_tool.rb', line 270

def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)

  fragment_phase = 0
  ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
  if name=="CDS"
    frag_id="#{pep_id}.fg"    
  else
    frag_id="#{pep_id}.sp"    
  end
  gff_lines = ordered_coords.collect do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]

    begin
      frag_frame = fragment_phase+1
      frag_seq = nil
      if ( protein_info.strand=='-')
        frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
      else
        frag_seq = frag_naseq.translate(frag_frame)
      end
    rescue
      if frag_naseq.length > 1
        puts "Unable to translate #{frag_naseq}"
#        require 'debugger'        
      end
      frag_seq="*"
    end

    fragment_record=Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
        feature_type=name,start_position=frag_start,end_position=frag_end,score='',
        strand=protein_info.strand,frame=fragment_phase,attributes=[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])


    remainder=(frag_naseq.length-fragment_phase) % 3
    fragment_phase=(3-remainder) % 3

    fragment_record
  end


  concat_seq=nil

  coords.each do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
    concat_seq += frag_naseq unless concat_seq == nil
    concat_seq = frag_naseq if concat_seq==nil
  end

  check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
  if ( check_seq != peptide_seq)
    # require 'debugger';debugger
    puts "Fragment seqs not equal to peptide seqs"
  end

  return gff_lines

end

#generate_gff_for_peptide_mapped_to_protein(protein_seq, peptide_seq, protein_info, prot_id, peptide_prob, peptide_count, dna_sequence, genomedb = nil) ⇒ Object



393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
# File 'lib/protk/protxml_to_gff_tool.rb', line 393

def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)

  prot_seq = protein_seq
  pep_seq = peptide_seq


  peptide_coords = get_peptide_coordinates(prot_seq,pep_seq,protein_info,dna_sequence)  

  if ( peptide_coords==nil ) # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
    return []
  end

  gff_records=[]

  # Now convert peptide coordinate to genome coordinates
  # And create gff lines for each match
  peptide_coords.each do |coords|

#    require 'debugger';debugger
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count.to_s}"
    pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",pep_seq]]

    pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
      feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=peptide_prob,
      strand=protein_info.strand,frame=nil,attributes=pep_attributes)

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs
    # require 'debugger';debugger
    # For peptides with only 1 tryptic terminus
    start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      start_codon_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
        feature_type="start_codon",start_position=start_codon_coords[0],end_position=start_codon_coords[1],score='',
        strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
      gff_records+=[start_codon_gff]
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if ( cterm_coords )
      cterm_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
        feature_type="cterm",start_position=cterm_coords[0],end_position=cterm_coords[1],score='',
        strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
      gff_records+=[start_codon_gff]
    end

  end
#  puts gff_records

  gff_records
end

#generate_protein_gff(protein_name, entry_info, prot_prob, prot_id) ⇒ Object



122
123
124
125
126
127
128
# File 'lib/protk/protxml_to_gff_tool.rb', line 122

def generate_protein_gff(protein_name,,prot_prob,prot_id)
  prot_qualifiers = {"source" => "MSMS", "score" => prot_prob, "ID" => prot_id}
  prot_attributes = [["ID",prot_id],["Name",.name]]
  prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = .scaffold,source="MSMS",feature_type="protein",
    start_position=.start,end_position=.end,score=prot_prob,strand=.strand,frame=nil,attributes=prot_attributes)
  prot_gff_line
end

#get_cterm_coords_for_peptide(peptide_genomic_start, peptide_genomic_end, peptide_seq, protein_seq, strand) ⇒ Object



349
350
351
352
353
354
355
356
357
358
359
# File 'lib/protk/protxml_to_gff_tool.rb', line 349

def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)

  if ( (peptide_seq[-1]!='K' && peptide_seq[-1]!='R' ) )

    codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
    # require 'debugger';debugger
    return [codon_coord,codon_coord+2]
  else
    return nil
  end
end

#get_dna_sequence(protein_info, genomedb) ⇒ Object



130
131
132
133
134
135
136
137
138
139
140
# File 'lib/protk/protxml_to_gff_tool.rb', line 130

def get_dna_sequence(protein_info,genomedb)

  scaffold_sequence = get_fasta_record(protein_info.scaffold,genomedb)
  gene_sequence = scaffold_sequence.naseq.to_s[(protein_info.start-1)..protein_info.end]

  if ( protein_info.strand == "-")
    gene_sequence = Bio::Sequence::NA.new(gene_sequence).reverse_complement
  end

  gene_sequence
end

#get_fasta_record(protein_name, fastadb) ⇒ Object



40
41
42
43
44
45
46
47
48
# File 'lib/protk/protxml_to_gff_tool.rb', line 40

def get_fasta_record(protein_name,fastadb)
#  puts "Looking up #{protein_name}"
  entry = fastadb.get_by_id protein_name
  if ( entry == nil)
    puts "Failed lookup for #{protein_name}"
    raise KeyError
  end
  entry
end

#get_nterm_peptide_for_peptide(peptide_seq, protein_seq) ⇒ Object



362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# File 'lib/protk/protxml_to_gff_tool.rb', line 362

def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
  pi=protein_seq.index(peptide_seq)
  if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
    # Since trypsin sometimes cleaves before P (ie breaking the rule) 
    # we don't check for it and assume those cases are real tryptic termini
    reverse_leader_seq=protein_seq[0..pi].reverse
    mi=reverse_leader_seq.index('M')

    if ( mi==nil )
      puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
      return nil
    end

    mi=pi-mi

    ntermseq=protein_seq[mi..(pi-1)]

    # if ( ntermseq.length < minlen )
    #   return nil
    # end

#    $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
#    require 'debugger';debugger
    full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"

    return full_seq_with_annotations
  else
    return nil
  end
end

#get_peptide_coordinates(prot_seq, pep_seq, protein_info, gene_seq) ⇒ Object

Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]



261
262
263
264
265
266
267
# File 'lib/protk/protxml_to_gff_tool.rb', line 261

def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  if ( protein_info.is_sixframe)
    return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  else
    return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  end
end

#get_peptide_coordinates_from_transcript_info(prot_seq, pep_seq, protein_info, gene_seq) ⇒ Object

gene_seq should already have been reverse_complemented if on reverse strand



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/protk/protxml_to_gff_tool.rb', line 208

def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  # if ( peptide_is_in_sixframe(pep_seq,gene_seq))
    # Peptide is in 6-frame but on a predicted transcript
    # return nil
  # else

    # puts "Found a gap #{protein_info.fasta_id}"
    if protein_info.strand=='-'
      pep_index = prot_seq.reverse.index(pep_seq.reverse)
      if pep_index==nil
#        require 'debugger';debugger
        puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
        return nil
      end
      pep_start_i = prot_seq.reverse.index(pep_seq.reverse)+1
      # Plus 1 because on reverse stand stop-codon will be at the beginning of the sequence (when read forwards). Need to eliminate it.
    else
      pep_start_i = prot_seq.index(pep_seq)
      if pep_start_i==nil
#        require 'debugger';debugger
        puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
        return nil
      end
    end
    pep_end_i = pep_start_i+pep_seq.length

    return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
  # end
end

#get_peptide_coordinates_sixframe(prot_seq, pep_seq, protein_info) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/protk/protxml_to_gff_tool.rb', line 238

def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)

  if ( protein_info.strand == '-' )
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  start_indexes = [0]
  
  prot_seq.scan /#{pep_seq}/  do |match| 
  start_indexes << prot_seq.index(match,start_indexes.last)
  end  
  start_indexes.delete_at(0)

  start_indexes.collect do |si| 
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]  
  end

end

#get_start_codon_coords_for_peptide(peptide_genomic_start, peptide_genomic_end, peptide_seq, protein_seq, strand) ⇒ Object



330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
# File 'lib/protk/protxml_to_gff_tool.rb', line 330

def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pi=protein_seq.index(peptide_seq)
  if ( protein_seq[pi]=='M' )
    is_tryptic=false
    if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R') )
      is_tryptic=true
    elsif (pi==0)
      is_tryptic=true
    end
    return nil unless is_tryptic

    start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
    # require 'debugger';debugger
    return [start_codon_coord,start_codon_coord+2]
  else
    return nil
  end
end

#is_new_genome_location(candidate_entry, existing_entries) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/protk/protxml_to_gff_tool.rb', line 103

def is_new_genome_location(candidate_entry,existing_entries)
  # puts existing_entries
  # require 'debugger';debugger

  # genes=existing_entries.collect { |e|  e.gene_id  }.compact

  # if genes.include?(candidate_entry.gene_id)
  #   return false
  # end

  existing_entries.each do |existing|  
    return false if existing.gene_id==candidate_entry.gene_id
    return false if existing.overlap(candidate_entry)
  end

  return true
end

#peptide_gff_is_duplicate(peptide_gff, peptides_covered_genome) ⇒ Object



467
468
469
470
471
472
473
474
# File 'lib/protk/protxml_to_gff_tool.rb', line 467

def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
  nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
  pep_seq = peptide_gff.attributes[nameindex][1]
  existing = peptides_covered_genome[pep_seq]
  return true if existing==peptide_gff.start

  return false
end

#peptide_is_in_sixframe(pep_seq, gene_seq) ⇒ Object



142
143
144
145
146
147
148
149
150
# File 'lib/protk/protxml_to_gff_tool.rb', line 142

def peptide_is_in_sixframe(pep_seq,gene_seq)
  gs=Bio::Sequence::NA.new(gene_seq)
  (1..6).each do |frame|  
    if gs.translate(frame).index(pep_seq)
      return true
    end
  end
  return false
end

#peptide_nodes(protein_node) ⇒ Object



35
36
37
# File 'lib/protk/protxml_to_gff_tool.rb', line 35

def peptide_nodes(protein_node)
  return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
end

#protein_names(protein_node) ⇒ Object



26
27
28
29
30
31
32
33
# File 'lib/protk/protxml_to_gff_tool.rb', line 26

def protein_names(protein_node)
  indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  prot_names = [protein_node['protein_name']]
  for protein in indis_proteins
   	prot_names += [protein['protein_name']]
  end
  prot_names
end