Class: Peptide

Inherits:
Object
  • Object
show all
Defined in:
lib/protk/peptide.rb

Direct Known Subclasses

IndistinguishablePeptide

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initializePeptide

Returns a new instance of Peptide.



105
106
107
# File 'lib/protk/peptide.rb', line 105

def initialize()

end

Instance Attribute Details

#chargeObject

Returns the value of attribute charge.



20
21
22
# File 'lib/protk/peptide.rb', line 20

def charge
  @charge
end

#indistinguishable_peptidesObject

Returns the value of attribute indistinguishable_peptides.



25
26
27
# File 'lib/protk/peptide.rb', line 25

def indistinguishable_peptides
  @indistinguishable_peptides
end

#modificationsObject

Returns the value of attribute modifications.



23
24
25
# File 'lib/protk/peptide.rb', line 23

def modifications
  @modifications
end

#modified_sequenceObject

Returns the value of attribute modified_sequence.



24
25
26
# File 'lib/protk/peptide.rb', line 24

def modified_sequence
  @modified_sequence
end

#probabilityObject

Returns the value of attribute probability.



21
22
23
# File 'lib/protk/peptide.rb', line 21

def probability
  @probability
end

#protein_nameObject

Returns the value of attribute protein_name.



19
20
21
# File 'lib/protk/peptide.rb', line 19

def protein_name
  @protein_name
end

#sequenceObject

Stripped sequence (no modifications)



18
19
20
# File 'lib/protk/peptide.rb', line 18

def sequence
  @sequence
end

#theoretical_neutral_massObject

Returns the value of attribute theoretical_neutral_mass.



22
23
24
# File 'lib/protk/peptide.rb', line 22

def theoretical_neutral_mass
  @theoretical_neutral_mass
end

Class Method Details

.from_mzid(xmlnode, mzid_doc) ⇒ Object

<ProteinDetectionHypothesis id=“PAG_0_1” dBSequence_ref=“JEMP01000193.1_rev_g3500.t1 280755” passThreshold=“false”> <PeptideHypothesis peptideEvidence_ref=“PepEv_1”> <SpectrumIdentificationItemRef spectrumIdentificationItem_ref=“SII_1_1”/> </PeptideHypothesis> <cvParam cvRef=“PSI-MS” accession=“MS:1002403” name=“group representative”/> <cvParam cvRef=“PSI-MS” accession=“MS:1002401” name=“leading protein”/> <cvParam cvRef=“PSI-MS” accession=“MS:1001093” name=“sequence coverage” value=“0.0”/> </ProteinDetectionHypothesis>



75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/protk/peptide.rb', line 75

def from_mzid(xmlnode,mzid_doc)
  pep=new()
  pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
  best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
  # require 'byebug';byebug if !best_psm
  pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
  pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
  pep.charge = best_psm.attributes['chargeState'].to_i
  pep.protein_name = mzid_doc.get_dbsequence(xmlnode.parent,xmlnode.parent.attributes['dBSequence_ref']).attributes['accession']


  pep
end

.from_protxml(xmlnode) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/protk/peptide.rb', line 37

def from_protxml(xmlnode)
  pep=new()
  pep.sequence=xmlnode['peptide_sequence']
  pep.probability=xmlnode['nsp_adjusted_probability'].to_f
  pep.charge=xmlnode['charge'].to_i

  # This deal with the case where mods are on the primary peptide
  #
  mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')

  # The pepXML spec says there can be multiple modification_info's but in practice there never is.
  # We assume either 1 or 0
  if ( mod_info_node.length > 0 )
    throw "Encountered multiple modification_info nodes for a peptide" if mod_info_node.length > 1
    pep.modified_sequence = mod_info_node[0]['modified_peptide']
    mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
    # require 'byebug';byebug
    pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
  end

  # This deals with indistinguishable peptides
  #
  ips = xmlnode.find('protxml:indistinguishable_peptide','protxml:http://regis-web.systemsbiology.net/protXML')
  # require 'byebug';byebug
  pep.indistinguishable_peptides = ips.collect { |e| IndistinguishablePeptide.from_protxml(e) }

  pep
end

.from_sequence(seq, charge = nil) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
# File 'lib/protk/peptide.rb', line 89

def from_sequence(seq,charge=nil)
  pep=new()

  pep.modifications = pep.modifications_from_sequence(seq)
  pep.modified_sequence = seq

  seq = seq.sub(/^n\[[0-9]+?\]/,"")
  pep.sequence = seq.gsub(/[0-9\.\[\]]/,"")
  pep.charge=charge
  pep
end

Instance Method Details

#as_protxmlObject



27
28
29
30
31
32
33
34
# File 'lib/protk/peptide.rb', line 27

def as_protxml
  node = XML::Node.new('peptide')
  node['peptide_sequence']=self.sequence.to_s
  node['charge']=self.charge.to_s
  node['nsp_adjusted_probability']=self.probability.to_s
  node['calc_neutral_pep_mass']=self.theoretical_neutral_mass.to_s
  node
end

#coords_in_protein(prot_seq, reverse = false) ⇒ Object

Expects prot_seq not to contain explicit stop codon (ie * at end) AA coords are 0-based unlike genomic coords which are 1 based



132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/protk/peptide.rb', line 132

def coords_in_protein(prot_seq,reverse=false)
  if reverse
    pep_index = prot_seq.reverse.index(self.sequence.reverse)
    raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_index.nil?
    pep_start_i = pep_index
  else
    pep_start_i = prot_seq.index(self.sequence)
    raise PeptideNotInProteinError.new("Peptide #{self.sequence} not found in protein #{prot_seq} ") if pep_start_i.nil?      
  end
  pep_end_i = pep_start_i+self.sequence.length
  {:start => pep_start_i,:end => pep_end_i}
end

#gff_record_for_peptide_fragment(start_i, end_i, parent_record, record_info) ⇒ Object



270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/protk/peptide.rb', line 270

def gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info)
  cds_id = parent_record.id
  mod_sequence = record_info[:modified_sequence]
  this_id = mod_sequence ? "#{cds_id}.#{mod_sequence}" : "#{cds_id}.#{self.sequence}"
  this_id << ".#{self.charge}" unless self.charge.nil?
  mod = record_info[:mod]
  this_id << ".#{mod.position}.#{mod.mass}" unless mod.nil?
  score = self.probability.nil? ? "." : self.probability.to_s
  record_type = mod.nil? ? record_info[:type] : "#{record_info[:type]}_#{mod.amino_acid}"
  gff_string = "#{parent_record.seqid}\tMSMS\t#{record_type}\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
  Bio::GFF::GFF3::Record.new(gff_string)
end

#gff_records_for_coords_in_protein(aa_coords, seqlen, parent_record, cds_records, record_info = {:type => "polypeptide"}) ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/protk/peptide.rb', line 207

def gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"})
  on_reverse_strand = (parent_record.strand=="-") ? true : false
  ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort

  # Initial position is the number of NA's from the start of translation
  #
  pep_nalen = seqlen*3

  i = 0; #Current protein position (in nucleic acids)

  pep_start_i = aa_coords[:start]*3
  pep_end_i = pep_start_i+seqlen*3
  gff_records=[]
  ordered_cds_records.each do |cds_record|

    gff_record = nil
    fragment_len = 0
    if on_reverse_strand

      in_peptide = (i<pep_end_i) && (i>=pep_start_i)
      before_len = [pep_start_i-i,0].max

      if in_peptide
        fragment_end = cds_record.end
        fragment_len = [cds_record.length,pep_end_i-i].min
        fragment_start = fragment_end-fragment_len+1
        gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
      elsif before_len>0
        fragment_end = cds_record.end - before_len
        fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
        fragment_start = fragment_end - fragment_len + 1
        if fragment_len>0
          gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
        end
      else
        gff_record=nil
      end        
    else
      in_peptide = (i<pep_end_i) && (i>=pep_start_i)
      before_len = [pep_start_i-i,0].max
      if in_peptide
        fragment_start = cds_record.start
        fragment_len = [cds_record.length,pep_end_i-i].min
        fragment_end = fragment_start+fragment_len-1
        gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
      elsif before_len>0
        fragment_start = cds_record.start + before_len
        fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
        fragment_end = fragment_start + fragment_len-1
        if fragment_len>0
          gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
        end
      else
        gff_record = nil
      end

    end
    i+=cds_record.length
    gff_records << gff_record unless gff_record.nil?
  end
  gff_records
end

#modifications_from_sequence(seq) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/protk/peptide.rb', line 109

def modifications_from_sequence(seq)

  seq = seq.sub(/^n\[[0-9]+?\]/,"")
  offset = 0
  mods = seq.enum_for(:scan, /([A-Z])\[([0-9\.]+)\]/).map {
    pm = PeptideMod.from_data(Regexp.last_match.begin(0)+1-offset,Regexp.last_match.captures[0],Regexp.last_match.captures[1].to_f)
    offset += Regexp.last_match.captures[1].length+2
    pm
  }

  # if ( seq == "N[115]VMN[115]LTPAETQ[129]QLHAALESQLSPGELAK" )
  #  require 'byebug';byebug
  #  puts "hi"
  # end


  mods
end

#mods_to_gff3_records(prot_seq, parent_record, cds_records) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/protk/peptide.rb', line 173

def mods_to_gff3_records(prot_seq,parent_record,cds_records)

  throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
  throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

  pep_aa_coords = coords_in_protein(prot_seq,false)

  mod_records = []
  
  unless ( self.modifications.nil? )
    self.modifications.each { |mod|
      prot_position = mod.position+pep_aa_coords[:start]
      mod_aa_coords = {:start => prot_position, :end => prot_position+1}
      mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => self.modified_sequence}) 
    }
  end

  unless ( self.indistinguishable_peptides.nil? )
    self.indistinguishable_peptides.each { |ip|
      unless ( ip.modifications.nil? )
        ip.modifications.each { |mod|
          prot_position = mod.position+pep_aa_coords[:start]-1
          mod_aa_coords = {:start => prot_position, :end => prot_position+1}
          mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => ip.modified_sequence}) 
        }
      end
    } 
  end

  mod_records.flatten

end

#to_gff3_records(prot_seq, parent_record, cds_records) ⇒ Object

Returns a list of fragments (hashes with start and end) in GFF style (1 based) genomic coordinates

Assumes that cds_coords is inclusive of the entire protein sequence including start-met

We assume that gff records conform to the spec

www.sequenceontology.org/gff3.shtml

This part of the spec is crucial

  • The START and STOP codons are included in the CDS.

  • That is, if the locations of the start and stop codons are known,

  • the first three base pairs of the CDS should correspond to the start codon

  • and the last three correspond the stop codon.

We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)



163
164
165
166
167
168
169
170
171
# File 'lib/protk/peptide.rb', line 163

def to_gff3_records(prot_seq,parent_record,cds_records)

  throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
  throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

  aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates

  gff_records_for_coords_in_protein(aa_coords,self.sequence.length,parent_record,cds_records)    
end