Module: Ms::Ident

Defined in:
lib/ms/ident.rb

Constant Summary collapse

IPI_RE =
/IPI:([\w\d\.]+)\|/
GI_RE =
/gi|([\w\d\.]+)\|/
STANDARD_AA =

the twenty standard amino acids

%w(A C D E F G H I K L M N P Q R S T V W Y)
DEFAULT_PEPTIDE_CENTRIC_DB =
{:missed_cleavages => 1, :min_length => 8, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}

Class Method Summary collapse

Class Method Details

.expand_peptides(peptide, expand_aa) ⇒ Object

does combinatorial expansion of all letters requesting it. expand_aa is hash like: ‘X’=>STANDARD_AA



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/ms/ident.rb', line 105

def self.expand_peptides(peptide, expand_aa)
  letters_in_order = expand_aa.keys.sort
  index_and_key = []
  peptide.split('').each_with_index do |char,i| 
    if let_index = letters_in_order.index(char) 
      index_and_key << [i, letters_in_order[let_index]]
    end
  end
  to_expand = [peptide]
  index_and_key.each do |i,letter|
    new_peps = []
    while current_pep = to_expand.shift do
      new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
    end
    to_expand = new_peps.flatten
  end
  to_expand
end

.peptide_centric_db(fasta_file, opts = {}) ⇒ Object

writes a new file with the added ‘min_aaseq<Integer>’ creates a temporary digestion file that contains all peptides digesting with certain missed_cleavages (i.e., min_seq_length is not applied to this file but on the final peptide centric db)



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/ms/ident.rb', line 19

def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa) 

  unless id_regexp
    id_regexp = Ms::Fasta.id_regexp(Ms::Fasta.filetype(fasta_file))
    raise RuntimeError, "fasta file type not recognized, supply id_regexp" unless id_regexp
  end

  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  if expand_aa
    letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
  end

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
          m_peps = []
          init_methionine_peps = []
          peptides.each do |pep|
            # if the peptide is at the beginning of the protein sequence
            if prot.sequence[0,pep.size] == pep
              m_peps << pep[1..-1]
            end
          end
          peptides.push(*m_peps)
        end
        if expand_aa
          peptides = peptides.map do |pep|
            if pep =~ letters_to_expand_re
              expand_peptides(pep, expand_aa)
            else
              pep
            end
          end.flatten
        end
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  
  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  hash = Hash.new {|h,k| h[k] = [] }
  IO.foreach(digestion_file) do |line|
    (prot, *peps) = line.chomp!.split(/\s+/)
    id = prot.match(id_regexp)[1]
    peps.each do |pep|
      if pep.size >= min_length
        hash[pep] << id
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |k,v|
      out.puts( "#{k}: #{v.join('-')}" )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  if remove_digestion_file
    File.unlink(digestion_file)
  end

end