Class: Ms::Ident::Peptide::Db

Inherits:
Hash
  • Object
show all
Defined in:
lib/ms/ident/peptide/db.rb

Overview

the object itself is a modified Hash. It is initialized with the database file and a protein array can be retrieved with the #[] method given an amino acid sequence. All other methods are untested at this time and should be avoided!

Defined Under Namespace

Classes: IO

Constant Summary collapse

MAX_NUM_AA_EXPANSION =
3
STANDARD_AA =

the twenty standard amino acids

%w(A C D E F G H I K L M N P Q R S T V W Y)
DEFAULT_PEPTIDE_CENTRIC_DB =
{:missed_cleavages => 2, :min_length => 4, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
PROTEIN_DELIMITER =
"\t"
KEY_VALUE_DELIMITER =
": "

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(db_file) ⇒ Db

Returns a new instance of Db.



173
174
175
# File 'lib/ms/ident/peptide/db.rb', line 173

def initialize(db_file)
  self.replace(YAML.load_file(db_file))
end

Class Method Details

.cmdline(argv) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/ms/ident/peptide/db.rb', line 24

def self.cmdline(argv)

  opt = {
    :remove_digestion_file => true,
    :enzyme => Ms::InSilico::Digester::TRYPSIN
  }
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator "    <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator "    PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""    
    op.separator "    Initiator Methionines - by default, will generate two peptides"
    op.separator "    for any peptide found at the N-termini starting with 'M'"
    op.separator "    (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
    op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Ms::Insilico::Digester.const_get(v.upcase) }
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts Ms::InSilico::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  if argv.size == 0
    puts opts || exit
  end

  argv.map do |file|
    Ms::Ident::Peptide::Db.peptide_centric_db(file, opt)
  end
end

.expand_peptides(peptide, expand_aa) ⇒ Object

does combinatorial expansion of all letters requesting it. expand_aa is hash like: ‘X’=>STANDARD_AA returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to be expanded returns an empty array if there is no expansion



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/ms/ident/peptide/db.rb', line 151

def self.expand_peptides(peptide, expand_aa)
  letters_in_order = expand_aa.keys.sort
  index_and_key = []
  peptide.split('').each_with_index do |char,i| 
    if let_index = letters_in_order.index(char) 
      index_and_key << [i, letters_in_order[let_index]]
    end
  end
  if index_and_key.size > MAX_NUM_AA_EXPANSION
    return nil
  end
  to_expand = [peptide]
  index_and_key.each do |i,letter|
    new_peps = []
    while current_pep = to_expand.shift do
      new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
    end
    to_expand = new_peps.flatten
  end
  to_expand
end

.peptide_centric_db(fasta_file, opts = {}) ⇒ Object

writes a new file with the added ‘min_aaseq<Integer>’ creates a temporary digestion file that contains all peptides digesting with certain missed_cleavages (i.e., min_seq_length is not applied to this file but on the final peptide centric db) returns the full name of the written file.



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/ms/ident/peptide/db.rb', line 68

def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa) 
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  if expand_aa
    letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
  end

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
          m_peps = []
          init_methionine_peps = []
          peptides.each do |pep|
            # if the peptide is at the beginning of the protein sequence
            if prot.sequence[0,pep.size] == pep
              m_peps << pep[1..-1]
            end
          end
          peptides.push(*m_peps)
        end
        if expand_aa
          peptides = peptides.map do |pep|
            if pep =~ letters_to_expand_re
              expand_peptides(pep, expand_aa)
            else
              pep
            end
          end.flatten
        end
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE


  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  hash = Hash.new {|h,k| h[k] = [] }
  ::IO.foreach(digestion_file) do |line|
    (prot, *peps) = line.chomp!.split(/\s+/)
    # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot 
    peps.each do |pep|
      if pep.size >= min_length
        hash[pep] << prot
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |k,v|
      out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  if remove_digestion_file
    File.unlink(digestion_file)
  end
  File.expand_path(final_outfile)
end

Instance Method Details

#[](key) ⇒ Object

returns the protein id’s as an array



180
181
182
# File 'lib/ms/ident/peptide/db.rb', line 180

def [](key)
  old_bracket(key).chomp.split(PROTEIN_DELIMITER)
end