Class: MzID::StreamingParserLines

Inherits:
StreamingParser show all
Defined in:
lib/mzid/streaming_parser_lines.rb

Overview

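Class to parse an mzIdentML file in a streaming (i.e., memory-efficient) manner. PSMs, DBSequence and PeptideEvidence entries are read one line at a time by exploiting the structure of mzIdentML files instead of running a full XML parse; only Peptide entries are read through Nokogiri's streaming XML reader.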

Instance Method Summary collapse

Methods inherited from StreamingParser

#cache_pep_ev

Methods inherited from BatchParser

#cache_db_seq_entries, #cache_pep_ev, #each_spectrum

Constructor Details

#initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true) ⇒ StreamingParserLines

Returns a new instance of StreamingParserLines.



14
15
16
17
18
19
20
21
22
23
# File 'lib/mzid/streaming_parser_lines.rb', line 14

# Reset the parser's bookkeeping and delegate the rest of the setup to the
# superclass constructor.
#
# @param file passed unchanged to the superclass constructor
# @param sp_thresh accepted as part of the signature; not read by this constructor
# @param use_pbar passed unchanged to the superclass constructor
# @param tda_flag stored in @tda_flag
def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true)
  @tda_flag = tda_flag
  @num_spec = 0
  # the four pep_ev lookup hashes all start out empty
  @pep_ev_h_protID = {}
  @pep_ev_h_startPos = {}
  @pep_ev_h_endPos = {}
  @pep_ev_h_dbseqRef = {}
  super(file, use_pbar)
end

Instance Method Details

#cache_ids(use_pbar = @use_pbar) ⇒ Object

store peptide sequences in hash for lookup



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/mzid/streaming_parser_lines.rb', line 44

# Fill the lookup tables that are consulted when PSMs are reported.
#
# Three passes are made over @mzid_file:
# 1. Peptide elements, read through Nokogiri's streaming reader, fill
#    @pep_h (peptide id => sequence) and @mod_h (peptide id => modifications).
# 2. Lines opening a DBSequence element fill @db_seq_h
#    (DBSequence id => accession).
# 3. Lines opening a PeptideEvidence element fill @pep_ev_h
#    (PeptideEvidence id => PeptideEvidence object).
#
# All ids are stored as symbols. @db_seq_h and @pep_ev_h are not created
# here and must already exist (presumably set up by the superclass - verify).
#
# @param use_pbar show one progress bar per pass when truthy
# @raise [NoMethodError] if a matched line lacks one of the expected attributes
def cache_ids(use_pbar = @use_pbar)
  num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

  @pep_h = {}
  @mod_h = {}
  pbar1 = ProgressBar.new("peptides", num_pep/2) if use_pbar
  # block form of File.open: the handle is closed once the reader is done,
  # even if parsing raises part-way through
  File.open(@mzid_file) do |io|
    Nokogiri::XML::Reader(io).each do |node|
      # only Peptide items are of interest in this pass
      next unless node.name == "Peptide"
      # parse local peptide entry
      tmp_node = Nokogiri::XML.parse(node.outer_xml)
      tmp_node.remove_namespaces!
      root = tmp_node.root
      pep_id = root["id"].to_sym
      # skip if already handled PepID
      next if @pep_h.has_key?(pep_id)
      # parse sequence/mods if haven't seen it yet
      @pep_h[pep_id] = get_peptide_sequence(root)
      @mod_h[pep_id] = get_modifications(root)
      pbar1.inc if use_pbar
    end
  end
  pbar1.finish if use_pbar

  # now parse DBSequence items
  dbseq_re = /^\s*<DBSequence\s/
  pbar2 = ProgressBar.new("db_seq", num_db_seq) if use_pbar
  File.foreach(@mzid_file) do |line|
    next unless line =~ dbseq_re

    prot_id = line.match(/accession=\"([\w|\|]+)/)[1]
    db_id = line.match(/id=\"(\w+)/)[1]

    @db_seq_h[db_id.to_sym] = prot_id.to_sym
    pbar2.inc if use_pbar
  end
  pbar2.finish if use_pbar

  # now parse PeptideEvidence items
  pepev_re = /^\s*<PeptideEvidence\s/
  pbar3 = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
  File.foreach(@mzid_file) do |line|
    next unless line =~ pepev_re

    db_id = line.match(/dBSequence_ref=\"(\w+)/)[1]
    start_pos = line.match(/start=\"(\d+)/)[1].to_i
    end_pos = line.match(/end=\"(\d+)/)[1].to_i
    pep_ev = line.match(/id=\"(\w+)/)[1]
    # kept as the matched text (e.g. "true"), not converted to a boolean
    is_decoy = line.match(/isDecoy=\"(\w+)\"/)[1]
    @pep_ev_h[pep_ev.to_sym] = PeptideEvidence.new(:db_seq_ref => db_id.to_sym,
                                                   :start_pos => start_pos,
                                                   :end_pos => end_pos,
                                                   :is_decoy => is_decoy)
    pbar3.inc if use_pbar
  end
  pbar3.finish if use_pbar
end

#each_psm(use_pbar = @use_pbar) ⇒ Object

Iterate over every PSM by scanning the file one line at a time and recognising the relevant lines with regular expressions, which is faster than running an XML parser over the file.



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/mzid/streaming_parser_lines.rb', line 107

# Scan @mzid_file line by line and yield one PSM per
# SpectrumIdentificationItem element.
#
# A PSM is started at the opening SpectrumIdentificationItem tag (spectrum
# number taken from the second "_"-separated part of its id, peptide and
# modifications looked up in @pep_h / @mod_h via peptide_ref), collects the
# PeptideEvidenceRef ids and the MS-GF:SpecEValue that follow, and is
# yielded when the closing tag is reached.
#
# @param use_pbar show a progress bar over the lines of the file when truthy
# @yield [psm] the PSM completed by each closing SpectrumIdentificationItem tag
# @return [Enumerator] when called without a block
def each_psm(use_pbar=@use_pbar)
  return enum_for(:each_psm, use_pbar) unless block_given?

  pbar = nil
  if use_pbar
    # count the lines in Ruby instead of shelling out to `wc -l`: the path is
    # never interpolated into a shell command, so names containing spaces or
    # shell metacharacters are handled, and no external tool is required
    num_lines = File.foreach(@mzid_file).count
    pbar = ProgressBar.new("PSMs", num_lines)
  end

  specid_item_re = /^\s+<SpectrumIdentificationItem\s/
  pepevref_re = /^\s+<PeptideEvidenceRef\s/
  specprob_re = /name=\"MS-GF:SpecEValue\"\/>$/
  specid_item_end_re = /^\s+<\/SpectrumIdentificationItem>\s*$/

  curr_psm = nil
  File.foreach(@mzid_file) do |line|
    pbar.inc if pbar
    # lines not pertaining to a spectrum ID item match no branch and are skipped
    case line
    when specid_item_re
      # beginning of spectrum ID item
      spec_id_id = line.match(/id=\"(\w+)/)[1]
      spec_num = spec_id_id.split("_")[1].to_i
      pep_ref = line.match(/peptide_ref=\"(\w+)/)[1].to_sym
      curr_psm = PSM.new(:spec_num => spec_num,
                         :pep => @pep_h[pep_ref],
                         :mods => @mod_h[pep_ref])
    when pepevref_re
      pep_ev = line.match(/peptideEvidence_ref=\"(\w+)/)[1]
      curr_psm.add_pep_ev(pep_ev.to_sym) if curr_psm
    when specprob_re
      sprob = line.match(/value=\"([\d|\w|\.|-]+)\"/)[1]
      curr_psm.set_spec_prob(sprob.to_f) if curr_psm
    when specid_item_end_re
      yield curr_psm
      curr_psm = nil # kill current PSM object
    end
  end
  pbar.finish if pbar
end

#get_is_decoy(pep_ev_id) ⇒ Object



38
# File 'lib/mzid/streaming_parser_lines.rb', line 38

# Return the is-decoy value of the PeptideEvidence stored in @pep_ev_h
# under +pep_ev_id+.
def get_is_decoy(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_is_decoy
end

#get_pep_end(pep_ev_id) ⇒ Object



37
# File 'lib/mzid/streaming_parser_lines.rb', line 37

# Return the end position of the PeptideEvidence stored in @pep_ev_h
# under +pep_ev_id+.
def get_pep_end(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_end_pos
end

#get_pep_start(pep_ev_id) ⇒ Object



36
# File 'lib/mzid/streaming_parser_lines.rb', line 36

# Return the start position of the PeptideEvidence stored in @pep_ev_h
# under +pep_ev_id+.
def get_pep_start(pep_ev_id)
  evidence = @pep_ev_h[pep_ev_id]
  evidence.get_start_pos
end

#get_prot_id(pep_ev_id) ⇒ Object

get a protein ID from a PeptideEvidenceID



27
28
29
30
31
32
# File 'lib/mzid/streaming_parser_lines.rb', line 27

# Resolve a PeptideEvidence id to a protein id: the PeptideEvidence in
# @pep_ev_h supplies the DBSequence reference, which is then looked up
# in @db_seq_h.
def get_prot_id(pep_ev_id)
  @db_seq_h[@pep_ev_h[pep_ev_id].get_db_seq_ref]
end

#write_to_csv(outfile = "result.csv", use_pbar = @use_pbar) ⇒ Object

load PSMs into memory, and go back to perform lookup for prot ids



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/mzid/streaming_parser_lines.rb', line 147

# Write every PSM to a tab-separated file, one row per PeptideEvidence
# referenced by the PSM.
#
# Columns: #spec_num, peptide, spec_prob, decoy, prot_ids, start, end,
# num_prot. The "decoy" column is left out when @tda_flag is falsy.
# num_prot is the number of PeptideEvidence references of the PSM.
#
# @param outfile path of the file to write
# @param use_pbar passed on to #each_psm
def write_to_csv(outfile="result.csv", use_pbar=@use_pbar)
  # options are given as keywords: CSV.open declares them as keyword
  # arguments, and a positional options hash is rejected on Ruby 3
  CSV.open(outfile, "w", col_sep: "\t") do |csv|
    header = ["#spec_num", "peptide", "spec_prob", "decoy", "prot_ids", "start", "end", "num_prot"]
    header.delete("decoy") unless @tda_flag
    csv << header

    # forward use_pbar so an explicitly passed value is honoured
    each_psm(use_pbar) do |psm|
      pep_seq = psm.get_pep
      spec_num = psm.get_spec_num
      sp_prob = psm.get_spec_prob
      pep_ev_refs = psm.get_pep_ev
      # number of proteins with matching peptide
      num_prot = pep_ev_refs.size
      # for each PeptideEvidence, write a different line
      pep_ev_refs.each do |pepev|
        row = [spec_num, pep_seq, sp_prob, get_is_decoy(pepev), get_prot_id(pepev),
               get_pep_start(pepev), get_pep_end(pepev), num_prot]
        # index 3 is the decoy column
        row.delete_at(3) unless @tda_flag
        csv << row
      end
    end
  end
end