Class: MzID::StreamingParserLines
- Inherits:
-
StreamingParser
- Object
- BaseParser
- BatchParser
- StreamingParser
- MzID::StreamingParserLines
- Defined in:
- lib/mzid/streaming_parser_lines.rb
Overview
class to parse an mzIdentML file in a streaming (i.e., mem-efficient) manner not using any XML parsing library, only exploiting the structure of mzIdentML files
Instance Method Summary collapse
-
#cache_ids(use_pbar = @use_pbar) ⇒ Object
store peptide sequences in hash for lookup.
-
#each_psm(use_pbar = @use_pbar) ⇒ Object
Iterate through each PSM by scanning the file one line at a time, which is faster than using an XML parser.
- #get_is_decoy(pep_ev_id) ⇒ Object
- #get_pep_end(pep_ev_id) ⇒ Object
- #get_pep_start(pep_ev_id) ⇒ Object
-
#get_prot_id(pep_ev_id) ⇒ Object
get a protein ID from a PeptideEvidenceID.
-
#initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true) ⇒ StreamingParserLines
constructor
A new instance of StreamingParserLines.
-
#write_to_csv(outfile = "result.csv", use_pbar = @use_pbar) ⇒ Object
load PSMs into memory, and go back to perform lookup for prot ids.
Methods inherited from StreamingParser
Methods inherited from BatchParser
#cache_db_seq_entries, #cache_pep_ev, #each_spectrum
Constructor Details
#initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true) ⇒ StreamingParserLines
Returns a new instance of StreamingParserLines.
14 15 16 17 18 19 20 21 22 23 |
# File 'lib/mzid/streaming_parser_lines.rb', line 14
#
# Constructor: records the target-decoy flag, creates the per-PeptideEvidence
# lookup hashes and delegates the remaining setup to the parent class.
#
# @param file [String] path of the mzIdentML file, forwarded to +super+
# @param sp_thresh [Float] accepted for interface compatibility
#   NOTE(review): not referenced anywhere in this body -- confirm whether the
#   parent class is supposed to receive it.
# @param use_pbar [Object, nil] progress-bar switch, forwarded to +super+
# @param tda_flag [Boolean] stored in @tda_flag
def initialize(file, sp_thresh = 10.0**-10, use_pbar = nil, tda_flag = true)
  @num_spec = 0
  @tda_flag = tda_flag
  #
  # NOTE(review): the flattened listing shows a single "#" before the next
  # assignment; it is read here as a bare separator line, so all four hashes
  # are created -- confirm against the original file.
  @pep_ev_h_protID = Hash.new
  @pep_ev_h_startPos = Hash.new
  @pep_ev_h_endPos = Hash.new
  @pep_ev_h_dbseqRef = Hash.new
  # NOTE(review): the second argument was stripped from the listing; use_pbar
  # is the reconstruction that matches the stripped parameter name -- confirm.
  super(file, use_pbar)
end
Instance Method Details
#cache_ids(use_pbar = @use_pbar) ⇒ Object
store peptide sequences in hash for lookup
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/mzid/streaming_parser_lines.rb', line 44
#
# Store peptide sequences in hash for lookup.
#
# Makes three passes over @mzid_file:
# 1. <Peptide> elements (via Nokogiri's streaming reader) fill @pep_h and
#    @mod_h, keyed by the peptide id as a Symbol.
# 2. <DBSequence> lines fill @db_seq_h (DBSequence id => accession).
# 3. <PeptideEvidence> lines fill @pep_ev_h (evidence id => PeptideEvidence).
#
# @db_seq_h, @pep_ev_h and @pep_ev_h_dbseqRef are only written here; they are
# expected to have been created before this method runs.
#
# @param use_pbar [Object, nil] when truthy a ProgressBar is shown per pass
# @return [Object] the value of the last statement; not meant to be used
def cache_ids(use_pbar = @use_pbar)
  num_pep, num_db_seq, num_pep_ev = get_num_elements(nil)

  @pep_h = Hash.new
  @mod_h = Hash.new

  # NOTE(review): the bar total is num_pep/2 -- presumably the reader visits
  # each Peptide element more than once; confirm.
  pbar = ProgressBar.new("peptides", num_pep/2) if use_pbar
  # Block form of File.open so the handle is closed once the reader is done
  # (the previous bare File.open was never closed).
  File.open(@mzid_file) do |io|
    reader = Nokogiri::XML::Reader(io)
    reader.each do |node|
      # parse Peptide items
      next unless node.name == "Peptide"

      # parse local peptide entry
      tmp_node = Nokogiri::XML.parse(node.outer_xml)
      tmp_node.remove_namespaces!
      root = tmp_node.root
      pep_id = root["id"].to_sym
      # skip if already handled PepID
      next if @pep_h.has_key?(pep_id)

      # parse sequence/mods if haven't seen it yet
      @pep_h[pep_id] = get_peptide_sequence(root)
      @mod_h[pep_id] = get_modifications(root)
      pbar.inc if use_pbar
    end
  end
  pbar.finish if use_pbar

  # now parse DBSequence items
  dbseq_re = /^\s*<DBSequence\s/
  pbar = ProgressBar.new("db_seq", num_db_seq) if use_pbar
  # File.foreach rather than IO.foreach: the path is always treated as a
  # plain file name.
  File.foreach(@mzid_file) do |line|
    next if !dbseq_re.match(line)

    prot_id = line.match(/accession=\"([\w|\|]+)/)[1]
    db_id = line.match(/id=\"(\w+)/)[1]

    @db_seq_h[db_id.to_sym] = prot_id.to_sym
    pbar.inc if use_pbar
  end
  pbar.finish if use_pbar

  # now parse PeptideEvidence items
  pepev_re = /^\s*<PeptideEvidence\s/
  pbar = ProgressBar.new("pep_ev", num_pep_ev) if use_pbar
  File.foreach(@mzid_file) do |line|
    next if !pepev_re.match(line)

    db_id = line.match(/dBSequence_ref=\"(\w+)/)[1]
    start_pos = line.match(/start=\"(\d+)/)[1].to_i
    end_pos = line.match(/end=\"(\d+)/)[1].to_i
    pep_ev = line.match(/id=\"(\w+)/)[1]
    # is_decoy is kept as the attribute's text, not converted to a boolean
    is_decoy = line.match(/isDecoy=\"(\w+)\"/)[1]
    #
    # NOTE(review): the flattened listing shows a single "#" before this
    # assignment; it is read here as a bare separator line, so the write is
    # kept -- confirm against the original file.
    @pep_ev_h_dbseqRef[pep_ev.to_sym] = db_id.to_sym
    @pep_ev_h[pep_ev.to_sym] = PeptideEvidence.new(:db_seq_ref => db_id.to_sym,
                                                   :start_pos => start_pos,
                                                   :end_pos => end_pos,
                                                   :is_decoy => is_decoy)
    pbar.inc if use_pbar
  end
  pbar.finish if use_pbar
end
#each_psm(use_pbar = @use_pbar) ⇒ Object
iterate through each psm by identifying them parsing the file one line at a time - faster than using XML parser
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/mzid/streaming_parser_lines.rb', line 107
#
# Iterate through each PSM by scanning the file one line at a time, which is
# faster than using an XML parser.
#
# A PSM is opened by a <SpectrumIdentificationItem ...> line, collects every
# <PeptideEvidenceRef .../> line and the MS-GF:SpecEValue line that follow,
# and is yielded when the closing </SpectrumIdentificationItem> line is seen.
# Peptide sequence and modifications are looked up in @pep_h / @mod_h.
#
# @param use_pbar [Object, nil] when truthy a ProgressBar sized by the number
#   of lines in the file is shown
# @yieldparam psm [PSM] one object per spectrum identification item
# @return [Enumerator] when called without a block
def each_psm(use_pbar = @use_pbar)
  return enum_for(:each_psm, use_pbar) unless block_given?

  specid_item_re = /^\s+<SpectrumIdentificationItem\s/
  pepevref_re = /^\s+<PeptideEvidenceRef\s/
  specprob_re = /name=\"MS-GF:SpecEValue\"\/>$/
  specid_item_end_re = /^\s+<\/SpectrumIdentificationItem>\s*$/

  pbar = nil
  if use_pbar
    # Count lines in Ruby instead of shelling out to `wc -l`: no dependency on
    # an external tool and no unquoted path interpolated into a shell command.
    num_lines = File.foreach(@mzid_file).count
    pbar = ProgressBar.new("PSMs", num_lines)
  end

  curr_psm = nil
  File.foreach(@mzid_file) do |line|
    pbar.inc if pbar

    # lines matching none of the patterns are skipped
    case line
    when specid_item_re
      # beginning of spectrum ID item
      spec_id_id = line.match(/id=\"(\w+)/)[1]
      # the spectrum number is the second "_"-separated field of the id
      spec_num = spec_id_id.split("_")[1].to_i
      pep_ref = line.match(/peptide_ref=\"(\w+)/)[1]
      # get peptide
      pep_seq = @pep_h[pep_ref.to_sym]
      mods = @mod_h[pep_ref.to_sym]
      curr_psm = PSM.new(:spec_num => spec_num, :pep => pep_seq, :mods => mods)
    when pepevref_re
      pep_ev = line.match(/peptideEvidence_ref=\"(\w+)/)[1]
      curr_psm.add_pep_ev(pep_ev.to_sym) if curr_psm
    when specprob_re
      sprob = line.match(/value=\"([\d|\w|\.|-]+)\"/)[1]
      curr_psm.set_spec_prob(sprob.to_f) if curr_psm
    when specid_item_end_re
      # a closing tag without a preceding opening line must not yield nil
      yield curr_psm if curr_psm
      curr_psm = nil # kill current PSM object
    end
  end
  pbar.finish if pbar
end
#get_is_decoy(pep_ev_id) ⇒ Object
38 |
# File 'lib/mzid/streaming_parser_lines.rb', line 38
#
# Decoy flag of a cached PeptideEvidence entry.
#
# @param pep_ev_id [Object] key into @pep_ev_h
# @return [Object] whatever the entry's +get_is_decoy+ returns
# @raise [NoMethodError] when no entry is stored under pep_ev_id
def get_is_decoy(pep_ev_id)
  @pep_ev_h[pep_ev_id].get_is_decoy
end
#get_pep_end(pep_ev_id) ⇒ Object
37 |
# File 'lib/mzid/streaming_parser_lines.rb', line 37
#
# End position of a cached PeptideEvidence entry.
#
# @param pep_ev_id [Object] key into @pep_ev_h
# @return [Object] whatever the entry's +get_end_pos+ returns
# @raise [NoMethodError] when no entry is stored under pep_ev_id
def get_pep_end(pep_ev_id)
  @pep_ev_h[pep_ev_id].get_end_pos
end
#get_pep_start(pep_ev_id) ⇒ Object
36 |
# File 'lib/mzid/streaming_parser_lines.rb', line 36
#
# Start position of a cached PeptideEvidence entry.
#
# @param pep_ev_id [Object] key into @pep_ev_h
# @return [Object] whatever the entry's +get_start_pos+ returns
# @raise [NoMethodError] when no entry is stored under pep_ev_id
def get_pep_start(pep_ev_id)
  @pep_ev_h[pep_ev_id].get_start_pos
end
#get_prot_id(pep_ev_id) ⇒ Object
get a protein ID from a PeptideEvidenceID
27 28 29 30 31 32 |
# File 'lib/mzid/streaming_parser_lines.rb', line 27
#
# Get a protein ID from a PeptideEvidenceID.
#
# Looks the evidence entry up in @pep_ev_h, takes its DBSequence reference
# and resolves that reference through @db_seq_h.
#
# @param pep_ev_id [Object] key into @pep_ev_h
# @return [Object, nil] the value stored in @db_seq_h for the reference, or
#   nil when the reference is not present there
# @raise [NoMethodError] when no entry is stored under pep_ev_id
def get_prot_id(pep_ev_id)
  #dbref = @pep_ev_h_dbseqRef[pep_ev_id]
  dbref = @pep_ev_h[pep_ev_id].get_db_seq_ref
  @db_seq_h[dbref]
end
#write_to_csv(outfile = "result.csv", use_pbar = @use_pbar) ⇒ Object
load PSMs into memory, and go back to perform lookup for prot ids
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/mzid/streaming_parser_lines.rb', line 147
#
# Stream every PSM from #each_psm and write one tab-separated row per
# PeptideEvidence reference, resolving protein id, start/end position and
# decoy flag through the lookup getters.
#
# The "decoy" column is written only when @tda_flag is truthy.
#
# @param outfile [String] path of the file to (over)write
# @param use_pbar [Object, nil] progress-bar switch, forwarded to #each_psm
# @return [Object] the value of the CSV.open block; not meant to be used
def write_to_csv(outfile = "result.csv", use_pbar = @use_pbar)
  # Options are passed as keywords: a positional options Hash is rejected by
  # CSV.open on Ruby 3.
  CSV.open(outfile, "w", col_sep: "\t") do |csv|
    header = ["#spec_num", "peptide", "spec_prob"]
    header << "decoy" if @tda_flag
    header.concat(["prot_ids", "start", "end", "num_prot"])
    csv << header

    # each PSM
    each_psm(use_pbar) do |psm|
      pep_seq = psm.get_pep
      spec_num = psm.get_spec_num
      sp_prob = psm.get_spec_prob
      pep_ev_ref_lst = psm.get_pep_ev
      # number of proteins with matching peptide
      num_prot = pep_ev_ref_lst.size

      # for each PeptideEvidence, write a different line
      pep_ev_ref_lst.each do |pepev|
        prot_id = get_prot_id(pepev)
        start_pos = get_pep_start(pepev)
        end_pos = get_pep_end(pepev)
        is_decoy = get_is_decoy(pepev)

        row = [spec_num, pep_seq, sp_prob]
        row << is_decoy if @tda_flag
        row.concat([prot_id, start_pos, end_pos, num_prot])
        csv << row
      end
    end
  end
end