Class: Bioworks

Inherits:
Object
  • Object
show all
Includes:
SpecID
Defined in:
lib/ms/sequest/bioworks.rb

Overview

For dealing with Bioworks .xml format

Defined Under Namespace

Modules: XML Classes: Pep, Prot, XMLParser

Constant Summary collapse

@@bioworksinfo_re =

Regular expressions

/<bioworksinfo>(.*)<\/bioworksinfo>/o
@@modifications_re =
/<modifications>(.*)<\/modifications>/o
@@protein_re =
/<protein>/o
@@origfilename_re =
/<origfilename>(.*)<\/origfilename>/o
@@origfilepath_re =
/<origfilepath>(.*)<\/origfilepath>/o

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file = nil) ⇒ Bioworks

Returns a new instance of Bioworks.



155
156
157
158
159
160
161
162
# File 'lib/ms/sequest/bioworks.rb', line 155

def initialize(file=nil)
  @peps = nil
  if file
    @filename = file
    parse_xml(file)
    #parse_xml_by_xmlparser(file)
  end
end

Instance Attribute Details

#global_filenameObject

Returns the value of attribute global_filename.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def global_filename
  @global_filename
end

#modificationsObject

a string of modifications e.g., “(M* 15.99491) (S@ 14.9322) ”



32
33
34
# File 'lib/ms/sequest/bioworks.rb', line 32

def modifications
  @modifications
end

#origfilenameObject

Returns the value of attribute origfilename.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def origfilename
  @origfilename
end

#origfilepathObject

Returns the value of attribute origfilepath.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def origfilepath
  @origfilepath
end

#pepsObject

Returns the value of attribute peps.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def peps
  @peps
end

#protsObject

Returns the value of attribute prots.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def prots
  @prots
end

#versionObject

Returns the value of attribute version.



30
31
32
# File 'lib/ms/sequest/bioworks.rb', line 30

def version
  @version
end

Instance Method Details

#_uniq_peps_by_sequence_charge(peps) ⇒ Object

returns (peptides, proteins) where peptides is the unique list of peps and proteins is a parallel array of arrays of represented proteins note that each pep will contain its original prot it belongs to, even though the parallel protein actually represents the proteins it belongs to. assumes that each peptide points to all its proteins in pep.prots



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/ms/sequest/bioworks.rb', line 136

def _uniq_peps_by_sequence_charge(peps)
  new_arr = []
  prot_arr = []
  index_accounted_for = []
  (0...peps.size).each do |i|
    next if index_accounted_for.include?(i)
    new_arr << peps[i]
    prot_arr.push( peps[i].prots )
    ((i+1)...peps.size).each do |j|
      pep1, pep2 = peps[i], peps[j]
      if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge
        prot_arr.last.push( *(pep2.prots) )
        index_accounted_for << j
      end
    end
  end
  return new_arr, prot_arr
end

#get_prots_from_xml_stream(fh) ⇒ Object

returns proteins and peptides



198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/ms/sequest/bioworks.rb', line 198

def get_prots_from_xml_stream(fh)
  uniq_pephit_hash = {}
  prots = []
  while line = fh.gets
    if line =~ @@protein_re
      prot =  Bioworks::Prot.new
      prot.bioworks = self
      prot.set_from_xml_stream(fh, uniq_pephit_hash)
      prots << prot
    end
  end
  [prots, uniq_pephit_hash.values] 
end

#get_regex_val(fh, regex) ⇒ Object

gets the regex and stops (and rewinds if it hits a protein) if no regex is found, returns nil and rewinds the filehandle



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/ms/sequest/bioworks.rb', line 214

def get_regex_val(fh, regex)
  ver = nil
  last_pos = fh.pos
  while line = fh.gets
    if line =~ regex
      ver = $1.dup     
      break
    elsif line =~ @@protein_re
      fh.seek last_pos
      break
    end
    last_pos = fh.pos
  end
  unless ver then fh.rewind end
  ver
end

#hi_prob_bestObject



34
# File 'lib/ms/sequest/bioworks.rb', line 34

def hi_prob_best ; false end

#num_prots(file) ⇒ Object

returns the number of prots. Raises an Exception if open and closing xml tags don’t agree



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/ms/sequest/bioworks.rb', line 45

def num_prots(file)
  re = /(<protein>)|(<\/protein>)/mo
  begin_tags = 0
  end_tags = 0
  IO.read(file).scan(re) do |match| 
    if match.first
      begin_tags += 1
    else
      end_tags += 1
    end
  end
  if begin_tags != end_tags 
    puts "WARNING: #{file} doesn't have matching closing tags"
    puts "for the <protein> tag.  Returning # of beginning tags."
  end
  begin_tags
end

#parse_xml(file) ⇒ Object

This is highly specific to Bioworks 3.2 xml export. In other words, unless the newlines, etc. are duplicated, this parser will fail! Not robust, but it is faster than xmlparser (which is based on the speedy expat)



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/ms/sequest/bioworks.rb', line 179

def parse_xml(file)
  fh = nil
  if file =~ /\.gz$/
    fh = Zlib::GzipReader.open(file)  
  else
    fh = File.open(file)
  end
  @origfilename = get_regex_val(fh, @@origfilename_re)
  @origfilepath = get_regex_val(fh, @@origfilepath_re)
  if @origfilename
    @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
  end
  @version = get_regex_val(fh, @@bioworksinfo_re)
  @modifications = get_regex_val(fh, @@modifications_re)
  @prots, @peps = get_prots_from_xml_stream(fh)
  fh.close
end

#parse_xml_by_xmlparser(file) ⇒ Object



164
165
166
167
168
169
170
171
172
173
# File 'lib/ms/sequest/bioworks.rb', line 164

def parse_xml_by_xmlparser(file)
  parser = Bioworks::XMLParser.new
  File.open(file) do |fh|
    #3.times do fh.gets end  ## TEMPFIX
    parser.parse(fh)
  end
  #puts "ETETWSST"
  #p parser.prots
  @prots = parser.prots
end

#to_excel(file) ⇒ Object

Outputs the bioworks browser excel format (tab delimited) to file. Useful if you have more than ~65,000 lines (can export bioworks.xml and then convert to excel format). Currently, the only things not precisely identical are:

1. The peptide hit counts (although the first number [total # peptides] is accurate)
2. The precise ordering of peptides within each protein.  When dealing with output from multiple runs, peptides with runs with exactly the  same scan numbers are not guaranteed to be in the same order.


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/ms/sequest/bioworks.rb', line 71

def to_excel(file)
  update_peptide_hit_counts
  arr = []
  arr << ['', 'Reference', '', '', '', 'Score', 'Coverage', 'MW', 'Accession', 'Peptide (Hits)', '', ' ']
  arr << ['', '"File, Scan(s)"', 'Peptide', 'MH+', 'z', 'XC', 'DeltaCn', 'Sp', 'RSp', 'Ions', 'Count', ' ']
  @prots.each_with_index do |prot,index|
    line_arr = prot.get(:consensus_score, :coverage, :weight, :accession)
    if line_arr[1] == "0.0" then line_arr[1] = "" end
    line_arr.unshift('', '', '')
    line_arr.unshift('"' + prot.reference.split('|')[-1] + '"')
    line_arr.unshift(index+1)
    pep_hit_counts = prot.peptide_hit_counts
    pep_hit_counts_string = pep_hit_counts[0].to_s + ' (' + pep_hit_counts[1..-1].join(" ") + ')' 
    line_arr.push( pep_hit_counts_string )
    line_arr.push("")
    line_arr.push(" ")
    arr.push( line_arr )
    prot.peps.sort_by{|obj| [obj.first_scan.to_i, obj.last_scan.to_i] }.each do |pep|

      pep_arr = pep.get(:sequence, :mass, :charge, :xcorr, :deltacn, :sp, :rsp, :ions)
      count = pep.count
      if count == '0' then count = "" end
      pep_arr.push(count)
      pep_arr.push(' ')
      pep_arr.unshift('"' + pep.file + '"')
      pep_arr.unshift( '' )
      arr.push( pep_arr )
    end
  end
  File.open(file, "w") do |out|
    arr.each do |line|
      out.print(line.join("\t"), "\n")
    end
  end

end

#to_pepxmlObject

Outputs sequest xml files (pepxml) for the trans-proteomics pipeline



232
233
234
235
# File 'lib/ms/sequest/bioworks.rb', line 232

def to_pepxml
  string = xml_version
  string 
end

#to_sqt(params_file) ⇒ Object

-> prints to file filename1.sqt, filename2.sqt @TODO: sqt file output



38
39
40
41
# File 'lib/ms/sequest/bioworks.rb', line 38

def to_sqt(params_file)
  ## hash peps by filename 
  ## hash prots by peptide
end

#update_peptide_hit_countsObject

for output to excel format or other things, updates each protein with a peptide hit count array based on ranking of xcorr per dta file where each array is the total number of peptide hits, then rank 1,2,3,4,5 @TODO: Can’t get this to check out yet. Perhaps they use normalized Xcorr?



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/ms/sequest/bioworks.rb', line 113

def update_peptide_hit_counts
  @prots.each do |prot|
    prot.peptide_hit_counts[0] = prot.peps.size
  end
  hash = peps.hash_by(:file)
  hash.sort.each do |k,v|
    sorted = v.sort_by {|obj| obj.xcorr.to_f }
    peps, prot_groups = _uniq_peps_by_sequence_charge(sorted) ## but not on prot!!!!!uniq_peps_by_sequence_charge!

    prot_groups.each_with_index do |prot_group, i|
      prot_group.each do |prot|
        prot.peptide_hit_counts[i+1] += 1 if prot.peptide_hit_counts[i+1]
      end
    end
  end
end