Class: Ms::Sequest::Srf
- Inherits:
-
Object
- Object
- Ms::Sequest::Srf
- Defined in:
- lib/ms/sequest/srf.rb,
lib/ms/sequest/srf/sqt.rb,
lib/ms/sequest/srf/search.rb
Defined Under Namespace
Modules: Search, Sqt Classes: DTA, DTAGen, Header, NoSequestParamsError, Out
Instance Attribute Summary collapse
-
#base_name ⇒ Object
Returns the value of attribute base_name.
-
#dta_files ⇒ Object
Returns the value of attribute dta_files.
-
#filtered_by_precursor_mass_tolerance ⇒ Object
a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance.
-
#header ⇒ Object
Returns the value of attribute header.
-
#index ⇒ Object
a parallel array to dta_files and out_files where each entry is: [first_scan, last_scan, charge].
-
#out_files ⇒ Object
Returns the value of attribute out_files.
-
#params ⇒ Object
Returns the value of attribute params.
-
#version ⇒ Object
a String: 3.5, 3.3 or 3.2.
Class Method Summary collapse
-
.get_sequest_params_and_finish_pos(filename) ⇒ Object
returns a Sequest::Params object or nil if none.
Instance Method Summary collapse
- #dta_start_byte ⇒ Object
-
#filter_by_precursor_mass_tolerance! ⇒ Object
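Filters each out_file’s hits by the sequest.params precursor mass tolerance and recalculates deltacn values when the number of hits changes.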
-
#from_file(filename, opts) ⇒ Object
Returns self. The opts are the same as for ‘new’.
-
#initialize(filename = nil, opts = {}) ⇒ Srf
constructor
opts: :filter_by_precursor_mass_tolerance => true | false (default true) # this will filter by the sequest params prec tolerance as is # typically done by Bioworks.
- #protein_class ⇒ Object
- #read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) ⇒ Object
-
#read_dta_files(fh, num_files, unpack_35) ⇒ Object
returns an array of dta_files.
-
#read_out_files(fh, number_files, unpack_35, dup_refs_gt_0) ⇒ Object
filehandle (fh) must be at the start of the outfiles.
-
#read_scan_index(fh, num) ⇒ Object
returns an index where each entry is [first_scan, last_scan, charge].
Methods included from Search
Methods included from Sqt
Constructor Details
#initialize(filename = nil, opts = {}) ⇒ Srf
opts:
:filter_by_precursor_mass_tolerance => true | false (default true)
# this will filter by the sequest params prec tolerance as is
# typically done by Bioworks.
:link_protein_hits => true | false (default true)
# if true, generates the @prots attribute for the :prots method
# and creates one protein per reference that is linked to each
# relevant peptide hit.
# if false, each protein for each peptide hit is a unique object
# and the :prots method returns nil. If you are merging multiple
# searches then you probably want to set this to false to avoid
# recalculation.
:read_pephits => true | false (default true)
# will attempt to read peptide hit information (equivalent to .out
# files), otherwise, just reads the dta information.
99 100 101 102 103 104 105 106 107 |
# File 'lib/ms/sequest/srf.rb', line 99

# Creates an Srf with empty peptide, dta and out collections and, when a
# filename is supplied, loads it through #from_file.
#
# @param filename [String, nil] path to an .srf file; nothing is read when nil
# @param opts [Hash] handed unchanged to #from_file
def initialize(filename = nil, opts = {})
  @peps, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
Instance Attribute Details
#base_name ⇒ Object
Returns the value of attribute base_name.
40 41 42 |
# File 'lib/ms/sequest/srf.rb', line 40

# Reader for the @base_name instance variable.
#
# @return [Object] the current value of @base_name
def base_name
  @base_name
end
#dta_files ⇒ Object
Returns the value of attribute dta_files.
34 35 36 |
# File 'lib/ms/sequest/srf.rb', line 34

# Reader for the @dta_files instance variable.
#
# @return [Object] the current value of @dta_files
def dta_files
  @dta_files
end
#filtered_by_precursor_mass_tolerance ⇒ Object
a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance
44 45 46 |
# File 'lib/ms/sequest/srf.rb', line 44

# Reader for the @filtered_by_precursor_mass_tolerance instance variable.
#
# @return [Object] the current value of @filtered_by_precursor_mass_tolerance
def filtered_by_precursor_mass_tolerance
  @filtered_by_precursor_mass_tolerance
end
#header ⇒ Object
Returns the value of attribute header.
33 34 35 |
# File 'lib/ms/sequest/srf.rb', line 33

# Reader for the @header instance variable.
#
# @return [Object] the current value of @header
def header
  @header
end
#index ⇒ Object
a parallel array to dta_files and out_files where each entry is:
[first_scan, last_scan, charge]
39 40 41 |
# File 'lib/ms/sequest/srf.rb', line 39

# Reader for the @index instance variable.
#
# @return [Object] the current value of @index
def index
  @index
end
#out_files ⇒ Object
Returns the value of attribute out_files.
35 36 37 |
# File 'lib/ms/sequest/srf.rb', line 35

# Reader for the @out_files instance variable.
#
# @return [Object] the current value of @out_files
def out_files
  @out_files
end
#params ⇒ Object
Returns the value of attribute params.
36 37 38 |
# File 'lib/ms/sequest/srf.rb', line 36

# Reader for the @params instance variable.
#
# @return [Object] the current value of @params
def params
  @params
end
#version ⇒ Object
a String: 3.5, 3.3 or 3.2
31 32 33 |
# File 'lib/ms/sequest/srf.rb', line 31

# Reader for the @version instance variable.
#
# @return [Object] the current value of @version
def version
  @version
end
Class Method Details
.get_sequest_params_and_finish_pos(filename) ⇒ Object
returns a Sequest::Params object or nil if none
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/ms/sequest/srf.rb', line 51

# Finds the embedded sequest params section of an srf file and parses it.
#
# Only the second half of the file is read and searched; the last
# '[SEQUEST]' marker found there is taken as the start of the params.
#
# @param filename [String] path of the srf file
# @return [Array] a two-element array [params, finish_pos]: the result of
#   Ms::Sequest::Params#parse_io and the io position once parsing finished,
#   or [nil, nil] when no marker is present in the second half
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek midpoint
    marker_offset = io.read.rindex('[SEQUEST]')
    # marker not found: nothing to parse
    next [nil, nil] if marker_offset.nil?

    # the offset is relative to the midpoint, so convert it to absolute
    io.seek(marker_offset + midpoint)
    parsed = Ms::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
Instance Method Details
#dta_start_byte ⇒ Object
73 74 75 76 77 78 79 |
# File 'lib/ms/sequest/srf.rb', line 73

# Looks up the byte position at which the dta section begins for the
# current @version.
#
# @return [Integer, nil] 3260 for '3.2', 3644 for '3.3' and '3.5',
#   nil for any other version
def dta_start_byte
  case @version
  when '3.2' then 3260
  when '3.3', '3.5' then 3644
  end
end
#filter_by_precursor_mass_tolerance! ⇒ Object
-
updates the out_file’s list of hits based on passing peptides (but not
the original hit id; rank is implicit in array ordering)
-
recalculates deltacn values completely if number of hits changed (does
not touch deltacn orig)
This can spoil proper protein -> peptide linkages. Ms::Id::Search.merge! should be run after this method to ensure correct protein -> peptide linkages.
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/ms/sequest/srf.rb', line 118

# Drops, in place, every peptide hit whose precursor mass error exceeds
# params.peptide_mass_tolerance, and marks this object as filtered.
#
# params.peptide_mass_units selects the comparison:
#   '0'            -> |deltamass| against the tolerance
#   '1'            -> |deltamass| against tolerance / 1000
#   anything else  -> |ppm| against the tolerance
#
# For each out_file that lost hits, deltacn values are recalculated via
# Ms::Sequest::Srf::Out::Pep.update_deltacns_from_xcorr and num_hits is
# brought in line with the new hit count. Out files that kept all their
# hits are left untouched.
#
# @return [self]
def filter_by_precursor_mass_tolerance!
  pmt = params.peptide_mass_tolerance.to_f

  # Resolve the unit handling once instead of per peptide. A nil
  # tolerance here means "compare ppm rather than deltamass".
  deltamass_tolerance =
    case params.peptide_mass_units
    when '0' then pmt
    when '1' then pmt / 1000
    end

  self.filtered_by_precursor_mass_tolerance = true

  out_files.each do |out_file|
    hits = out_file.hits
    before = hits.size

    hits.reject! do |pep|
      if deltamass_tolerance
        pep.deltamass.abs > deltamass_tolerance
      else
        pep.ppm.abs > pmt
      end
    end

    next if hits.size == before

    # reject! already mutated the array; the reassignment is kept in case
    # the writer does more than store the reference
    out_file.hits = hits
    Ms::Sequest::Srf::Out::Pep.update_deltacns_from_xcorr(hits)
    out_file.num_hits = hits.size
  end

  self
end
#from_file(filename, opts) ⇒ Object
Returns self. The opts are the same as for ‘new’.
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 |
# File 'lib/ms/sequest/srf.rb', line 174 def from_file(filename, opts) opts = { :filter_by_precursor_mass_tolerance => true, :link_protein_hits => true, :read_pephits => true}.merge(opts) (@params, after_params_io_pos) = Ms::Sequest::Srf.get_sequest_params_and_finish_pos(filename) return unless @params dup_references = 0 dup_refs_gt_0 = false dup_references = @params.print_duplicate_references.to_i if dup_references == 0 # warn %Q{ #***************************************************************************** #WARNING: This srf file lists only 1 protein per peptide! (based on the #print_duplicate_references parameter in the sequest.params file used in its #creation) So, downstream output will likewise only contain a single protein #for each peptide hit. In many instances this is OK since downstream programs #will recalculate protein-to-peptide linkages from the database file anyway. #For complete protein lists per peptide hit, .srf files must be created with #print_duplicate_references > 0. HINT: to capture all duplicate references, #set the sequest parameter 'print_duplicate_references' to 100 or greater. #***************************************************************************** # } else dup_refs_gt_0 = true end File.open(filename, 'rb') do |fh| @header = Ms::Sequest::Srf::Header.new.from_io(fh) @version = @header.version unpack_35 = case @version when '3.2' false when '3.3' false when '3.5' true end if @header.combined @base_name = File.basename(filename, '.*') # I'm not sure why this is the case, but the reported number is too # big by one on the 2 files I've seen so far, so we will correct it here! @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1 if opts[:read_pephits] == false raise NotImplementedError, "on combined files must read everything right now!" 
end (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0) else @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35) if opts[:read_pephits] # need the params file to know if the duplicate_references is set > 0 raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil? @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0) # FOR DISPLAY ONLY! #@out_files.each do |f| # if f.num_hits == 10 # p f.hits.last # end #end if fh.eof? #warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..." @params = nil @index = [] end end end fh.pos = after_params_io_pos # This is very sensitive to the grab_params method in sequest params fh.read(12) ## gap between last params entry and index @index = read_scan_index(fh,@header.num_dta_files) end ### UPDATE SOME THINGS: # give each hit a base_name, first_scan, last_scan if opts[:read_pephits] && !@header.combined @index.each_with_index do |ind,i| mass_measured = @dta_files[i][0] @out_files[i][0,3] = *ind pep_hits = @out_files[i][6] @peps.push( *pep_hits ) pep_hits.each do |pep_hit| pep_hit[15,4] = @base_name, *ind # add the deltamass pep_hit[12] = pep_hit[0] - mass_measured # real - measured (deltamass) pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured ## ppm pep_hit[19] = self ## link with the srf object end end filter_by_precursor_mass_tolerance! if params if opts[:link_protein_hits] (@peps, @prots) = merge!([peps]) do |_prot, _peps| prot = Ms::Sequest::Srf::Out::Prot.new(_prot.reference, _peps) end end end self end |
#protein_class ⇒ Object
46 47 48 |
# File 'lib/ms/sequest/srf.rb', line 46

# Names the class used for protein objects.
#
# @return [Class] Ms::Sequest::Srf::Out::Prot
def protein_class
  Ms::Sequest::Srf::Out::Prot
end
#read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) ⇒ Object
157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/ms/sequest/srf.rb', line 157

# Reads num_files (dta, out) record pairs that are stored alternately,
# starting at #dta_start_byte.
#
# @param fh [IO] open handle on the srf file; repositioned by this call
# @param num_files [Integer] number of dta/out pairs to read
# @param unpack_35 passed through to DTA#from_io and Out#from_io
# @param dup_refs_gt_0 passed through to Out#from_io
# @return [Array] a two-element array [dta_files, out_files]
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dtas = Array.new(num_files)
  outs = Array.new(num_files)
  fh.pos = dta_start_byte

  0.upto(num_files - 1) do |slot|
    # order matters: each dta record is immediately followed by its out record
    dtas[slot] = Ms::Sequest::Srf::DTA.new.from_io(fh, unpack_35)
    outs[slot] = Ms::Sequest::Srf::Out.new.from_io(fh, unpack_35, dup_refs_gt_0)
  end

  [dtas, outs]
end
#read_dta_files(fh, num_files, unpack_35) ⇒ Object
returns an array of dta_files
308 309 310 311 312 313 314 315 316 317 |
# File 'lib/ms/sequest/srf.rb', line 308

# Reads num_files consecutive dta records, starting at #dta_start_byte.
#
# The count comes from the num_files argument. It used to be taken from
# header.num_dta_files, which could disagree with the size of the
# returned array.
#
# @param fh [IO] open handle on the srf file; left positioned just past
#   the last dta record read
# @param num_files [Integer] number of dta records to read
# @param unpack_35 passed through to DTA#from_io
# @return [Array] the dta objects, in file order
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  Array.new(num_files) { Ms::Sequest::Srf::DTA.new.from_io(fh, unpack_35) }
end
#read_out_files(fh, number_files, unpack_35, dup_refs_gt_0) ⇒ Object
filehandle (fh) must be at the start of the outfiles. ‘read_dta_files’ will put the fh there.
321 322 323 324 325 326 327 |
# File 'lib/ms/sequest/srf.rb', line 321

# Reads number_files consecutive out records from the current position
# of fh. The handle must already be at the start of the out section.
#
# The count comes from the number_files argument. It used to be taken
# from header.num_dta_files, which could disagree with the size of the
# returned array.
#
# @param fh [IO] open handle positioned at the first out record
# @param number_files [Integer] number of out records to read
# @param unpack_35 passed through to Out#from_io
# @param dup_refs_gt_0 passed through to Out#from_io
# @return [Array] the out objects, in file order
def read_out_files(fh, number_files, unpack_35, dup_refs_gt_0)
  Array.new(number_files) do
    Ms::Sequest::Srf::Out.new.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
#read_scan_index(fh, num) ⇒ Object
returns an index where each entry is [first_scan, last_scan, charge]
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 |
# File 'lib/ms/sequest/srf.rb', line 287

# Reads the scan index from the current position of fh.
#
# Each index record occupies 24 bytes. Only the leading three native
# unsigned integers are decoded (unpack 'III'); the rest of the record
# is read and discarded.
#
# @param fh [IO] open handle positioned at the first index record
# @param num [Integer] number of index records to read
# @return [Array<Array>] one [first_scan, last_scan, charge] triple per record
def read_scan_index(fh, num)
  record_bytes = 24
  # a single mutable buffer is reused for every read
  buffer = String.new

  Array.new(num) do
    fh.read(record_bytes, buffer)
    buffer.unpack('III')
  end
end