Class: ProteinSummary

Inherits:
Object
  • Object
show all
Includes:
HTML
Defined in:
lib/spec_id/protein_summary.rb,
lib/spec_id/protein_summary.rb

Defined Under Namespace

Modules: HTML

Instance Method Summary collapse

Methods included from HTML

#at, #header, #style, #table, #tds, #ths, #tr, #trailer

Instance Method Details

#accession(name) ⇒ Object

attempts to get the NCBI gi code



140
141
142
143
144
145
146
# File 'lib/spec_id/protein_summary.rb', line 140

def accession(name)
  if (name.include? '|') && (name[0,3] == 'gi|')
    name.split('|')[1]
  else
    name
  end
end

#bioworks_output(spec_id, outfn, file = nil, false_prefix = nil, fppr_output_as_html = nil) ⇒ Object

takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix



325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# File 'lib/spec_id/protein_summary.rb', line 325

def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil)
  fppr_output_as_html ||= ''
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)
  proteins = spec_id.prots
  protein_num = 0
  rows = ""
  prefix_re = prefix_to_regex(false_prefix)
  proteins.each do |prot|
    if false_prefix && prot.reference =~ prefix_re
      next
    end
    uniq_peps = Hash.new {|h,k| h[k] = true; }
    protein_num += 1
    prot.peps.each do |pep|
      uniq_peps[pep.sequence.split('.')[1]] = true
    end
    pieces = prot.reference.split(' ')
    long_prot_name = pieces.shift 
    annotation = pieces.join(' ')
    accession = prot.accession
    if accession == '0' ; accession = long_prot_name end
    rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
  end
  table_string = table do 
    tr{theaders} + rows
  end
  print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer)
end

#bioworks_script_info(obj) ⇒ Object



235
236
237
238
239
240
241
# File 'lib/spec_id/protein_summary.rb', line 235

def bioworks_script_info(obj)
  version = "3.2??"
  if obj.version
    version = obj.version
  end
  script_info{"Bioworks version #{version}"}
end

#create_from_command_line_args(argv) ⇒ Object



386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
# File 'lib/spec_id/protein_summary.rb', line 386

def create_from_command_line_args(argv)
  @orig_argv = argv.dup

  opt = OpenStruct.new
  opt.f = DEF_PREFIX
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
    op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
    op.separator "    outputs: <file>.summary.html"
    op.separator ""
    op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
    op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
    op.separator("             if --precision then -f is used to specify a file or prefix")
    op.separator("             that indicates the false positives.")
    op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
    op.separator ""
    op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
    op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
    op.separator("                                     type '#{PRECISION_PROGRAM_BASE}.rb' for details")
    op.separator ""
    op.separator "Specific to ProteinProphet (with no concatenated DB):"
    op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
    op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
  end

  opts.parse!(argv)

  if argv.size < 1
    puts opts
    return
  end

  fppr_output_as_html = ''
  files = argv.to_a
  files.each do |file| 
    outfn = file.sub(/\.xml$/, '.summary.html')
    outfn = outfn.sub(/\.srg$/, '.summary.html')
    ## False Positive Rate Calculation:
    if opt.precision
      opt.o = outfn # won't actually be written over, but used
      to_use_argv = create_precision_argv(file, opt)
      (out_string, opt) = Prec.new.precision(to_use_argv)
      fppr_output_as_html = prefix_as_decoy_to_html(out_string)
    end

    case SpecID.file_type(file)
    when "protproph"
      #spec_id = SpecID.new(file)
      proph_output(file, outfn, opt, fppr_output_as_html)
    when "bioworks"
      spec_id = SpecID.new(file)
      bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html)
    else 
      abort "filetype for #{file} not recognized!"
    end
  end

end

#create_precision_argv(file, opt) ⇒ Object

method create_from_command_line



445
446
447
448
449
450
451
# File 'lib/spec_id/protein_summary.rb', line 445

def create_precision_argv(file, opt)
  # include only those options specific
  new_argv = [file]
  if opt.f ; new_argv << '-f' << opt.f end
  if opt.o ; new_argv << '-o' << opt.o end
  new_argv
end

#error_info(prot_file_name) ⇒ Object

Takes the -prot.xml filename and grabs the png file (if available)



133
134
135
136
137
# File 'lib/spec_id/protein_summary.rb', line 133

def error_info(prot_file_name)
  img = prot_file_name.gsub('.xml', '.png')
  img_bn = File.basename(img)
    "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
end

#file_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html



367
368
369
370
371
372
373
374
375
376
# File 'lib/spec_id/protein_summary.rb', line 367

def file_as_decoy_to_html(string)  
  lines = string.split("\n")
  #puts lines ?? is this supposed to be commented out?
  lines = lines.reject do |obj| obj =~ /\*{10}/ end
  lines.map! do |line| "#{line}<br/>" end
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  #{lines.join("\n")} 
  </div>"
end

#file_info(file) ⇒ Object



228
229
230
231
232
233
# File 'lib/spec_id/protein_summary.rb', line 228

def file_info(file)
  "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
  <br/>Last Modified: #{File.mtime(file)}
  <br/>Size: #{File.size(file)/1000} KB
  </div>"
end

#filter_and_sort(uniq_prots, prefix = nil) ⇒ Object

filters on the false positive regex and sorts by prot probability



167
168
169
170
171
172
173
174
175
# File 'lib/spec_id/protein_summary.rb', line 167

def filter_and_sort(uniq_prots, prefix=nil)
  prefix_re = prefix_to_regex(prefix) 
  sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
  ## filter on prefix
  if prefix
    sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re }
  end
  sorted
end

#mspire_versionObject



257
258
259
260
261
262
263
264
265
266
# File 'lib/spec_id/protein_summary.rb', line 257

def mspire_version
  string = "mspire"
  begin
    if `gem list --local mspire` =~ /mspire \((.*?)\)/
      string << (" v" + $1)
    end
  rescue Exception
  end
  string
end

#num_prots_above_fppr(prots, desired_fppr) ⇒ Object

assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr]



180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/spec_id/protein_summary.rb', line 180

def num_prots_above_fppr(prots, desired_fppr)
  current_fppr_rate_percent = 0.0
  previous_fppr_rate_percent = 0.0
  current_sum_one_minus_prob = 0.0
  proteins_within_fppr = 0
  actual_fppr = nil
  already_found = false
  prot_cnt = 0
  prots.each do |prot|
    prot_cnt += 1
    # SUM(1-probX)/#prots
    current_sum_one_minus_prob += 1.0 - prot._probability.to_f
    current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100

    if current_fppr_rate_percent > desired_fppr && !already_found
      actual_fppr = previous_fppr_rate_percent
      proteins_within_fppr = prot_cnt
      already_found = true
    end
    previous_fppr_rate_percent = current_fppr_rate_percent
  end
  [proteins_within_fppr, actual_fppr]
end

#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object

bioworks_output



356
357
358
359
360
361
362
363
364
# File 'lib/spec_id/protein_summary.rb', line 356

def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
  actual_cutoff = sprintf("%.3f", actual_cutoff)
  desired_cutoff = sprintf("%.3f", desired_cutoff)
  "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3>
  Desired FPPR: #{desired_cutoff} %<br/>
  Actual FPPR: #{actual_cutoff} %<br/>
  Number of Proteins at Actual FPPR: #{num_proteins}
  </div>"
end

#output_peptide_counts_file(prots, filename) ⇒ Object

given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found



158
159
160
161
162
163
164
# File 'lib/spec_id/protein_summary.rb', line 158

def output_peptide_counts_file(prots, filename)
  File.open(filename, "w") do |fh_out|
    prots.each do |prot|
      fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
    end
  end
end

#peptide_cell(prot_num, peptide_sequences) ⇒ Object

given a list of peptide sequences creates javascript to hide/show them



318
319
320
# File 'lib/spec_id/protein_summary.rb', line 318

def peptide_cell(prot_num, peptide_sequences)
  "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
end

#prefix_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html



379
380
381
382
383
384
# File 'lib/spec_id/protein_summary.rb', line 379

def prefix_as_decoy_to_html(string)  
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  </div>" +
  string
end

#prefix_to_regex(prefix) ⇒ Object



148
149
150
151
152
153
154
# File 'lib/spec_id/protein_summary.rb', line 148

def prefix_to_regex(prefix)
  if prefix
    /^#{Regexp.escape(prefix)}/
  else
    nil
  end
end


220
221
222
223
224
225
226
# File 'lib/spec_id/protein_summary.rb', line 220

def print_html_pieces(file, *pieces)
  File.open(file, "w") do |out|
    pieces.each do |piece|
      out.print piece
    end
  end
end

#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object



272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
# File 'lib/spec_id/protein_summary.rb', line 272

def proph_output(file, outfn, opt, fppr_output_as_html)
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'),  at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)

  root = AXML.parse_file(file)
  prots = []
  ## find the min_prob at a fppr of XX
  min_prob_redline = 1.01  # if no fppr is less than what they give, then all are redlined!

  if opt.c 
    actual_percent_fp = opt.c.to_f
  elsif opt.cut_at
    actual_percent_fp = opt.cut_at.to_f
  else
    actual_percent_fp = nil
  end
  root.protein_group.each do |group|
    group.protein.each do |prt|
      prots << prt 
    end
  end
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
  filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)

  ## num proteins above cutoff (if opt.c)
  num_prots_html = ''
  if opt.c || opt.cut_at
    (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
    num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
  end
  if opt.cut_at
    filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
  end

  output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count

  table_string = table do 
    tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
  end
  er_info = opt.precision ? error_info(file) : ""
  html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
  print_html_pieces(*html_pieces)
end

#protproph_script_infoObject



243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/spec_id/protein_summary.rb', line 243

def protproph_script_info
  begin
    where = `which xinteract`
    reply = `#{where}`
  rescue Exception
    reply = ""
  end
  prophet = "TPP (version unknown)"  # put your version here if you can't get it dynamically
  if reply =~ /xinteract.*?\((TPP .*)\)/
    prophet = $1.dup
  end
  script_info { "ProteinProphet from: #{prophet}" }
end

#ref_html(gi, name) ⇒ Object



128
129
130
# File 'lib/spec_id/protein_summary.rb', line 128

def ref_html(gi, name)
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
end

#script_infoObject



268
269
270
# File 'lib/spec_id/protein_summary.rb', line 268

def script_info
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
end

#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename = nil) ⇒ Object

returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given)



209
210
211
212
213
214
215
216
217
218
# File 'lib/spec_id/protein_summary.rb', line 209

def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
  prot_cnt = 0
  uniq_prots.map do |prot| 
    tr do
      prot_cnt += 1
      gi = accession(prot._protein_name)
      tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
    end
  end.join
end