Class: ProteinSummary

Inherits:
Object
  • Object
show all
Includes:
HTML
Defined in:
lib/spec_id/protein_summary.rb,
lib/spec_id/protein_summary.rb

Defined Under Namespace

Modules: HTML

Instance Method Summary collapse

Methods included from HTML

#at, #header, #style, #table, #tds, #ths, #tr, #trailer

Instance Method Details

#accession(name) ⇒ Object

attempts to get the NCBI gi code



141
142
143
144
145
146
147
# File 'lib/spec_id/protein_summary.rb', line 141

def accession(name)
  if (name.include? '|') && (name[0,3] == 'gi|')
    name.split('|')[1]
  else
    name
  end
end

#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ Object

takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix



350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
# File 'lib/spec_id/protein_summary.rb', line 350

def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
  fppr_output_as_html ||= ''
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)
  proteins = spec_id.prots
  protein_num = 0
  rows = ""
  proteins.each do |prot|
    if false_flag_re && prot.reference =~ false_flag_re
      next
    end
    uniq_peps = Hash.new {|h,k| h[k] = true; }
    protein_num += 1
    prot.peps.each do |pep|
      uniq_peps[pep.sequence.split('.')[1]] = true
    end
    pieces = prot.reference.split(' ')
    long_prot_name = pieces.shift 
    annotation = pieces.join(' ')
    accession = prot.accession
    if accession == '0' ; accession = long_prot_name end
    rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
  end
  table_string = table do 
    tr{theaders} + rows
  end
  print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer)
end

#bioworks_script_info(obj) ⇒ Object



253
254
255
256
257
258
259
# File 'lib/spec_id/protein_summary.rb', line 253

def bioworks_script_info(obj)
  version = "3.2??"
  if obj.version
    version = obj.version
  end
  script_info{"Bioworks version #{version}"}
end

#create_from_command_line_args(argv) ⇒ Object



410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
# File 'lib/spec_id/protein_summary.rb', line 410

def create_from_command_line_args(argv)
  @orig_argv = argv.dup

  opt = OpenStruct.new
  opt.f = DEF_PREFIX
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
    op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
    op.separator "    outputs: <file>.summary.html"
    op.separator ""
    op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
    op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
    op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
    op.separator("             if --precision then -f is used to specify a file or prefix")
    op.separator("             that indicates the false positives.")
    op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
    op.separator ""
    op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
    op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
    op.separator("                                     type '#{PRECISION_PROGRAM_BASE}.rb' for details")
    op.separator ""
    op.separator "specific to ProteinProphet (with no concatenated DB):"
    op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
    op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
    op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v}
    op.separator "             (use if your proteins have gi's but no annotation) "
  end

  opts.parse!(argv)

  if argv.size < 1
    puts opts
    return
  end

  fppr_output_as_html = ''
  files = argv.to_a
  files.each do |file| 
    outfn = file.sub(/\.xml$/, '.summary.html')
    outfn = outfn.sub(/\.srg$/, '.summary.html')
    ## False Positive Rate Calculation:
    if opt.precision
      opt.o = outfn # won't actually be written over, but used
      to_use_argv = create_precision_argv(file, opt)
      (out_string, opt) = Prec.new.precision(to_use_argv)
      fppr_output_as_html = prefix_as_decoy_to_html(out_string)
    end

    case SpecID.file_type(file)
    when "protproph"
      #spec_id = SpecID.new(file)
      proph_output(file, outfn, opt, fppr_output_as_html)
    when "bioworks"
      spec_id = SpecID.new(file)

      false_regex = flag_to_regex(opt.f, opt.prefix)
      bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
    else 
      abort "filetype for #{file} not recognized!"
    end
  end

end

#create_precision_argv(file, opt) ⇒ Object

method create_from_command_line



474
475
476
477
478
479
480
481
# File 'lib/spec_id/protein_summary.rb', line 474

def create_precision_argv(file, opt)
  # include only those options specific
  new_argv = [file]
  if opt.prefix ; new_argv << '--prefix' end
  if opt.f ; new_argv << '-f' << opt.f end
  if opt.o ; new_argv << '-o' << opt.o end
  new_argv
end

#error_info(prot_file_name) ⇒ Object

Takes the -prot.xml filename and grabs the png file (if available)



134
135
136
137
138
# File 'lib/spec_id/protein_summary.rb', line 134

def error_info(prot_file_name)
  img = prot_file_name.gsub('.xml', '.png')
  img_bn = File.basename(img)
    "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
end

#file_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html



391
392
393
394
395
396
397
398
399
400
# File 'lib/spec_id/protein_summary.rb', line 391

def file_as_decoy_to_html(string)  
  lines = string.split("\n")
  #puts lines ?? is this supposed to be commented out?
  lines = lines.reject do |obj| obj =~ /\*{10}/ end
  lines.map! do |line| "#{line}<br/>" end
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  #{lines.join("\n")} 
  </div>"
end

#file_info(file) ⇒ Object



246
247
248
249
250
251
# File 'lib/spec_id/protein_summary.rb', line 246

def file_info(file)
  "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
  <br/>Last Modified: #{File.mtime(file)}
  <br/>Size: #{File.size(file)/1000} KB
  </div>"
end

#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ Object

filters on the false positive regex and sorts by prot probability



172
173
174
175
176
177
178
179
180
# File 'lib/spec_id/protein_summary.rb', line 172

def filter_and_sort(uniq_prots, flag=nil, prefix=false)
  false_flag_re = flag_to_regex(flag, prefix) 
  sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
  ## filter on prefix
  if prefix
    sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
  end
  sorted
end

#flag_to_regex(flag, prefix = false) ⇒ Object



149
150
151
152
153
154
155
156
157
158
159
# File 'lib/spec_id/protein_summary.rb', line 149

def flag_to_regex(flag, prefix=false)
  if flag
    if prefix
      /^#{Regexp.escape(flag)}/
    else
      /#{Regexp.escape(flag)}/
    end
  else
    nil
  end
end

#mspire_versionObject



275
276
277
278
279
280
281
282
283
284
# File 'lib/spec_id/protein_summary.rb', line 275

def mspire_version
  string = "mspire"
  begin
    if `gem list --local mspire` =~ /mspire \((.*?)\)/
      string << (" v" + $1)
    end
  rescue Exception
  end
  string
end

#num_prots_above_fppr(prots, desired_fppr) ⇒ Object

assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr]



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/spec_id/protein_summary.rb', line 185

def num_prots_above_fppr(prots, desired_fppr)
  current_fppr_rate_percent = 0.0
  previous_fppr_rate_percent = 0.0
  current_sum_one_minus_prob = 0.0
  proteins_within_fppr = 0
  actual_fppr = nil
  already_found = false
  prot_cnt = 0
  prots.each do |prot|
    prot_cnt += 1
    # SUM(1-probX)/#prots
    current_sum_one_minus_prob += 1.0 - prot._probability.to_f
    current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100

    if current_fppr_rate_percent > desired_fppr && !already_found
      actual_fppr = previous_fppr_rate_percent
      proteins_within_fppr = prot_cnt
      already_found = true
    end
    previous_fppr_rate_percent = current_fppr_rate_percent
  end
  [proteins_within_fppr, actual_fppr]
end

#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object

bioworks_output



380
381
382
383
384
385
386
387
388
# File 'lib/spec_id/protein_summary.rb', line 380

def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
  actual_cutoff = sprintf("%.3f", actual_cutoff)
  desired_cutoff = sprintf("%.3f", desired_cutoff)
  "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3>
  Desired FPPR: #{desired_cutoff} %<br/>
  Actual FPPR: #{actual_cutoff} %<br/>
  Number of Proteins at Actual FPPR: #{num_proteins}
  </div>"
end

#output_peptide_counts_file(prots, filename) ⇒ Object

given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found



163
164
165
166
167
168
169
# File 'lib/spec_id/protein_summary.rb', line 163

def output_peptide_counts_file(prots, filename)
  File.open(filename, "w") do |fh_out|
    prots.each do |prot|
      fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
    end
  end
end

#peptide_cell(prot_num, peptide_sequences) ⇒ Object

given a list of peptide sequences creates javascript to hide/show them



343
344
345
# File 'lib/spec_id/protein_summary.rb', line 343

def peptide_cell(prot_num, peptide_sequences)
  "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
end

#prefix_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html



403
404
405
406
407
408
# File 'lib/spec_id/protein_summary.rb', line 403

def prefix_as_decoy_to_html(string)  
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  </div>" +
  string
end


238
239
240
241
242
243
244
# File 'lib/spec_id/protein_summary.rb', line 238

def print_html_pieces(file, *pieces)
  File.open(file, "w") do |out|
    pieces.each do |piece|
      out.print piece
    end
  end
end

#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object



290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/spec_id/protein_summary.rb', line 290

def proph_output(file, outfn, opt, fppr_output_as_html)
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'),  at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)

  root = AXML.parse_file(file)
  prots = []
  ## find the min_prob at a fppr of XX
  min_prob_redline = 1.01  # if no fppr is less than what they give, then all are redlined!

  if opt.c 
    actual_percent_fp = opt.c.to_f
  elsif opt.cut_at
    actual_percent_fp = opt.cut_at.to_f
  else
    actual_percent_fp = nil
  end
  root.protein_group.each do |group|
    group.protein.each do |prt|
      prots << prt 
    end
  end
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
  filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)

  ## num proteins above cutoff (if opt.c)
  num_prots_html = ''
  if opt.c || opt.cut_at
    (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
    num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
  end
  if opt.cut_at
    filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
  end

  output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count

  # get an array of annotations (or nil if no option)
  annotations =
    if opt.get_annotation
      gis = filtered_sorted_prots.map {|prot| accession(prot._protein_name) }
      GI.gi2annot(gis) 
    end

  table_string = table do 
    tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, annotations, opt.peptide_count)
  end
  er_info = opt.precision ? error_info(file) : ""
  html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
  print_html_pieces(*html_pieces)
end

#protproph_script_infoObject



261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/spec_id/protein_summary.rb', line 261

def protproph_script_info
  begin
    where = `which xinteract`
    reply = `#{where}`
  rescue Exception
    reply = ""
  end
  prophet = "TPP (version unknown)"  # put your version here if you can't get it dynamically
  if reply =~ /xinteract.*?\((TPP .*)\)/
    prophet = $1.dup
  end
  script_info { "ProteinProphet from: #{prophet}" }
end

#ref_html(gi, name) ⇒ Object



129
130
131
# File 'lib/spec_id/protein_summary.rb', line 129

def ref_html(gi, name)
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
end

#script_infoObject



286
287
288
# File 'lib/spec_id/protein_summary.rb', line 286

def script_info
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
end

#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ Object

returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given)



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/spec_id/protein_summary.rb', line 214

def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations=nil, peptide_count_filename=nil)
  prot_cnt = 0
  an_cnt = 0
 
  uniq_prots.map do |prot| 
    tr do
      prot_cnt += 1
      gi = accession(prot._protein_name)

      if annotations
        protein_description = annotations[an_cnt]
        an_cnt += 1
      else
        if prot.annotation.size > 0
          protein_description = prot.annotation.first._protein_description
        else
          protein_description = 'NA'
        end
      end
      tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
    end
  end.join
end