Class: ProteinSummary

Inherits:

Object

Object
ProteinSummary

show all

Includes:: HTML

Defined in:: lib/spec_id/protein_summary.rb,
lib/spec_id/protein_summary.rb

Defined Under Namespace

Modules: HTML

Instance Method Summary collapse

#accession(name) ⇒ Object

attempts to get the NCBI gi code.
#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ Object

takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix.
#bioworks_script_info(obj) ⇒ Object
#create_from_command_line_args(argv) ⇒ Object
#create_precision_argv(file, opt) ⇒ Object

method create_from_command_line.
#error_info(prot_file_name) ⇒ Object

Takes the -prot.xml filename and grabs the png file (if available).
#file_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html.
#file_info(file) ⇒ Object
#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ Object

filters on the false positive regex and sorts by prot probability.
#flag_to_regex(flag, prefix = false) ⇒ Object
#mspire_version ⇒ Object
#num_prots_above_fppr(prots, desired_fppr) ⇒ Object

assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr].
#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object

bioworks_output.
#output_peptide_counts_file(prots, filename) ⇒ Object

given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found.
#peptide_cell(prot_num, peptide_sequences) ⇒ Object

given a list of peptide sequences creates javascript to hide/show them.
#prefix_as_decoy_to_html(string) ⇒ Object

transforms the output string of file_as_decoy into html.
#print_html_pieces(file, *pieces) ⇒ Object
#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object
#protproph_script_info ⇒ Object
#ref_html(gi, name) ⇒ Object
#script_info ⇒ Object
#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ Object

returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given).

Methods included from HTML

#at, #header, #style, #table, #tds, #ths, #tr, #trailer

Instance Method Details

#accession(name) ⇒ `Object`

attempts to get the NCBI gi code

# File 'lib/spec_id/protein_summary.rb', line 141

def accession(name)
  if (name.include? '|') && (name[0,3] == 'gi|')
    name.split('|')[1]
  else
    name
  end
end

#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ `Object`

takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix

# File 'lib/spec_id/protein_summary.rb', line 350

def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
  fppr_output_as_html ||= ''
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)
  proteins = spec_id.prots
  protein_num = 0
  rows = ""
  proteins.each do |prot|
    if false_flag_re && prot.reference =~ false_flag_re
      next
    end
    uniq_peps = Hash.new {|h,k| h[k] = true; }
    protein_num += 1
    prot.peps.each do |pep|
      uniq_peps[pep.sequence.split('.')[1]] = true
    end
    pieces = prot.reference.split(' ')
    long_prot_name = pieces.shift 
    annotation = pieces.join(' ')
    accession = prot.accession
    if accession == '0' ; accession = long_prot_name end
    rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
  end
  table_string = table do 
    tr{theaders} + rows
  end
  print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer)
end

#bioworks_script_info(obj) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 253

def bioworks_script_info(obj)
  version = "3.2??"
  if obj.version
    version = obj.version
  end
  script_info{"Bioworks version #{version}"}
end

#create_from_command_line_args(argv) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 410

def create_from_command_line_args(argv)
  @orig_argv = argv.dup

  opt = OpenStruct.new
  opt.f = DEF_PREFIX
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
    op.separator "    where file = bioworks -or- <run>-prot (prophet output)"
    op.separator "    outputs: <file>.summary.html"
    op.separator ""
    op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
    op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
    op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
    op.separator("             if --precision then -f is used to specify a file or prefix")
    op.separator("             that indicates the false positives.")
    op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
    op.separator ""
    op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
    op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
    op.separator("                                     type '#{PRECISION_PROGRAM_BASE}.rb' for details")
    op.separator ""
    op.separator "specific to ProteinProphet (with no concatenated DB):"
    op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
    op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
    op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v}
    op.separator "             (use if your proteins have gi's but no annotation) "
  end

  opts.parse!(argv)

  if argv.size < 1
    puts opts
    return
  end

  fppr_output_as_html = ''
  files = argv.to_a
  files.each do |file| 
    outfn = file.sub(/\.xml$/, '.summary.html')
    outfn = outfn.sub(/\.srg$/, '.summary.html')
    ## False Positive Rate Calculation:
    if opt.precision
      opt.o = outfn # won't actually be written over, but used
      to_use_argv = create_precision_argv(file, opt)
      (out_string, opt) = Prec.new.precision(to_use_argv)
      fppr_output_as_html = prefix_as_decoy_to_html(out_string)
    end

    case SpecID.file_type(file)
    when "protproph"
      #spec_id = SpecID.new(file)
      proph_output(file, outfn, opt, fppr_output_as_html)
    when "bioworks"
      spec_id = SpecID.new(file)

      false_regex = flag_to_regex(opt.f, opt.prefix)
      bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
    else 
      abort "filetype for #{file} not recognized!"
    end
  end

end

#create_precision_argv(file, opt) ⇒ `Object`

method create_from_command_line

# File 'lib/spec_id/protein_summary.rb', line 474

def create_precision_argv(file, opt)
  # include only those options specific
  new_argv = [file]
  if opt.prefix ; new_argv << '--prefix' end
  if opt.f ; new_argv << '-f' << opt.f end
  if opt.o ; new_argv << '-o' << opt.o end
  new_argv
end

#error_info(prot_file_name) ⇒ `Object`

Takes the -prot.xml filename and grabs the png file (if available)

# File 'lib/spec_id/protein_summary.rb', line 134

def error_info(prot_file_name)
  img = prot_file_name.gsub('.xml', '.png')
  img_bn = File.basename(img)
    "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
end

#file_as_decoy_to_html(string) ⇒ `Object`

transforms the output string of file_as_decoy into html

# File 'lib/spec_id/protein_summary.rb', line 391

def file_as_decoy_to_html(string)  
  lines = string.split("\n")
  #puts lines ?? is this supposed to be commented out?
  lines = lines.reject do |obj| obj =~ /\*{10}/ end
  lines.map! do |line| "#{line}<br/>" end
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  #{lines.join("\n")} 
  </div>"
end

#file_info(file) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 246

def file_info(file)
  "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
  <br/>Last Modified: #{File.mtime(file)}
  <br/>Size: #{File.size(file)/1000} KB
  </div>"
end

#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ `Object`

filters on the false positive regex and sorts by prot probability

# File 'lib/spec_id/protein_summary.rb', line 172

def filter_and_sort(uniq_prots, flag=nil, prefix=false)
  false_flag_re = flag_to_regex(flag, prefix) 
  sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
  ## filter on prefix
  if prefix
    sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
  end
  sorted
end

#flag_to_regex(flag, prefix = false) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 149

def flag_to_regex(flag, prefix=false)
  if flag
    if prefix
      /^#{Regexp.escape(flag)}/
    else
      /#{Regexp.escape(flag)}/
    end
  else
    nil
  end
end

#mspire_version ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 275

def mspire_version
  string = "mspire"
  begin
    if `gem list --local mspire` =~ /mspire \((.*?)\)/
      string << (" v" + $1)
    end
  rescue Exception
  end
  string
end

#num_prots_above_fppr(prots, desired_fppr) ⇒ `Object`

assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr]

# File 'lib/spec_id/protein_summary.rb', line 185

def num_prots_above_fppr(prots, desired_fppr)
  current_fppr_rate_percent = 0.0
  previous_fppr_rate_percent = 0.0
  current_sum_one_minus_prob = 0.0
  proteins_within_fppr = 0
  actual_fppr = nil
  already_found = false
  prot_cnt = 0
  prots.each do |prot|
    prot_cnt += 1
    # SUM(1-probX)/#prots
    current_sum_one_minus_prob += 1.0 - prot._probability.to_f
    current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100

    if current_fppr_rate_percent > desired_fppr && !already_found
      actual_fppr = previous_fppr_rate_percent
      proteins_within_fppr = prot_cnt
      already_found = true
    end
    previous_fppr_rate_percent = current_fppr_rate_percent
  end
  [proteins_within_fppr, actual_fppr]
end

#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ `Object`

bioworks_output

# File 'lib/spec_id/protein_summary.rb', line 380

def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
  actual_cutoff = sprintf("%.3f", actual_cutoff)
  desired_cutoff = sprintf("%.3f", desired_cutoff)
  "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3>
  Desired FPPR: #{desired_cutoff} %<br/>
  Actual FPPR: #{actual_cutoff} %<br/>
  Number of Proteins at Actual FPPR: #{num_proteins}
  </div>"
end

#output_peptide_counts_file(prots, filename) ⇒ `Object`

given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found

# File 'lib/spec_id/protein_summary.rb', line 163

def output_peptide_counts_file(prots, filename)
  File.open(filename, "w") do |fh_out|
    prots.each do |prot|
      fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
    end
  end
end

#peptide_cell(prot_num, peptide_sequences) ⇒ `Object`

given a list of peptide sequences creates javascript to hide/show them



343
344
345

# File 'lib/spec_id/protein_summary.rb', line 343

def peptide_cell(prot_num, peptide_sequences)
  "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
end

#prefix_as_decoy_to_html(string) ⇒ `Object`

transforms the output string of file_as_decoy into html

# File 'lib/spec_id/protein_summary.rb', line 403

def prefix_as_decoy_to_html(string)  
  "<div class=\"fppr\">
  <h3>Classification Analysis</h3>
  </div>" +
  string
end

#print_html_pieces(file, *pieces) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 238

def print_html_pieces(file, *pieces)
  File.open(file, "w") do |out|
    pieces.each do |piece|
      out.print piece
    end
  end
end

#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 290

def proph_output(file, outfn, opt, fppr_output_as_html)
  header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'),  at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
  num_cols = header_anchors.size
  theaders = ths(header_anchors)

  root = AXML.parse_file(file)
  prots = []
  ## find the min_prob at a fppr of XX
  min_prob_redline = 1.01  # if no fppr is less than what they give, then all are redlined!

  if opt.c 
    actual_percent_fp = opt.c.to_f
  elsif opt.cut_at
    actual_percent_fp = opt.cut_at.to_f
  else
    actual_percent_fp = nil
  end
  root.protein_group.each do |group|
    group.protein.each do |prt|
      prots << prt 
    end
  end
  uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
  filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)

  ## num proteins above cutoff (if opt.c)
  num_prots_html = ''
  if opt.c || opt.cut_at
    (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
    num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
  end
  if opt.cut_at
    filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
  end

  output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count

  # get an array of annotations (or nil if no option)
  annotations =
    if opt.get_annotation
      gis = filtered_sorted_prots.map {|prot| accession(prot._protein_name) }
      GI.gi2annot(gis) 
    end

  table_string = table do 
    tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, annotations, opt.peptide_count)
  end
  er_info = opt.precision ? error_info(file) : ""
  html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
  print_html_pieces(*html_pieces)
end

#protproph_script_info ⇒ `Object`

# File 'lib/spec_id/protein_summary.rb', line 261

def protproph_script_info
  begin
    where = `which xinteract`
    reply = `#{where}`
  rescue Exception
    reply = ""
  end
  prophet = "TPP (version unknown)"  # put your version here if you can't get it dynamically
  if reply =~ /xinteract.*?\((TPP .*)\)/
    prophet = $1.dup
  end
  script_info { "ProteinProphet from: #{prophet}" }
end

#ref_html(gi, name) ⇒ `Object`



129
130
131

# File 'lib/spec_id/protein_summary.rb', line 129

def ref_html(gi, name)
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
end

#script_info ⇒ `Object`



286
287
288

# File 'lib/spec_id/protein_summary.rb', line 286

def script_info
  "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
end

#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ `Object`

returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given)

# File 'lib/spec_id/protein_summary.rb', line 214

def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations=nil, peptide_count_filename=nil)
  prot_cnt = 0
  an_cnt = 0
 
  uniq_prots.map do |prot| 
    tr do
      prot_cnt += 1
      gi = accession(prot._protein_name)

      if annotations
        protein_description = annotations[an_cnt]
        an_cnt += 1
      else
        if prot.annotation.size > 0
          protein_description = prot.annotation.first._protein_description
        else
          protein_description = 'NA'
        end
      end
      tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
    end
  end.join
end

Class: ProteinSummary

Defined Under Namespace

Instance Method Summary collapse

Methods included from HTML

Instance Method Details

#accession(name) ⇒ Object

#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ Object

#bioworks_script_info(obj) ⇒ Object

#create_from_command_line_args(argv) ⇒ Object

#create_precision_argv(file, opt) ⇒ Object

#error_info(prot_file_name) ⇒ Object

#file_as_decoy_to_html(string) ⇒ Object

#file_info(file) ⇒ Object

#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ Object

#flag_to_regex(flag, prefix = false) ⇒ Object

#mspire_version ⇒ Object

#num_prots_above_fppr(prots, desired_fppr) ⇒ Object

#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object

#output_peptide_counts_file(prots, filename) ⇒ Object

#peptide_cell(prot_num, peptide_sequences) ⇒ Object

#prefix_as_decoy_to_html(string) ⇒ Object

#print_html_pieces(file, *pieces) ⇒ Object

#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object

#protproph_script_info ⇒ Object

#ref_html(gi, name) ⇒ Object

#script_info ⇒ Object

#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ Object