Class: ProteinSummary
- Inherits:
-
Object
- Object
- ProteinSummary
- Includes:
- HTML
- Defined in:
- lib/spec_id/protein_summary.rb,
lib/spec_id/protein_summary.rb
Defined Under Namespace
Modules: HTML
Instance Method Summary collapse
-
#accession(name) ⇒ Object
attempts to get the NCBI gi code.
-
#bioworks_output(spec_id, outfn, file = nil, false_prefix = nil, fppr_output_as_html = nil) ⇒ Object
takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix.
- #bioworks_script_info(obj) ⇒ Object
- #create_from_command_line_args(argv) ⇒ Object
-
#create_precision_argv(file, opt) ⇒ Object
method create_from_command_line.
-
#error_info(prot_file_name) ⇒ Object
Takes the -prot.xml filename and grabs the png file (if available).
-
#file_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html.
- #file_info(file) ⇒ Object
-
#filter_and_sort(uniq_prots, prefix = nil) ⇒ Object
filters on the false positive regex and sorts by prot probability.
- #mspire_version ⇒ Object
-
#num_prots_above_fppr(prots, desired_fppr) ⇒ Object
assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr].
-
#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object
bioworks_output.
-
#output_peptide_counts_file(prots, filename) ⇒ Object
given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found.
-
#peptide_cell(prot_num, peptide_sequences) ⇒ Object
given a list of peptide sequences creates javascript to hide/show them.
-
#prefix_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html.
- #prefix_to_regex(prefix) ⇒ Object
- #print_html_pieces(file, *pieces) ⇒ Object
- #proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object
- #protproph_script_info ⇒ Object
- #ref_html(gi, name) ⇒ Object
- #script_info ⇒ Object
-
#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename = nil) ⇒ Object
returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given).
Methods included from HTML
#at, #header, #style, #table, #tds, #ths, #tr, #trailer
Instance Method Details
#accession(name) ⇒ Object
attempts to get the NCBI gi code
140 141 142 143 144 145 146 |
# File 'lib/spec_id/protein_summary.rb', line 140 def accession(name) if (name.include? '|') && (name[0,3] == 'gi|') name.split('|')[1] else name end end |
#bioworks_output(spec_id, outfn, file = nil, false_prefix = nil, fppr_output_as_html = nil) ⇒ Object
takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 |
# File 'lib/spec_id/protein_summary.rb', line 325 def bioworks_output(spec_id, outfn, file=nil, false_prefix=nil, fppr_output_as_html=nil) fppr_output_as_html ||= '' header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')] num_cols = header_anchors.size theaders = ths(header_anchors) proteins = spec_id.prots protein_num = 0 rows = "" prefix_re = prefix_to_regex(false_prefix) proteins.each do |prot| if false_prefix && prot.reference =~ prefix_re next end uniq_peps = Hash.new {|h,k| h[k] = true; } protein_num += 1 prot.peps.each do |pep| uniq_peps[pep.sequence.split('.')[1]] = true end pieces = prot.reference.split(' ') long_prot_name = pieces.shift annotation = pieces.join(' ') accession = prot.accession if accession == '0' ; accession = long_prot_name end rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) } end table_string = table do tr{theaders} + rows end print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer) end |
#bioworks_script_info(obj) ⇒ Object
235 236 237 238 239 240 241 |
# File 'lib/spec_id/protein_summary.rb', line 235 def bioworks_script_info(obj) version = "3.2??" if obj.version version = obj.version end script_info{"Bioworks version #{version}"} end |
#create_from_command_line_args(argv) ⇒ Object
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 |
# File 'lib/spec_id/protein_summary.rb', line 386 def create_from_command_line_args(argv) @orig_argv = argv.dup opt = OpenStruct.new opt.f = DEF_PREFIX opts = OptionParser.new do |op| op. = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..." op.separator " where file = bioworks -or- <run>-prot (prophet output)" op.separator " outputs: <file>.summary.html" op.separator "" op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v } op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v } op.separator(" if --precision then -f is used to specify a file or prefix") op.separator(" that indicates the false positives.") op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v} op.separator "" op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :" op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v} op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details") op.separator "" op.separator "Specific to ProteinProphet (with no concatenated DB):" op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v } op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v } end opts.parse!(argv) if argv.size < 1 puts opts return end fppr_output_as_html = '' files = argv.to_a files.each do |file| outfn = file.sub(/\.xml$/, '.summary.html') outfn = outfn.sub(/\.srg$/, '.summary.html') ## False Positive Rate Calculation: if opt.precision opt.o = outfn # won't actually be written over, but used to_use_argv = create_precision_argv(file, opt) (out_string, opt) = Prec.new.precision(to_use_argv) fppr_output_as_html = prefix_as_decoy_to_html(out_string) end case SpecID.file_type(file) when "protproph" #spec_id = SpecID.new(file) proph_output(file, outfn, opt, fppr_output_as_html) when "bioworks" spec_id = SpecID.new(file) bioworks_output(spec_id, outfn, file, opt.f, fppr_output_as_html) else abort "filetype for #{file} not recognized!" end end end |
#create_precision_argv(file, opt) ⇒ Object
method create_from_command_line
445 446 447 448 449 450 451 |
# File 'lib/spec_id/protein_summary.rb', line 445 def create_precision_argv(file, opt) # include only those options specific new_argv = [file] if opt.f ; new_argv << '-f' << opt.f end if opt.o ; new_argv << '-o' << opt.o end new_argv end |
#error_info(prot_file_name) ⇒ Object
Takes the -prot.xml filename and grabs the png file (if available)
133 134 135 136 137 |
# File 'lib/spec_id/protein_summary.rb', line 133 def error_info(prot_file_name) img = prot_file_name.gsub('.xml', '.png') img_bn = File.basename(img) "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>" end |
#file_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html
367 368 369 370 371 372 373 374 375 376 |
# File 'lib/spec_id/protein_summary.rb', line 367 def file_as_decoy_to_html(string) lines = string.split("\n") #puts lines ?? is this supposed to be commented out? lines = lines.reject do |obj| obj =~ /\*{10}/ end lines.map! do |line| "#{line}<br/>" end "<div class=\"fppr\"> <h3>Classification Analysis</h3> #{lines.join("\n")} </div>" end |
#file_info(file) ⇒ Object
228 229 230 231 232 233 |
# File 'lib/spec_id/protein_summary.rb', line 228 def file_info(file) "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)} <br/>Last Modified: #{File.mtime(file)} <br/>Size: #{File.size(file)/1000} KB </div>" end |
#filter_and_sort(uniq_prots, prefix = nil) ⇒ Object
filters on the false positive regex and sorts by prot probability
167 168 169 170 171 172 173 174 175 |
# File 'lib/spec_id/protein_summary.rb', line 167 def filter_and_sort(uniq_prots, prefix=nil) prefix_re = prefix_to_regex(prefix) sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse ## filter on prefix if prefix sorted = sorted.reject {|prot| prot._protein_name =~ prefix_re } end sorted end |
#mspire_version ⇒ Object
257 258 259 260 261 262 263 264 265 266 |
# File 'lib/spec_id/protein_summary.rb', line 257 def mspire_version string = "mspire" begin if `gem list --local mspire` =~ /mspire \((.*?)\)/ string << (" v" + $1) end rescue Exception end string end |
#num_prots_above_fppr(prots, desired_fppr) ⇒ Object
assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr]
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/spec_id/protein_summary.rb', line 180 def num_prots_above_fppr(prots, desired_fppr) current_fppr_rate_percent = 0.0 previous_fppr_rate_percent = 0.0 current_sum_one_minus_prob = 0.0 proteins_within_fppr = 0 actual_fppr = nil already_found = false prot_cnt = 0 prots.each do |prot| prot_cnt += 1 # SUM(1-probX)/#prots current_sum_one_minus_prob += 1.0 - prot._probability.to_f current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100 if current_fppr_rate_percent > desired_fppr && !already_found actual_fppr = previous_fppr_rate_percent proteins_within_fppr = prot_cnt already_found = true end previous_fppr_rate_percent = current_fppr_rate_percent end [proteins_within_fppr, actual_fppr] end |
#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object
bioworks_output
356 357 358 359 360 361 362 363 364 |
# File 'lib/spec_id/protein_summary.rb', line 356 def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) actual_cutoff = sprintf("%.3f", actual_cutoff) desired_cutoff = sprintf("%.3f", desired_cutoff) "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3> Desired FPPR: #{desired_cutoff} %<br/> Actual FPPR: #{actual_cutoff} %<br/> Number of Proteins at Actual FPPR: #{num_proteins} </div>" end |
#output_peptide_counts_file(prots, filename) ⇒ Object
given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found
158 159 160 161 162 163 164 |
# File 'lib/spec_id/protein_summary.rb', line 158 def output_peptide_counts_file(prots, filename) File.open(filename, "w") do |fh_out| prots.each do |prot| fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t") end end end |
#peptide_cell(prot_num, peptide_sequences) ⇒ Object
given a list of peptide sequences creates javascript to hide/show them
318 319 320 |
# File 'lib/spec_id/protein_summary.rb', line 318 def peptide_cell(prot_num, peptide_sequences) "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>" end |
#prefix_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html
379 380 381 382 383 384 |
# File 'lib/spec_id/protein_summary.rb', line 379 def prefix_as_decoy_to_html(string) "<div class=\"fppr\"> <h3>Classification Analysis</h3> </div>" + string end |
#prefix_to_regex(prefix) ⇒ Object
148 149 150 151 152 153 154 |
# File 'lib/spec_id/protein_summary.rb', line 148 def prefix_to_regex(prefix) if prefix /^#{Regexp.escape(prefix)}/ else nil end end |
#print_html_pieces(file, *pieces) ⇒ Object
220 221 222 223 224 225 226 |
# File 'lib/spec_id/protein_summary.rb', line 220 def print_html_pieces(file, *pieces) File.open(file, "w") do |out| pieces.each do |piece| out.print piece end end end |
#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 |
# File 'lib/spec_id/protein_summary.rb', line 272 def proph_output(file, outfn, opt, fppr_output_as_html) header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')] num_cols = header_anchors.size theaders = ths(header_anchors) root = AXML.parse_file(file) prots = [] ## find the min_prob at a fppr of XX min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined! if opt.c actual_percent_fp = opt.c.to_f elsif opt.cut_at actual_percent_fp = opt.cut_at.to_f else actual_percent_fp = nil end root.protein_group.each do |group| group.protein.each do |prt| prots << prt end end uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first } filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f) ## num proteins above cutoff (if opt.c) num_prots_html = '' if opt.c || opt.cut_at (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp) num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots) end if opt.cut_at filtered_sorted_prots = filtered_sorted_prots[0,num_prots] end output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count table_string = table do tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count) end er_info = opt.precision ? error_info(file) : "" html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer] print_html_pieces(*html_pieces) end |
#protproph_script_info ⇒ Object
243 244 245 246 247 248 249 250 251 252 253 254 255 |
# File 'lib/spec_id/protein_summary.rb', line 243 def protproph_script_info begin where = `which xinteract` reply = `#{where}` rescue Exception reply = "" end prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically if reply =~ /xinteract.*?\((TPP .*)\)/ prophet = $1.dup end script_info { "ProteinProphet from: #{prophet}" } end |
#ref_html(gi, name) ⇒ Object
128 129 130 |
# File 'lib/spec_id/protein_summary.rb', line 128 def ref_html(gi, name) "<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>" end |
#script_info ⇒ Object
268 269 270 |
# File 'lib/spec_id/protein_summary.rb', line 268 def script_info "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>" end |
#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename = nil) ⇒ Object
returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given)
209 210 211 212 213 214 215 216 217 218 |
# File 'lib/spec_id/protein_summary.rb', line 209 def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil) prot_cnt = 0 uniq_prots.map do |prot| tr do prot_cnt += 1 gi = accession(prot._protein_name) tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids]) end end.join end |