Class: ProteinSummary
- Inherits:
-
Object
- Object
- ProteinSummary
- Includes:
- HTML
- Defined in:
- lib/spec_id/protein_summary.rb,
lib/spec_id/protein_summary.rb
Defined Under Namespace
Modules: HTML
Instance Method Summary collapse
-
#accession(name) ⇒ Object
attempts to get the NCBI gi code.
-
#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ Object
takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix.
- #bioworks_script_info(obj) ⇒ Object
- #create_from_command_line_args(argv) ⇒ Object
-
#create_precision_argv(file, opt) ⇒ Object
method create_from_command_line.
-
#error_info(prot_file_name) ⇒ Object
Takes the -prot.xml filename and grabs the png file (if available).
-
#file_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html.
- #file_info(file) ⇒ Object
-
#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ Object
filters on the false positive regex and sorts by prot probability.
- #flag_to_regex(flag, prefix = false) ⇒ Object
- #mspire_version ⇒ Object
-
#num_prots_above_fppr(prots, desired_fppr) ⇒ Object
assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr].
-
#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object
bioworks_output.
-
#output_peptide_counts_file(prots, filename) ⇒ Object
given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found.
-
#peptide_cell(prot_num, peptide_sequences) ⇒ Object
given a list of peptide sequences creates javascript to hide/show them.
-
#prefix_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html.
- #print_html_pieces(file, *pieces) ⇒ Object
- #proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object
- #protproph_script_info ⇒ Object
- #ref_html(gi, name) ⇒ Object
- #script_info ⇒ Object
-
#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ Object
returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given).
Methods included from HTML
#at, #header, #style, #table, #tds, #ths, #tr, #trailer
Instance Method Details
#accession(name) ⇒ Object
attempts to get the NCBI gi code
141 142 143 144 145 146 147 |
# File 'lib/spec_id/protein_summary.rb', line 141 def accession(name) if (name.include? '|') && (name[0,3] == 'gi|') name.split('|')[1] else name end end |
#bioworks_output(spec_id, outfn, file = nil, false_flag_re = nil, fppr_output_as_html = nil) ⇒ Object
takes spec_id object the outfn is the output filename opt is an OpenStruct that holds opt.f = the false prefix
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 |
# File 'lib/spec_id/protein_summary.rb', line 350 def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil) fppr_output_as_html ||= '' header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')] num_cols = header_anchors.size theaders = ths(header_anchors) proteins = spec_id.prots protein_num = 0 rows = "" proteins.each do |prot| if false_flag_re && prot.reference =~ false_flag_re next end uniq_peps = Hash.new {|h,k| h[k] = true; } protein_num += 1 prot.peps.each do |pep| uniq_peps[pep.sequence.split('.')[1]] = true end pieces = prot.reference.split(' ') long_prot_name = pieces.shift annotation = pieces.join(' ') accession = prot.accession if accession == '0' ; accession = long_prot_name end rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) } end table_string = table do tr{theaders} + rows end print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer) end |
#bioworks_script_info(obj) ⇒ Object
253 254 255 256 257 258 259 |
# File 'lib/spec_id/protein_summary.rb', line 253 def bioworks_script_info(obj) version = "3.2??" if obj.version version = obj.version end script_info{"Bioworks version #{version}"} end |
#create_from_command_line_args(argv) ⇒ Object
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 |
# File 'lib/spec_id/protein_summary.rb', line 410 def create_from_command_line_args(argv) @orig_argv = argv.dup opt = OpenStruct.new opt.f = DEF_PREFIX opts = OptionParser.new do |op| op. = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..." op.separator " where file = bioworks -or- <run>-prot (prophet output)" op.separator " outputs: <file>.summary.html" op.separator "" op.on("-f", "--false <prefix>", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v } op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v } op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v } op.separator(" if --precision then -f is used to specify a file or prefix") op.separator(" that indicates the false positives.") op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v} op.separator "" op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :" op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v} op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details") op.separator "" op.separator "specific to ProteinProphet (with no concatenated DB):" op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v } op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v } op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v} op.separator " (use if your proteins have gi's but no annotation) " end opts.parse!(argv) if argv.size < 1 puts opts return end fppr_output_as_html = '' files = argv.to_a files.each do |file| outfn = file.sub(/\.xml$/, '.summary.html') outfn = outfn.sub(/\.srg$/, '.summary.html') ## False Positive Rate Calculation: if opt.precision opt.o = outfn # won't actually be written over, but used to_use_argv = create_precision_argv(file, opt) (out_string, opt) = Prec.new.precision(to_use_argv) fppr_output_as_html = prefix_as_decoy_to_html(out_string) end case SpecID.file_type(file) when "protproph" #spec_id = SpecID.new(file) proph_output(file, outfn, opt, fppr_output_as_html) when "bioworks" spec_id = SpecID.new(file) false_regex = flag_to_regex(opt.f, opt.prefix) bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html) else abort "filetype for #{file} not recognized!" end end end |
#create_precision_argv(file, opt) ⇒ Object
method create_from_command_line
474 475 476 477 478 479 480 481 |
# File 'lib/spec_id/protein_summary.rb', line 474 def create_precision_argv(file, opt) # include only those options specific new_argv = [file] if opt.prefix ; new_argv << '--prefix' end if opt.f ; new_argv << '-f' << opt.f end if opt.o ; new_argv << '-o' << opt.o end new_argv end |
#error_info(prot_file_name) ⇒ Object
Takes the -prot.xml filename and grabs the png file (if available)
134 135 136 137 138 |
# File 'lib/spec_id/protein_summary.rb', line 134 def error_info(prot_file_name) img = prot_file_name.gsub('.xml', '.png') img_bn = File.basename(img) "<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>" end |
#file_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html
391 392 393 394 395 396 397 398 399 400 |
# File 'lib/spec_id/protein_summary.rb', line 391 def file_as_decoy_to_html(string) lines = string.split("\n") #puts lines ?? is this supposed to be commented out? lines = lines.reject do |obj| obj =~ /\*{10}/ end lines.map! do |line| "#{line}<br/>" end "<div class=\"fppr\"> <h3>Classification Analysis</h3> #{lines.join("\n")} </div>" end |
#file_info(file) ⇒ Object
246 247 248 249 250 251 |
# File 'lib/spec_id/protein_summary.rb', line 246 def file_info(file) "<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.(file)} <br/>Last Modified: #{File.mtime(file)} <br/>Size: #{File.size(file)/1000} KB </div>" end |
#filter_and_sort(uniq_prots, flag = nil, prefix = false) ⇒ Object
filters on the false positive regex and sorts by prot probability
172 173 174 175 176 177 178 179 180 |
# File 'lib/spec_id/protein_summary.rb', line 172 def filter_and_sort(uniq_prots, flag=nil, prefix=false) false_flag_re = flag_to_regex(flag, prefix) sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse ## filter on prefix if prefix sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re } end sorted end |
#flag_to_regex(flag, prefix = false) ⇒ Object
149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/spec_id/protein_summary.rb', line 149 def flag_to_regex(flag, prefix=false) if flag if prefix /^#{Regexp.escape(flag)}/ else /#{Regexp.escape(flag)}/ end else nil end end |
#mspire_version ⇒ Object
275 276 277 278 279 280 281 282 283 284 |
# File 'lib/spec_id/protein_summary.rb', line 275 def mspire_version string = "mspire" begin if `gem list --local mspire` =~ /mspire \((.*?)\)/ string << (" v" + $1) end rescue Exception end string end |
#num_prots_above_fppr(prots, desired_fppr) ⇒ Object
assumes that these are sorted on probability desired_fppr is a float returns [number_of_prots, actual_fppr]
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/spec_id/protein_summary.rb', line 185 def num_prots_above_fppr(prots, desired_fppr) current_fppr_rate_percent = 0.0 previous_fppr_rate_percent = 0.0 current_sum_one_minus_prob = 0.0 proteins_within_fppr = 0 actual_fppr = nil already_found = false prot_cnt = 0 prots.each do |prot| prot_cnt += 1 # SUM(1-probX)/#prots current_sum_one_minus_prob += 1.0 - prot._probability.to_f current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100 if current_fppr_rate_percent > desired_fppr && !already_found actual_fppr = previous_fppr_rate_percent proteins_within_fppr = prot_cnt already_found = true end previous_fppr_rate_percent = current_fppr_rate_percent end [proteins_within_fppr, actual_fppr] end |
#num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) ⇒ Object
bioworks_output
380 381 382 383 384 385 386 387 388 |
# File 'lib/spec_id/protein_summary.rb', line 380 def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins) actual_cutoff = sprintf("%.3f", actual_cutoff) desired_cutoff = sprintf("%.3f", desired_cutoff) "<div class=\"num_proteins\"><h3>False Positive Predictive Rate [ FP/(TP+FP) ]</h3> Desired FPPR: #{desired_cutoff} %<br/> Actual FPPR: #{actual_cutoff} %<br/> Number of Proteins at Actual FPPR: #{num_proteins} </div>" end |
#output_peptide_counts_file(prots, filename) ⇒ Object
given a list of proteins, output a tab delimited textfile with protein name and the total number of peptides found
163 164 165 166 167 168 169 |
# File 'lib/spec_id/protein_summary.rb', line 163 def output_peptide_counts_file(prots, filename) File.open(filename, "w") do |fh_out| prots.each do |prot| fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t") end end end |
#peptide_cell(prot_num, peptide_sequences) ⇒ Object
given a list of peptide sequences creates javascript to hide/show them
343 344 345 |
# File 'lib/spec_id/protein_summary.rb', line 343 def peptide_cell(prot_num, peptide_sequences) "<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>" end |
#prefix_as_decoy_to_html(string) ⇒ Object
transforms the output string of file_as_decoy into html
403 404 405 406 407 408 |
# File 'lib/spec_id/protein_summary.rb', line 403 def prefix_as_decoy_to_html(string) "<div class=\"fppr\"> <h3>Classification Analysis</h3> </div>" + string end |
#print_html_pieces(file, *pieces) ⇒ Object
238 239 240 241 242 243 244 |
# File 'lib/spec_id/protein_summary.rb', line 238 def print_html_pieces(file, *pieces) File.open(file, "w") do |out| pieces.each do |piece| out.print piece end end end |
#proph_output(file, outfn, opt, fppr_output_as_html) ⇒ Object
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 |
# File 'lib/spec_id/protein_summary.rb', line 290 def proph_output(file, outfn, opt, fppr_output_as_html) header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')] num_cols = header_anchors.size theaders = ths(header_anchors) root = AXML.parse_file(file) prots = [] ## find the min_prob at a fppr of XX min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined! if opt.c actual_percent_fp = opt.c.to_f elsif opt.cut_at actual_percent_fp = opt.cut_at.to_f else actual_percent_fp = nil end root.protein_group.each do |group| group.protein.each do |prt| prots << prt end end uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first } filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix) ## num proteins above cutoff (if opt.c) num_prots_html = '' if opt.c || opt.cut_at (num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp) num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots) end if opt.cut_at filtered_sorted_prots = filtered_sorted_prots[0,num_prots] end output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count # get an array of annotations (or nil if no option) annotations = if opt.get_annotation gis = filtered_sorted_prots.map {|prot| accession(prot._protein_name) } GI.gi2annot(gis) end table_string = table do tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, annotations, opt.peptide_count) end er_info = opt.precision ? error_info(file) : "" html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer] print_html_pieces(*html_pieces) end |
#protproph_script_info ⇒ Object
261 262 263 264 265 266 267 268 269 270 271 272 273 |
# File 'lib/spec_id/protein_summary.rb', line 261 def protproph_script_info begin where = `which xinteract` reply = `#{where}` rescue Exception reply = "" end prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically if reply =~ /xinteract.*?\((TPP .*)\)/ prophet = $1.dup end script_info { "ProteinProphet from: #{prophet}" } end |
#ref_html(gi, name) ⇒ Object
129 130 131 |
# File 'lib/spec_id/protein_summary.rb', line 129 def ref_html(gi, name) "<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>" end |
#script_info ⇒ Object
286 287 288 |
# File 'lib/spec_id/protein_summary.rb', line 286 def script_info "<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>" end |
#table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations = nil, peptide_count_filename = nil) ⇒ Object
returns a string of the table rows false_positive_rate (give as a %) is the cutoff mark returns the number of proteins at the desired_fppr (if given)
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
# File 'lib/spec_id/protein_summary.rb', line 214 def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, annotations=nil, peptide_count_filename=nil) prot_cnt = 0 an_cnt = 0 uniq_prots.map do |prot| tr do prot_cnt += 1 gi = accession(prot._protein_name) if annotations protein_description = annotations[an_cnt] an_cnt += 1 else if prot.annotation.size > 0 protein_description = prot.annotation.first._protein_description else protein_description = 'NA' end end tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids]) end end.join end |