Class: MSAbundanceSim
- Inherits:
-
Object
- Object
- MSAbundanceSim
- Defined in:
- lib/ms_abundance_sim.rb,
lib/ms_abundance_sim.rb
Defined Under Namespace
Classes: Commandline
Constant Summary collapse
- VERSION =
"0.4.0"
- DEFAULTS =
{ num_control: 5, num_case: 5, pct_diff_express: 3.0, control_variance: 1, case_variance: 2, output_abundance_separator: " #", downshift_min_threshold: 1.0, downshift_probability_threshold: 0.75, }
Class Method Summary collapse
-
.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ Object
takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).
- .get_fold_change(a_i, event_rate, max_abundance) ⇒ Object
-
.inverse_transform_sample(event_rate) ⇒ Object
probability (y).
-
.poisson(event_rate, num_occurences) ⇒ Object
event_rate (lambda) num_occurences (k).
-
.process_files(filenames, opts) ⇒ Object
returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’).
- .sample_abundance(abundances, fold_change) ⇒ Object
Instance Method Summary collapse
- #get_protein_entries(filename) ⇒ Object
-
#parse_entry_line(line) ⇒ Object
returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’).
- #process_file(filename, opts) ⇒ Object
Class Method Details
.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ Object
takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).
83 84 85 86 87 88 89 |
# File 'lib/ms_abundance_sim.rb', line 83 def downshift(value, min_threshold, probability_threshold, random=rand()) if (value >= min_threshold) && (random < probability_threshold) value - rand(min_threshold..value) else value end end |
.get_fold_change(a_i, event_rate, max_abundance) ⇒ Object
70 71 72 73 74 75 76 77 |
# File 'lib/ms_abundance_sim.rb', line 70 def get_fold_change(a_i, event_rate, max_abundance) raw_fc = inverse_transform_sample(event_rate) inv_share = (1.0-(a_i[0].to_f / max_abundance)) norm_fc = raw_fc * inv_share trans = rand(0..inv_share) sign = [1,-1].sample return (norm_fc + trans) * sign end |
.inverse_transform_sample(event_rate) ⇒ Object
probability (y)
59 60 61 62 63 64 65 66 67 68 |
# File 'lib/ms_abundance_sim.rb', line 59 def inverse_transform_sample(event_rate) max = poisson(event_rate, event_rate) probability = rand * (max) list_of_num_occurrences = (0..event_rate*4).to_a.shuffle # attempt to capture the entire distribution without wasting tests list_of_num_occurrences.each do |num_occurences| p_num_occurences = poisson(event_rate, num_occurences) return num_occurences if probability < p_num_occurences end puts "Problem: Should have found a num_occurences in inverse_transform_sample" end |
.poisson(event_rate, num_occurences) ⇒ Object
event_rate (lambda) num_occurences (k)
54 55 56 |
# File 'lib/ms_abundance_sim.rb', line 54 def poisson(event_rate, num_occurences) return event_rate**num_occurences * Math.exp(-event_rate) / num_occurences.factorial.to_f end |
.process_files(filenames, opts) ⇒ Object
returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’)
45 46 47 48 49 50 |
# File 'lib/ms_abundance_sim.rb', line 45 def process_files(filenames, opts) outputs = filenames.map do |filename| MSAbundanceSim.new.process_file(filename, opts) end filenames.zip(outputs).to_h end |
.sample_abundance(abundances, fold_change) ⇒ Object
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/ms_abundance_sim.rb', line 91 def sample_abundance(abundances, fold_change) abundance = 0.0 if abundances.size > 1 # linear interpolation idx = [1..abundances.size-1].sample(1) next_idx = idx+1 < abundances.size ? idx+1 : abundances[0] # edge case: 2 abundances abundance = abundances[idx] + rand * (abundances[next_idx] - abundances[idx]) else # sample w/variance if fold_change >= 0 # positive fold change abundance = abundances[0].to_f * fold_change + abundances[0] else # negative fold change fold_change *= -1 abundance = abundances[0] / (2.0**fold_change) end end return abundance + [1,-1].sample * rand * 0.1 * abundance end |
Instance Method Details
#get_protein_entries(filename) ⇒ Object
166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/ms_abundance_sim.rb', line 166 def get_protein_entries(filename) protein_entries = [] IO.foreach(filename) do |line| line.chomp! if line[0] == ">" protein_entries << ProteinEntry.new(*parse_entry_line(line)) else protein_entries.last.additional_lines << line end end protein_entries end |
#parse_entry_line(line) ⇒ Object
returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’)
158 159 160 161 162 163 164 |
# File 'lib/ms_abundance_sim.rb', line 158 def parse_entry_line(line) octothorpe_index = line.rindex("#") entry_line_wo_abundance = line[0...octothorpe_index] abundance_str = line[(octothorpe_index+1)..-1] abundances = abundance_str.split(",").map(&:to_f).sort [entry_line_wo_abundance, abundances] end |
#process_file(filename, opts) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/ms_abundance_sim.rb', line 109 def process_file(filename, opts) opts = DEFAULTS.merge(opts) basename = filename.chomp(File.extname(filename)) protein_entries = get_protein_entries(filename) max_abundance = protein_entries.max_by {|entry| entry.abundances.last }.abundances.last # generate which proteins will be differentially expressed num_proteins_to_diff_express = (protein_entries.size * opts[:pct_diff_express]/100.0).to_i diff_expressed_ids = (0...protein_entries.size).to_a.sample(num_proteins_to_diff_express).to_set output = Hash.new {|hash, key| hash[key] = [] } case_and_controls_needed = ([:case] * opts[:num_case]) + ([:control] * opts[:num_control]) case_and_controls_needed.each_with_index do |sample_type, sample_number| # for each sample outfilename = "#{basename}_#{sample_number}_#{sample_type}" output[sample_type] << outfilename File.open(outfilename, 'w') do |outfile| protein_entries.each_with_index do |protein_entry, idx| # put first line of fasta with simulated abundance variance = opts[ if sample_type == :case && diff_expressed_ids.include?(idx) :case_variance else :control_variance end ] fold_change = MSAbundanceSim.get_fold_change(protein_entry.abundances, variance, max_abundance) downshift_params = %w(min_threshold probability_threshold) .map {|key| 'downshift_' + key } .map(&:to_sym) .map {|key| opts[key] } downshifted_fold_change = MSAbundanceSim.downshift(fold_change, *downshift_params) sample_abundance = MSAbundanceSim.sample_abundance(protein_entry.abundances, downshifted_fold_change) outfile.puts [protein_entry.entry_line_wo_abundance, sample_abundance].join(opts[:output_abundance_separator]) outfile.puts protein_entry.additional_lines.join("\n") end end end output end |