Class: MSAbundanceSim

Inherits:
Object
  • Object
show all
Defined in:
lib/ms_abundance_sim.rb,
lib/ms_abundance_sim.rb

Defined Under Namespace

Classes: Commandline

Constant Summary collapse

VERSION =
"0.4.0"
DEFAULTS =
{
  num_control: 5,
  num_case: 5,
  pct_diff_express: 3.0,
  control_variance: 1,
  case_variance: 2,
  output_abundance_separator: " #",
  downshift_min_threshold: 1.0,
  downshift_probability_threshold: 0.75,
}

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ Object

takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).



83
84
85
86
87
88
89
# File 'lib/ms_abundance_sim.rb', line 83

def downshift(value, min_threshold, probability_threshold, random=rand())
  if (value >= min_threshold) && (random < probability_threshold)
    value - rand(min_threshold..value)
  else
    value
  end
end

.get_fold_change(a_i, event_rate, max_abundance) ⇒ Object



70
71
72
73
74
75
76
77
# File 'lib/ms_abundance_sim.rb', line 70

def get_fold_change(a_i, event_rate, max_abundance)
  raw_fc = inverse_transform_sample(event_rate)
  inv_share = (1.0-(a_i[0].to_f / max_abundance))
  norm_fc = raw_fc * inv_share
  trans = rand(0..inv_share)
  sign = [1,-1].sample
  return (norm_fc + trans) * sign
end

.inverse_transform_sample(event_rate) ⇒ Object

probability (y)



59
60
61
62
63
64
65
66
67
68
# File 'lib/ms_abundance_sim.rb', line 59

def inverse_transform_sample(event_rate)
  max = poisson(event_rate, event_rate)
  probability = rand * (max)
  list_of_num_occurrences = (0..event_rate*4).to_a.shuffle # attempt to capture the entire distribution without wasting tests
  list_of_num_occurrences.each do |num_occurences|
    p_num_occurences = poisson(event_rate, num_occurences)
    return num_occurences if probability < p_num_occurences
  end
  puts "Problem: Should have found a num_occurences in inverse_transform_sample"
end

.poisson(event_rate, num_occurences) ⇒ Object

event_rate (lambda) num_occurences (k)



54
55
56
# File 'lib/ms_abundance_sim.rb', line 54

def poisson(event_rate, num_occurences)
  return event_rate**num_occurences * Math.exp(-event_rate) / num_occurences.factorial.to_f
end

.process_files(filenames, opts) ⇒ Object

returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’)



45
46
47
48
49
50
# File 'lib/ms_abundance_sim.rb', line 45

def process_files(filenames, opts)
  outputs = filenames.map do |filename|
    MSAbundanceSim.new.process_file(filename, opts)
  end
  filenames.zip(outputs).to_h
end

.sample_abundance(abundances, fold_change) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/ms_abundance_sim.rb', line 91

def sample_abundance(abundances, fold_change)
  abundance = 0.0
  if abundances.size > 1 # linear interpolation
    idx = [1..abundances.size-1].sample(1)
    next_idx = idx+1 < abundances.size ? idx+1 : abundances[0] # edge case: 2 abundances
    abundance = abundances[idx] + rand * (abundances[next_idx] - abundances[idx])
  else # sample w/variance
    if fold_change >= 0 # positive fold change
      abundance = abundances[0].to_f * fold_change + abundances[0]
    else # negative fold change
      fold_change *= -1
      abundance = abundances[0] / (2.0**fold_change)
    end
  end
  return abundance + [1,-1].sample * rand * 0.1 * abundance
end

Instance Method Details

#get_protein_entries(filename) ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/ms_abundance_sim.rb', line 166

def get_protein_entries(filename)
  protein_entries = []
  IO.foreach(filename) do |line|
    line.chomp!
    if line[0] == ">"
      protein_entries << ProteinEntry.new(*parse_entry_line(line))
    else
      protein_entries.last.additional_lines << line
    end
  end
  protein_entries
end

#parse_entry_line(line) ⇒ Object

returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’)



158
159
160
161
162
163
164
# File 'lib/ms_abundance_sim.rb', line 158

def parse_entry_line(line)
  octothorpe_index = line.rindex("#")
  entry_line_wo_abundance = line[0...octothorpe_index]
  abundance_str = line[(octothorpe_index+1)..-1]
  abundances = abundance_str.split(",").map(&:to_f).sort
  [entry_line_wo_abundance, abundances]
end

#process_file(filename, opts) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/ms_abundance_sim.rb', line 109

def process_file(filename, opts)
  opts = DEFAULTS.merge(opts)

  basename = filename.chomp(File.extname(filename))

  protein_entries = get_protein_entries(filename)
  max_abundance = protein_entries.max_by {|entry| entry.abundances.last }.abundances.last

  # generate which proteins will be differentially expressed
  num_proteins_to_diff_express = (protein_entries.size * opts[:pct_diff_express]/100.0).to_i
  diff_expressed_ids = (0...protein_entries.size).to_a.sample(num_proteins_to_diff_express).to_set

  output = Hash.new {|hash, key| hash[key] = [] }
  case_and_controls_needed = ([:case] * opts[:num_case]) + ([:control] * opts[:num_control])
  case_and_controls_needed.each_with_index do |sample_type, sample_number| # for each sample
    outfilename = "#{basename}_#{sample_number}_#{sample_type}"
    output[sample_type] << outfilename
    File.open(outfilename, 'w') do |outfile|
      protein_entries.each_with_index do |protein_entry, idx|
        # put first line of fasta with simulated abundance
        variance = opts[
          if sample_type == :case && diff_expressed_ids.include?(idx)
            :case_variance
          else
            :control_variance
          end
        ]

        fold_change = MSAbundanceSim.get_fold_change(protein_entry.abundances, variance, max_abundance)

        downshift_params = %w(min_threshold probability_threshold)
          .map {|key| 'downshift_' + key }
          .map(&:to_sym)
          .map {|key| opts[key] }

        downshifted_fold_change = MSAbundanceSim.downshift(fold_change, *downshift_params)

        sample_abundance = MSAbundanceSim.sample_abundance(protein_entry.abundances, downshifted_fold_change)
        outfile.puts [protein_entry.entry_line_wo_abundance, sample_abundance].join(opts[:output_abundance_separator])
        outfile.puts protein_entry.additional_lines.join("\n")
      end
    end
  end
  output
end