Class: MSAbundanceSim

Inherits:

Object

Object
MSAbundanceSim

show all

Defined in:: lib/ms_abundance_sim.rb,
lib/ms_abundance_sim.rb

Defined Under Namespace

Classes: Commandline

Constant Summary collapse

VERSION =

"0.4.0"

DEFAULTS =

{
  num_control: 5,
  num_case: 5,
  pct_diff_express: 3.0,
  control_variance: 1,
  case_variance: 2,
  output_abundance_separator: " #",
  downshift_min_threshold: 1.0,
  downshift_probability_threshold: 0.75,
}

Class Method Summary collapse

.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ Object

takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).
.get_fold_change(a_i, event_rate, max_abundance) ⇒ Object
.inverse_transform_sample(event_rate) ⇒ Object

probability (y).
.poisson(event_rate, num_occurences) ⇒ Object

event_rate (lambda) num_occurences (k).
.process_files(filenames, opts) ⇒ Object

returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’).
.sample_abundance(abundances, fold_change) ⇒ Object

Instance Method Summary collapse

#get_protein_entries(filename) ⇒ Object
#parse_entry_line(line) ⇒ Object

returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’).
#process_file(filename, opts) ⇒ Object

Class Method Details

.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ `Object`

takes ‘value’ and subtracts rand(min_threshold..value) from it if ‘value’ is >= min_threshold AND a random number is less than the probability_threshold (i.e., a probability of 1 means downshifting will happen to all values).

# File 'lib/ms_abundance_sim.rb', line 83

def downshift(value, min_threshold, probability_threshold, random=rand())
  if (value >= min_threshold) && (random < probability_threshold)
    value - rand(min_threshold..value)
  else
    value
  end
end

.get_fold_change(a_i, event_rate, max_abundance) ⇒ `Object`

# File 'lib/ms_abundance_sim.rb', line 70

def get_fold_change(a_i, event_rate, max_abundance)
  raw_fc = inverse_transform_sample(event_rate)
  inv_share = (1.0-(a_i[0].to_f / max_abundance))
  norm_fc = raw_fc * inv_share
  trans = rand(0..inv_share)
  sign = [1,-1].sample
  return (norm_fc + trans) * sign
end

.inverse_transform_sample(event_rate) ⇒ `Object`

probability (y)

# File 'lib/ms_abundance_sim.rb', line 59

def inverse_transform_sample(event_rate)
  max = poisson(event_rate, event_rate)
  probability = rand * (max)
  list_of_num_occurrences = (0..event_rate*4).to_a.shuffle # attempt to capture the entire distribution without wasting tests
  list_of_num_occurrences.each do |num_occurences|
    p_num_occurences = poisson(event_rate, num_occurences)
    return num_occurences if probability < p_num_occurences
  end
  puts "Problem: Should have found a num_occurences in inverse_transform_sample"
end

.poisson(event_rate, num_occurences) ⇒ `Object`

event_rate (lambda) num_occurences (k)



54
55
56

# File 'lib/ms_abundance_sim.rb', line 54

def poisson(event_rate, num_occurences)
  return event_rate**num_occurences * Math.exp(-event_rate) / num_occurences.factorial.to_f
end

.process_files(filenames, opts) ⇒ `Object`

returns a Hash keyed by filenames and pointing to the output of process_file (a list of case and control filenames, keyed by ‘case’ and ‘control’)

# File 'lib/ms_abundance_sim.rb', line 45

def process_files(filenames, opts)
  outputs = filenames.map do |filename|
    MSAbundanceSim.new.process_file(filename, opts)
  end
  filenames.zip(outputs).to_h
end

.sample_abundance(abundances, fold_change) ⇒ `Object`

# File 'lib/ms_abundance_sim.rb', line 91

def sample_abundance(abundances, fold_change)
  abundance = 0.0
  if abundances.size > 1 # linear interpolation
    idx = [1..abundances.size-1].sample(1)
    next_idx = idx+1 < abundances.size ? idx+1 : abundances[0] # edge case: 2 abundances
    abundance = abundances[idx] + rand * (abundances[next_idx] - abundances[idx])
  else # sample w/variance
    if fold_change >= 0 # positive fold change
      abundance = abundances[0].to_f * fold_change + abundances[0]
    else # negative fold change
      fold_change *= -1
      abundance = abundances[0] / (2.0**fold_change)
    end
  end
  return abundance + [1,-1].sample * rand * 0.1 * abundance
end

Instance Method Details

#get_protein_entries(filename) ⇒ `Object`

# File 'lib/ms_abundance_sim.rb', line 166

def get_protein_entries(filename)
  protein_entries = []
  IO.foreach(filename) do |line|
    line.chomp!
    if line[0] == ">"
      protein_entries << ProteinEntry.new(*parse_entry_line(line))
    else
      protein_entries.last.additional_lines << line
    end
  end
  protein_entries
end

#parse_entry_line(line) ⇒ `Object`

returns an array with the beginning part of the entry line (without the abundance (which begins with a ‘#’) and an array of abundances (all the numbers, returned as Floats, after the final ‘#’)

# File 'lib/ms_abundance_sim.rb', line 158

def parse_entry_line(line)
  octothorpe_index = line.rindex("#")
  entry_line_wo_abundance = line[0...octothorpe_index]
  abundance_str = line[(octothorpe_index+1)..-1]
  abundances = abundance_str.split(",").map(&:to_f).sort
  [entry_line_wo_abundance, abundances]
end

#process_file(filename, opts) ⇒ `Object`

# File 'lib/ms_abundance_sim.rb', line 109

def process_file(filename, opts)
  opts = DEFAULTS.merge(opts)

  basename = filename.chomp(File.extname(filename))

  protein_entries = get_protein_entries(filename)
  max_abundance = protein_entries.max_by {|entry| entry.abundances.last }.abundances.last

  # generate which proteins will be differentially expressed
  num_proteins_to_diff_express = (protein_entries.size * opts[:pct_diff_express]/100.0).to_i
  diff_expressed_ids = (0...protein_entries.size).to_a.sample(num_proteins_to_diff_express).to_set

  output = Hash.new {|hash, key| hash[key] = [] }
  case_and_controls_needed = ([:case] * opts[:num_case]) + ([:control] * opts[:num_control])
  case_and_controls_needed.each_with_index do |sample_type, sample_number| # for each sample
    outfilename = "#{basename}_#{sample_number}_#{sample_type}"
    output[sample_type] << outfilename
    File.open(outfilename, 'w') do |outfile|
      protein_entries.each_with_index do |protein_entry, idx|
        # put first line of fasta with simulated abundance
        variance = opts[
          if sample_type == :case && diff_expressed_ids.include?(idx)
            :case_variance
          else
            :control_variance
          end
        ]

        fold_change = MSAbundanceSim.get_fold_change(protein_entry.abundances, variance, max_abundance)

        downshift_params = %w(min_threshold probability_threshold)
          .map {|key| 'downshift_' + key }
          .map(&:to_sym)
          .map {|key| opts[key] }

        downshifted_fold_change = MSAbundanceSim.downshift(fold_change, *downshift_params)

        sample_abundance = MSAbundanceSim.sample_abundance(protein_entry.abundances, downshifted_fold_change)
        outfile.puts [protein_entry.entry_line_wo_abundance, sample_abundance].join(opts[:output_abundance_separator])
        outfile.puts protein_entry.additional_lines.join("\n")
      end
    end
  end
  output
end

Class: MSAbundanceSim

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.downshift(value, min_threshold, probability_threshold, random = rand()) ⇒ Object

.get_fold_change(a_i, event_rate, max_abundance) ⇒ Object

.inverse_transform_sample(event_rate) ⇒ Object

.poisson(event_rate, num_occurences) ⇒ Object

.process_files(filenames, opts) ⇒ Object

.sample_abundance(abundances, fold_change) ⇒ Object