Module: BioDSL::Kmer

Included in:
Seq
Defined in:
lib/BioDSL/seq/kmer.rb

Overview

Module containing methods for manipulating sequence kmers.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.to_oligos(kmers, kmer_size) ⇒ Object

Debug method to convert an array of binary encoded kmers to nucleotide oligos.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/BioDSL/seq/kmer.rb', line 36

# Debug helper: decodes binary encoded kmers back into nucleotide oligos.
#
# Each kmer is rendered as a zero-padded binary string of kmer_size * 2
# digits, which is then read two digits at a time and translated as
# 00 => a, 01 => t, 10 => c, 11 => g.
#
# @param kmers [Array<Integer>] binary encoded kmers.
# @param kmer_size [Integer] number of nucleotides per kmer.
#
# @raise [RuntimeError] if a two character chunk is not a known bit pair.
#
# @return [Array<String>] one lowercase oligo per kmer, in input order.
def self.to_oligos(kmers, kmer_size)
  decode = { '00' => 'a', '01' => 't', '10' => 'c', '11' => 'g' }

  kmers.map do |kmer|
    bits = format("%0#{kmer_size * 2}b", kmer)

    # Walk the bit string one nucleotide (two digits) at a time.
    bits.scan(/.{2}/).each_with_object('') do |pair, oligo|
      oligo << decode.fetch(pair) { fail "unknown m #{pair}" }
    end
  end
end

Instance Method Details

#to_kmers(options) ⇒ Object

Method that returns a sorted array of unique kmers, which are integer representations of DNA/RNA sequence oligos where A is encoded in two bits as 00, T as 01, U as 01, C as 10 and G as 11. Oligos with other nucleotides are ignored. The following options apply:

* kmer_size: kmer size in the range 1-12.
* step_size: step size in the range 1-12 (default=1).
* score_min: drop kmers with quality score below this.


67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/BioDSL/seq/kmer.rb', line 67

# Validates the kmer options and hands the sequence to the C helpers
# (to_kmers_qual_C when quality scores are present, to_kmers_C otherwise),
# returning whatever the helper returns. This method is documented as
# returning a sorted array of unique integer encoded kmers.
#
# The options hash is only read, never modified, so callers may pass a
# shared or frozen hash.
#
# @param options [Hash] kmer options.
# @option options [Integer] :kmer_size kmer size in the range 1-12 (required).
# @option options [Integer] :step_size step size in the range 1-12
#   (default: 1).
# @option options [Integer] :score_min minimum quality score (default:
#   Seq::SCORE_MAX). Only range checked and used when @qual is set.
#
# @raise [KmerError] if kmer_size is missing, if kmer_size or step_size is
#   outside 1..12, or if quality scores are present and score_min is outside
#   Seq::SCORE_MIN..Seq::SCORE_MAX.
#
# @return [Object] the result of the C helper call.
def to_kmers(options)
  # Resolve defaults into locals instead of writing them back with ||=,
  # which would leak the defaults into the caller's hash.
  kmer_size = options[:kmer_size]
  step_size = options[:step_size] || 1
  score_min = options[:score_min] || Seq::SCORE_MAX

  fail KmerError, 'No kmer_size' unless kmer_size

  unless (1..12).include?(kmer_size)
    fail KmerError, "Bad kmer_size: #{kmer_size}"
  end

  unless (1..12).include?(step_size)
    fail KmerError, "Bad step_size: #{step_size}"
  end

  if @qual && !(Seq::SCORE_MIN..Seq::SCORE_MAX).include?(score_min)
    fail KmerError, "score minimum: #{score_min} out of " \
                    "range #{Seq::SCORE_MIN}..#{Seq::SCORE_MAX}"
  end

  # Alphabet size to the power of kmer_size; presumably one slot per
  # possible kmer (Seq::DNA is defined elsewhere -- confirm).
  size = Seq::DNA.size**kmer_size

  # Reuse the buffer from a previous call when it has the right number of
  # elements, otherwise allocate a fresh one.
  if defined?(@kmer_ary) && (@kmer_ary.count == size)
    @kmer_ary.zero!
  else
    @kmer_ary = BioDSL::CAry.new(size, 1)
  end

  if @qual
    to_kmers_qual_C(@seq, @qual, @kmer_ary.ary, length, @kmer_ary.count,
                    kmer_size, step_size, score_min, Seq::SCORE_BASE)
  else
    to_kmers_C(@seq, @kmer_ary.ary, length, @kmer_ary.count,
               kmer_size, step_size)
  end
end