Module: LibSSW

Defined in:
lib/libssw.rb,
lib/libssw/ffi.rb,
lib/libssw/align.rb,
lib/libssw/profile.rb,
lib/libssw/version.rb,
lib/libssw/BLOSUM50.rb,
lib/libssw/BLOSUM62.rb

Defined Under Namespace

Modules: FFI Classes: Align, Error, Profile

Constant Summary collapse

AAELEMENTS =
['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G',
'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S',
'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']
AA2INT =
{ 'A' => 0,  'a' => 0,
'R' => 1,  'r' => 1,
'N' => 2,  'n' => 2,
'D' => 3,  'd' => 3,
'C' => 4,  'c' => 4,
'Q' => 5,  'q' => 5,
'E' => 6,  'e' => 6,
'G' => 7,  'g' => 7,
'H' => 8,  'h' => 8,
'I' => 9,  'i' => 9,
'L' => 10, 'l' => 10,
'K' => 11, 'k' => 11,
'M' => 12, 'm' => 12,
'F' => 13, 'f' => 13,
'P' => 14, 'p' => 14,
'S' => 15, 's' => 15,
'T' => 16, 't' => 16,
'W' => 17, 'w' => 17,
'Y' => 18, 'y' => 18,
'V' => 19, 'v' => 19,
'B' => 20, 'b' => 20,
'Z' => 21, 'z' => 21,
'X' => 22, 'x' => 22,
'*' => 23 }
INT2AA =
{ 0 => 'A', 1 => 'R', 2 => 'N', 3 => 'D',
4 => 'C', 5 => 'Q', 6 => 'E', 7 => 'G',
8 => 'H', 9 => 'I', 10 => 'L', 11 => 'K',
12 => 'M', 13 => 'F', 14 => 'P', 15 => 'S',
16 => 'T', 17 => 'W', 18 => 'Y', 19 => 'V',
20 => 'B', 21 => 'Z', 22 => 'X', 23 => '*' }
DNAElements =
%w[A C G T N]
DNA2INT =
{ 'A' => 0, 'a' => 0,
'C' => 1, 'c' => 1,
'G' => 2, 'g' => 2,
'T' => 3, 't' => 3,
'N' => 4, 'n' => 4 }
INT2DNA =
{ 0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T', 4 => 'N' }
DNARC =

reverse complement

{ 'A' => 'T',
'C' => 'G',
'G' => 'C',
'T' => 'A',
'N' => 'N',
'a' => 'T',
'c' => 'G',
'g' => 'C',
't' => 'A',
'n' => 'N' }
VERSION =
'0.0.2'
BLOSUM50 =
[
  #  A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V   B   Z   X   *
 5, -2, -1, -2, -1, -1, -1,  0, -2, -1, -2, -1, -1, -3, -1,  1,  0, -3, -2,  0, -2, -1, -1, -5, # A
-2,  7, -1, -2, -4,  1,  0, -3,  0, -4, -3,  3, -2, -3, -3, -1, -1, -3, -1, -3, -1,  0, -1, -5, # R
-1, -1,  7,  2, -2,  0,  0,  0,  1, -3, -4,  0, -2, -4, -2,  1,  0, -4, -2, -3,  5,  0, -1, -5, # N
-2, -2,  2,  8, -4,  0,  2, -1, -1, -4, -4, -1, -4, -5, -1,  0, -1, -5, -3, -4,  6,  1, -1, -5, # D
-1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -3, -1, -5, # C
-1,  1,  0,  0, -3,  7,  2, -2,  1, -3, -2,  2,  0, -4, -1,  0, -1, -1, -1, -3,  0,  4, -1, -5, # Q
-1,  0,  0,  2, -3,  2,  6, -3,  0, -4, -3,  1, -2, -3, -1, -1, -1, -3, -2, -3,  1,  5, -1, -5, # E
 0, -3,  0, -1, -3, -2, -3,  8, -2, -4, -4, -2, -3, -4, -2,  0, -2, -3, -3, -4, -1, -2, -1, -5, # G
-2,  0,  1, -1, -3,  1,  0, -2, 10, -4, -3,  0, -1, -1, -2, -1, -2, -3,  2, -4,  0,  0, -1, -5, # H
-1, -4, -3, -4, -2, -3, -4, -4, -4,  5,  2, -3,  2,  0, -3, -3, -1, -3, -1,  4, -4, -3, -1, -5, # I
-2, -3, -4, -4, -2, -2, -3, -4, -3,  2,  5, -3,  3,  1, -4, -3, -1, -2, -1,  1, -4, -3, -1, -5, # L
-1,  3,  0, -1, -3,  2,  1, -2,  0, -3, -3,  6, -2, -4, -1,  0, -1, -3, -2, -3,  0,  1, -1, -5, # K
-1, -2, -2, -4, -2,  0, -2, -3, -1,  2,  3, -2,  7,  0, -3, -2, -1, -1,  0,  1, -3, -1, -1, -5, # M
-3, -3, -4, -5, -2, -4, -3, -4, -1,  0,  1, -4,  0,  8, -4, -3, -2,  1,  4, -1, -4, -4, -1, -5, # F
-1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3, -2, -1, -1, -5, # P
 1, -1,  1,  0, -1,  0, -1,  0, -1, -3, -3,  0, -2, -3, -1,  5,  2, -4, -2, -2,  0,  0, -1, -5, # S
 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  2,  5, -3, -2,  0,  0, -1, -1, -5, # T
-3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1,  1, -4, -4, -3, 15,  2, -3, -5, -2, -1, -5, # W
-2, -1, -2, -3, -3, -1, -2, -3,  2, -1, -1, -2,  0,  4, -3, -2, -2,  2,  8, -1, -3, -2, -1, -5, # Y
 0, -3, -3, -4, -1, -3, -3, -4, -4,  4,  1, -3,  1, -1, -3, -2,  0, -3, -1,  5, -3, -3, -1, -5, # V
-2, -1,  5,  6, -3,  0,  1, -1,  0, -4, -4,  0, -3, -4, -2,  0,  0, -5, -3, -3,  6,  1, -1, -5, # B
-1,  0,  0,  1, -3,  4,  5, -2,  0, -3, -3,  1, -1, -4, -1,  0, -1, -2, -2, -3,  1,  5, -1, -5, # Z
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, # X
-5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,  1  # *
]
BLOSUM62 =
[
  #  A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V   B   Z   X   *
 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0, -2, -1,  0, -4, # A
-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3, -1,  0, -1, -4, # R
-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3,  3,  0, -1, -4, # N
-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3,  4,  1, -1, -4, # D
 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4, # C
-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2,  0,  3, -1, -4, # Q
-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4, # E
 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3, -1, -2, -1, -4, # G
-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3,  0,  0, -1, -4, # H
-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3, -3, -3, -1, -4, # I
-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1, -4, -3, -1, -4, # L
-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2,  0,  1, -1, -4, # K
-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1, -3, -1, -1, -4, # M
-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1, -3, -3, -1, -4, # F
-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2, -2, -1, -2, -4, # P
 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2,  0,  0,  0, -4, # S
 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0, -1, -1,  0, -4, # T
-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3, -4, -3, -2, -4, # W
-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1, -3, -2, -1, -4, # Y
 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4, -3, -2, -1, -4, # V
-2, -1,  3,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3, -3, -2,  0, -1, -4, -3, -3,  4,  1, -1, -4, # B
-1,  0,  0,  1, -3,  3,  4, -2,  0, -3, -3,  1, -1, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4, # Z
 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2,  0,  0, -2, -1, -1, -1, -1, -1, -4, # X
-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1, # *
]

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.ffi_libObject

Returns the value of attribute ffi_lib.



11
12
13
# File 'lib/libssw.rb', line 11

def ffi_lib
  @ffi_lib
end

Class Method Details

.aaseq_to_int_array(seq) ⇒ Object

Raises:

  • (ArgumentError)


285
286
287
288
289
290
291
# File 'lib/libssw.rb', line 285

def aaseq_to_int_array(seq)
  raise ArgumentError, 'seq must be a string' unless seq.is_a? String

  seq.each_char.map do |base|
    AA2INT[base] || AA2INT['*']
  end
end

.align_destroy(align) ⇒ Object

Release the memory allocated by function ssw_align.

Parameters:



201
202
203
# File 'lib/libssw.rb', line 201

def align_destroy(align)
  FFI.align_destroy(align)
end

.array_to_cigar_string(arr) ⇒ Object



234
235
236
237
238
239
240
241
242
243
# File 'lib/libssw.rb', line 234

def array_to_cigar_string(arr)
  cigar_string = String.new
  arr.each do |x|
    n = x >> 4
    m = x & 15
    c = m > 8 ? 'M' : 'MIDNSHP=X'[m]
    cigar_string << n.to_s << c
  end
  cigar_string
end

.create_scoring_matrix(elements, match_score, mismatch_score) ⇒ Object

Create scoring matrix of Smith-Waterman algrithum.

Parameters:

  • elements (Array)
  • match_score (Integer)
  • mismatch_score (Integer)


249
250
251
252
253
254
255
256
257
258
259
# File 'lib/libssw.rb', line 249

def create_scoring_matrix(elements, match_score, mismatch_score)
  size = elements.size
  score = Array.new(size * size, 0)
  (size - 1).times do |i|
    (size - 1).times do |j|
      score[i * size + j] = \
        (elements[i] == elements[j] ? match_score : mismatch_score)
    end
  end
  score
end

.dna_complement(seq) ⇒ Object



270
271
272
273
274
# File 'lib/libssw.rb', line 270

def dna_complement(seq)
  seq.each_char.map do |base|
    DNARC[base]
  end.join.reverse
end

.dna_to_int_array(seq) ⇒ Object

Parameters:

  • seq (String)

Raises:

  • (ArgumentError)


262
263
264
265
266
267
268
# File 'lib/libssw.rb', line 262

def dna_to_int_array(seq)
  raise ArgumentError, 'seq must be a string' unless seq.is_a? String

  seq.each_char.map do |base|
    DNA2INT[base] || DNA2INT['N']
  end
end

.init_destroy(profile) ⇒ Object

Note:

Ruby has garbage collection, so there is not much reason to call this method.

Release the memory allocated by function ssw_init.

Parameters:



137
138
139
# File 'lib/libssw.rb', line 137

def init_destroy(profile)
  FFI.init_destroy(profile)
end

.int_array_to_aaseq(arr) ⇒ Object

Raises:

  • (ArgumentError)


293
294
295
296
297
298
299
# File 'lib/libssw.rb', line 293

def int_array_to_aaseq(arr)
  raise ArgumentError, 'arr must be an Array' unless arr.is_a? Array

  arr.map do |i|
    INT2AA[i] || '*'
  end.join
end

.int_array_to_dna(arr) ⇒ Object

Parameters:

  • int (Array)

    array

Raises:

  • (ArgumentError)


277
278
279
280
281
282
283
# File 'lib/libssw.rb', line 277

def int_array_to_dna(arr)
  raise ArgumentError, 'arr must be an Array' unless arr.is_a? Array

  arr.map do |i|
    INT2DNA[i] || 'N'
  end.join
end

.mark_mismatch(ref_begin1, read_begin1, read_end1, ref, read, read_len, cigar, cigar_len) ⇒ Integer

Note:

This method takes a Fiddle::Pointer as an argument. Please read the source code and understand it well before using this method. (Needs to be improved)

  1. Calculate the number of mismatches.

  2. Modify the cigar string:

differentiate matches (=), mismatches(X), and softclip(S).

Parameters:

  • ref_begin1 (Integer)

    0-based best alignment beginning position on the reference sequence

  • read_begin1 (Integer)

    0-based best alignment beginning position on the read sequence

  • read_end1 (Integer)

    0-based best alignment ending position on the read sequence

  • ref (Array)

    reference sequence

  • read (Array)

    read sequence

  • read_len (Integer)

    length of the read

  • cigar (Fiddle::Pointer)

    best alignment cigar; stored the same as that in BAM format, high 28 bits: length, low 4 bits: M/I/D (0/1/2)

  • cigar_len (Integer)

    length of the cigar string

Returns:

  • (Integer)

    The number of mismatches. The cigar and cigarLen are modified.



227
228
229
230
231
232
# File 'lib/libssw.rb', line 227

def mark_mismatch(ref_begin1, read_begin1, read_end1, ref, read, read_len, cigar, cigar_len)
  warn 'implementation: fiexme: **cigar' # FIXME
  FFI.mark_mismatch(
    ref_begin1, read_begin1, read_end1, ref.pack('c*'), read.pack('c*'), read_len, cigar, cigar_len.pack('l*')
  )
end

.ssw_align(prof, ref, weight_gap0, weight_gapE, flag, filters, filterd, mask_len) ⇒ Object

Do Striped Smith-Waterman alignment.

Parameters:

  • prof (Fiddle::Pointer, LibSSW::Profile, LibSSW::FFI::Profile)

    pointer to the query profile structure

  • ref (Array)

    target sequence; the target sequence needs to be numbers and corresponding to the mat parameter of function ssw_init

  • weight_gap0 (Integer)

    the absolute value of gap open penalty

  • weight_gapE (Integer)

    the absolute value of gap extension penalty

  • flag (Integer)
    • bit 5: when setted as 1, function ssw_align will return the best alignment beginning position;

    • bit 6: when setted as 1, if (ref_end1 - ref_begin1 < filterd && read_end1 - read_begin1 < filterd), (whatever bit 5 is setted) the function will return the best alignment beginning position and cigar;

    • bit 7: when setted as 1, if the best alignment score >= filters, (whatever bit 5 is setted) the function will return the best alignment beginning position and cigar;

    • bit 8: when setted as 1, (whatever bit 5, 6 or 7 is setted) the function will always return the best alignment beginning position and cigar. When flag == 0, only the optimal and sub-optimal scores and the optimal alignment ending position will be returned.

  • filters (Integer)

    scorefilter: when bit 7 of flag is setted as 1 and bit 8 is setted as 0, filters will be used (Please check the decription of the flag parameter for detailed usage.)

  • filterd (Integer)

    distance filter: when bit 6 of flag is setted as 1 and bit 8 is setted as 0, filterd will be used (Please check the decription of the flag parameter for detailed usage.)

  • mask_len (Integer)

    The distance between the optimal and suboptimal alignment ending position >= maskLen. We suggest to use readLen/2, if you don’t have special concerns. Note: maskLen has to be >= 15, otherwise this function will NOT return the suboptimal alignment information. Detailed description of maskLen: After locating the optimal alignment ending position, the suboptimal alignment score can be heuristically found by checking the second largest score in the array that contains the maximal score of each column of the SW matrix. In order to avoid picking the scores that belong to the alignments sharing the partial best alignment, SSW C library masks the reference loci nearby (mask length = maskLen) the best alignment ending position and locates the second largest score from the unmasked elements.



184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/libssw.rb', line 184

def ssw_align(prof, ref, weight_gap0, weight_gapE, flag, filters, filterd, mask_len)
  ref_str = ref.pack('c*')
  ref_len = ref.size
  ptr = FFI.ssw_align(
    prof, ref_str, ref_len, weight_gap0, weight_gapE, flag, filters, filterd, mask_len
  )
  # Not sure yet if we should set the instance variable to the pointer as a
  # garbage collection workaround.
  # For example: instance_variable_set(:@ref_str, ref_str)
  #
  # ptr.free = FFI.instance_variable_get(:@func_map)['align_destroy']
  LibSSW::Align.new(ptr)
end

.ssw_init(read, mat, n = nil, score_size: 2) ⇒ Object

Create the query profile using the query sequence.

Parameters:

  • read (Array)

    query sequence; the query sequence needs to be numbers

  • mat (Array)

    substitution matrix; mat needs to be corresponding to the read sequence

  • n (Integer) (defaults to: nil)

    the square root of the number of elements in mat (mat has n*n elements) If you omit this argument, the square root of the size of mat will be set.

  • score_size (Integer) (defaults to: 2)

    estimated Smith-Waterman score;

    • if your estimated best alignment score is surely < 255 please set 0;

    • if your estimated best alignment score >= 255, please set 1;

    • if you don’t know, please set 2



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/libssw.rb', line 102

def ssw_init(read, mat, n = nil, score_size: 2)
  read_str = read.pack('c*')
  read_len = read.size
  mat = mat.to_a.flatten
  n = Math.sqrt(mat.size) if n.nil?
  raise "Not a square matrix. size: #{mat.size}, n: #{n}" if mat.size != n * n

  mat_str = mat.flatten.pack('c*')
  ptr = FFI.ssw_init(
    read_str,
    read_len,
    mat_str,
    n,
    score_size
  )
  # Garbage collection workaround
  #
  # * The following code will cause a segmentation violation when manually
  #   releasing memory. The reason is unknown.
  # * func_map is only available in newer versions of fiddle.
  # ptr.free = FFI.instance_variable_get(:@func_map)['init_destroy']
  ptr.instance_variable_set(:@read_str,   read_str)
  ptr.instance_variable_set(:@read_len,   read_len)
  ptr.instance_variable_set(:@mat_str,    mat_str)
  ptr.instance_variable_set(:@n,          n)
  ptr.instance_variable_set(:@score_size, score_size)

  LibSSW::Profile.new(ptr)
end