Class: BioDSL::Assemble

Inherits:
Object
  • Object
show all
Extended by:
Ambiguity
Defined in:
lib/BioDSL/seq/assemble.rb

Overview

Class with methods for assembling pair-end reads.

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Ambiguity

add_ambiguity_macro

Constructor Details

#initialize(entry1, entry2, options) ⇒ Assemble

Method to initialize an Assembly object.



48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/BioDSL/seq/assemble.rb', line 48

def initialize(entry1, entry2, options)
  @entry1  = entry1
  @entry2  = entry2
  @overlap = 0
  @offset1 = 0
  @offset2 = 0
  @options = options
  @options[:mismatches_max] ||= 0
  @options[:overlap_min]    ||= 1

  check_options
end

Class Method Details

.pair(entry1, entry2, options = {}) ⇒ Object

Class method to assemble two Seq objects.



42
43
44
45
# File 'lib/BioDSL/seq/assemble.rb', line 42

def self.pair(entry1, entry2, options = {})
  assemble = new(entry1, entry2, options)
  assemble.match
end

Instance Method Details

#calc_diffFixnum

Calculate the diff between sequence lengths and return this.

Returns:

  • (Fixnum)

    Diff.



118
119
120
121
122
# File 'lib/BioDSL/seq/assemble.rb', line 118

def calc_diff
  diff = @entry1.length - @entry2.length
  diff = 0 if diff < 0
  diff
end

#calc_overlapObject

Calculate the overlap to be matched.



107
108
109
110
111
112
113
# File 'lib/BioDSL/seq/assemble.rb', line 107

def calc_overlap
  @overlap = if @options[:overlap_max]
               [@options[:overlap_max], @entry1.length, @entry2.length].min
             else
               [@entry1.length, @entry2.length].min
             end
end

#check_optionsObject

Check option values are sane.

Raises:



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/BioDSL/seq/assemble.rb', line 64

def check_options
  if @options[:mismatches_max] < 0
    fail AssembleError, "mismatches_max must be zero or greater - not: \
    #{@options[:mismatches_max]}"
  end

  if @options[:overlap_max] && @options[:overlap_max] <= 0
    fail AssembleError, "overlap_max must be one or greater - not: \
    #{@options[:overlap_max]}"
  end

  if @options[:overlap_min] <= 0
    fail AssembleError, "overlap_min must be one or greater - not: \
    #{@options[:overlap_min]}"
  end
end

#entry_leftBioDSL::Seq

Method to extract and downcase the left part of an assembled pair.

Returns:



127
128
129
130
131
# File 'lib/BioDSL/seq/assemble.rb', line 127

def entry_left
  entry = @entry1[0...@offset1]
  entry.seq.downcase!
  entry
end

#entry_overlapBioDSL::Seq

Method to extract and upcase the overlapping part of an assembled pair.

Returns:



150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/BioDSL/seq/assemble.rb', line 150

def entry_overlap
  if @entry1.qual && @entry2.qual
    entry_overlap1 = @entry1[@offset1...@offset1 + @overlap]
    entry_overlap2 = @entry2[@offset2...@offset2 + @overlap]

    entry = merge_overlap(entry_overlap1, entry_overlap2)
  else
    entry = @entry1[@offset1...@offset1 + @overlap]
  end

  entry.seq.upcase!
  entry
end

#entry_rightBioDSL::Seq

Method to extract and downcase the right part of an assembled pair.

Returns:



136
137
138
139
140
141
142
143
144
145
# File 'lib/BioDSL/seq/assemble.rb', line 136

def entry_right
  entry = if @entry1.length > @offset1 + @overlap
            @entry1[@offset1 + @overlap..-1]
          else
            @entry2[@offset2 + @overlap..-1]
          end

  entry.seq.downcase!
  entry
end

#matchObject

Method to locate overlapping matches between two sequences.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/BioDSL/seq/assemble.rb', line 82

def match
  calc_overlap
  diff = calc_diff

  @offset1 = @entry1.length - @overlap - diff

  while @overlap >= @options[:overlap_min]
    mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round

    if (mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2,
                             @overlap, mismatches_max)) && mismatches >= 0
      entry_merged          = entry_left + entry_overlap + entry_right
      entry_merged.seq_name = @entry1.seq_name +
        ":overlap=#{@overlap}:hamming=#{mismatches}" if @entry1.seq_name

      return entry_merged
    end

    diff > 0 ? diff -= 1 : @overlap -= 1

    @offset1 += 1
  end
end

#merge_overlap(entry_overlap1, entry_overlap2) ⇒ Object

Method to merge sequence and quality scores in an overlap. The residue with the highest score at mismatch positions is selected. The quality scores of the overlap are the mean of the two sequences.



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/BioDSL/seq/assemble.rb', line 167

def merge_overlap(entry_overlap1, entry_overlap2)
  na_seq = NArray.byte(entry_overlap1.length, 2)
  na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, 'byte')
  na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, 'byte')

  na_qual = NArray.byte(entry_overlap1.length, 2)
  na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, 'byte')
  na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, 'byte')

  mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
  mask_seq = ((na_qual * mask_xor).eq((na_qual * mask_xor).max(1)))

  merged      = Seq.new
  merged.seq  = (na_seq * mask_seq).max(1).to_s
  merged.qual = na_qual.mean(1).round.to_type('byte').to_s

  merged
end