Class: Bio::DB::PileupIterator

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/bio/db/pileup_iterator.rb

Defined Under Namespace

Classes: PileupRead

Instance Method Summary collapse

Constructor Details

#initialize(io) ⇒ PileupIterator

Returns a new instance of PileupIterator.



15
16
17
# File 'lib/bio/db/pileup_iterator.rb', line 15

def initialize(io)
  @io = io
end

Instance Method Details

#eachObject

Iterates through the positions of the a pileup, returning an instance of Bio::DB::Pileup complete with an instance variable @reads, an Array of Bio::DB::PileupRead objects.

Known problems:

  • Doesn’t record start or ends of each read

  • Doesn’t lookahead to determine the sequence of each read (though it does give the preceding bases)

  • Gives no information with mismatches



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/bio/db/pileup_iterator.rb', line 25

def each
  current_ordered_reads = []
  log = Bio::Log::LoggerPlus['bio-pileup_iterator']
  
  @io.each_line do |line|
    #log.debug "new current_line: #{line.inspect}"
    pileup = Bio::DB::Pileup.new(line.strip)
    current_read_index = 0
    reads_ending = []

    bases = pileup.read_bases
    #log.debug "new column's read_bases: #{bases.inspect}"
    #log.debug "pileup entry parsed: #{pileup.inspect}"
    while bases.length > 0
      #log.debug "bases remaining: #{bases}    ------------------------"
      
      # Firstly, what is the current read we are working with
      current_read = current_ordered_reads[current_read_index]
      # if adding a new read
      if current_read.nil?
        #log.debug 'adding a new read'
        current_read = PileupRead.new
        current_ordered_reads.push current_read
      else
        #log.debug 'reusing a read'
      end
      matches = nil

      # Now, parse what the current read is
      if matches = bases.match(/^([ACGTNacgtn\.\,])([\+\-])([0-9]+)([ACGTNacgtn]+)(\${0,1})/)
        #log.debug "matched #{matches.to_s} as insertion/deletion"
        
        # match again to better match the number of inserted bases
        num_inserted = matches[3].to_i
        matches = bases.match(/^([ACGTNacgtn\.\,])([\+\-])([0-9]+)([ACGTNacgtn]{#{num_inserted}})(\${0,1})/)
        raise unless matches
        
        # insertion / deletion
        if matches[1] == '.'
          raise if !current_read.direction.nil? and current_read.direction != PileupRead::FORWARD_DIRECTION
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        elsif matches[1] == ','
          raise if !current_read.direction.nil? and current_read.direction != PileupRead::REVERSE_DIRECTION
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        else
          # Could sanity check the direction here by detecting case, but eh
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        end
        
        # record the insertion
        if matches[2] == '+'
          current_read.add_insertion pileup.pos, matches[3], matches[4]
        end
        
        if matches[5].length > 0
          #log.debug "Ending this read"
          # end this read
          reads_ending.push current_read_index
        end
      # currently I don't care about indels, except for the direction, so I'll leave it at that for now

      # end of the read
      elsif matches = bases.match(/^([\.\,])\$/)
        #log.debug "matched #{matches.to_s} as end of read"
        # regular match in some direction, end of read
        if matches[1]=='.' # if forwards
          raise if current_read.direction and current_read.direction != PileupRead::FORWARD_DIRECTION
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        else # else must be backwards, since it can only be , or .
          raise if current_read.direction and current_read.direction != PileupRead::REVERSE_DIRECTION
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        end
        #log.debug "current read after deletion: #{current_read.inspect}"
        reads_ending.push current_read_index
        
      # regular match continuuing onwards
      elsif matches = bases.match(/^\./)
        #log.debug "matched #{matches.to_s} as forward regular match"
        # regular match in the forward direction
        raise if !current_read.direction.nil? and current_read.direction != PileupRead::FORWARD_DIRECTION
        current_read.direction = PileupRead::FORWARD_DIRECTION
        #log.debug "before adding this base, current sequence is '#{current_read.sequence}'"
        current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        #log.debug "after adding this base, current sequence is '#{current_read.sequence}', ref_base: #{pileup.ref_base}"
      elsif matches = bases.match(/^\,/)
        #log.debug "matched #{matches.to_s} as reverse regular match"
        # regular match in the reverse direction
        if !current_read.direction.nil? and current_read.direction != PileupRead::REVERSE_DIRECTION
          error_msg = "Unexpectedly found read a #{current_read.direction} direction read when expecting a positive direction one. This suggests there is a problem with either the pileup file or this pileup parser. Current pileup column #{pileup.inspect}, read #{current_read.inspect}, chomped until #{bases}"
          log.error error_msg
          raise Exception, error_msg
        end
        current_read.direction = PileupRead::REVERSE_DIRECTION
        current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
      
      # starting a new read (possibly with a gap), with an accompanying insertion/deletion
      elsif matches = bases.match(/^\^.([ACGTNacgtn\.\,\*])([\+\-])([0-9]+)([ACGTNacgtn]+)(\${0,1})/)
        if matches[1] == '.'
          #log.debug 'forward match starting a read'
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        elsif matches[1] == ','
          #log.debug 'reverse match starting a read'
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        elsif matches[1] == '*'
          #log.debug 'starting a read with a gap'
          # leave direction unknown at this point
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        elsif matches[1] == matches[1].upcase
          #log.debug 'forward match starting a read, warning of insertion next'
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        else
          #log.debug 'forward match starting a read, warning of insertion next'
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        end
        
        # record the insertion
        if matches[2] == '+'
          current_read.add_insertion pileup.pos, matches[3], matches[4]
        end
        
        if matches[5].length > 0
          #log.debug "Ending this read"
          # end this read
          reads_ending.push current_read_index
        end

        
      # regular match, starting a new read
      elsif matches = bases.match(/^\^.([ACGTNacgtn\.\,\*])(\${0,1})/)
        if matches[1] == '.'
          #log.debug 'forward match starting a read'
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        elsif matches[1] == ','
          #log.debug 'reverse match starting a read'
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{pileup.ref_base}"
        elsif matches[1] == '*'
          #log.debug 'gap starting a read'
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        elsif matches[1] == matches[1].upcase
          #log.debug 'forward match starting a read, warning of insertion next'
          current_read.direction = PileupRead::FORWARD_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        else
          #log.debug 'forward match starting a read, warning of insertion next'
          current_read.direction = PileupRead::REVERSE_DIRECTION
          current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        end
        if matches[2].length > 0
          #log.debug "Ending this read, even though it started here too.. it happens.."
          # end this read
          reads_ending.push current_read_index
        end

        
      elsif matches = bases.match(/^\*([\+\-])([0-9]+)([ACGTNacgtn=]+)(\${0,1})/)
        #log.debug 'gap then insert/delete found'
        # gap - should already be known from the last position
        current_read.sequence = "#{current_read.sequence}*"
        if matches[4].length > 0
          #log.debug "Ending this read"
          # end this read
          reads_ending.push current_read_index
        end
                  
        # record the insertion
        if matches[1] == '+'
          current_read.add_insertion pileup.pos, matches[2], matches[3]
        end

      elsif matches = bases.match(/(^[ACGTNacgtn\*])(\${0,1})/)
        #log.debug 'mismatch found (or deletion)'
        # simple mismatch
        current_read.sequence = "#{current_read.sequence}#{matches[1]}"
        if matches[2].length > 0
          #log.debug "Ending this read"
          reads_ending.push current_read_index
        end
      end
      #log.debug "current read's sequence: #{current_read.sequence}"
      
      #raise Exception, "implement mismatch parsing here!!!"
      raise Exception, "Unexpected Pileup format bases, starting here: #{bases}, from #{pileup.inspect}" if matches.nil?
      
      #remove the matched part from the base string for next time
      bases = bases[matches.to_s.length..bases.length-1]

      current_read_index += 1
    end

    # Create a new copy of the array and yield that, otherwise when things get deleted they get removed from the yielded array as well (which is unwanted)
    yielded_array = Array.new(current_ordered_reads)
    pileup.reads = yielded_array
    #log.debug "Number of reads yielded: #{pileup.reads.length}"
    yield pileup
    
    # Remove reads that ended. In reverse order since removing the last ones first doesn't mess with the indices beforehand in the array
    reads_ending.reverse.each do |i|
      #log.debug "Deleting read of index #{i} (total reads #{current_ordered_reads.length}): #{current_ordered_reads[i].inspect}"
      current_ordered_reads.delete_at i
    end
    #log.debug "Ended up with #{current_ordered_reads.length} reads that should be present next time"
  end
end