Module: Bio::MAF::MAFParsing

Included in:
DummyParser, ParseContext, Parser
Defined in:
lib/bio/maf/parser.rb

Overview

MAF parsing code useful for sequential and random-access parsing.

Constant Summary collapse

# Zero-width pattern matching at the start of any line whose first
# character is 'a' (used to locate the start of an alignment block).
BLOCK_START =
/^(?=a)/
# Like BLOCK_START, but additionally matches at the very end of the string.
BLOCK_START_OR_EOS =
/(?:^(?=a))|\z/
# Matches a newline or the very end of the string.
EOL_OR_EOF =
/\n|\z/
# True when RUBY_PLATFORM reports 'java' (presumably meaning JRuby).
JRUBY_P =
(RUBY_PLATFORM == 'java')
# Byte values of the line-type characters; parse_block_data compares
# them against the first byte of each line (String#getbyte(0)).
S =
's'.getbyte(0)
I =
'i'.getbyte(0)
E =
'e'.getbyte(0)
Q =
'q'.getbyte(0)
COMMENT =
'#'.getbyte(0)
# Maps the strand column of 's' and 'e' lines to a Symbol; looked up
# with Hash#fetch so that an unknown strand raises KeyError.
STRAND_SYM =
{
  '+' => :+,
  '-' => :-
}

Instance Method Summary collapse

Instance Method Details

#_parse_block ⇒ Block

Parse the block at the current position, joining fragments across chunk boundaries if necessary.


196
197
198
199
200
201
202
203
204
205
# File 'lib/bio/maf/parser.rb', line 196

# Parse the block at the current scanner position.
#
# When the scanner sits exactly at the last block start of the chunk,
# the block may continue into the next chunk, so it is handled by
# parse_trailing_fragment; every other position is parsed directly.
#
# @return [Block, nil] the parsed block, or nil once at_end is set
def _parse_block
  return nil if at_end
  # The last block start of a chunk marks the trailing block fragment.
  return parse_trailing_fragment if s.pos == last_block_pos

  parse_block_data
end

#gather_leading_fragment ⇒ Object

Read chunks and accumulate a leading fragment until we encounter a block start or EOF.


209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/bio/maf/parser.rb', line 209

# Read chunks and accumulate a leading fragment until we encounter a
# block start or EOF.
#
# Sets @at_end when the chunk reader returns no further chunk.
#
# @return [Array] three values: the accumulated leading fragment text,
#   a StringScanner over the last chunk that was read (nil if no chunk
#   could be read at all), and the chunk reader position recorded just
#   before the final read.
def gather_leading_fragment
  leading_frag = ''
  next_scanner = nil
  next_chunk_start = nil
  loop do
    next_chunk_start = cr.pos
    next_chunk = cr.read_chunk
    unless next_chunk
      # EOF
      @at_end = true
      break
    end
    next_scanner = StringScanner.new(next_chunk)
    # Only the text immediately preceding this chunk decides whether an
    # 'a' at its very beginning starts a new alignment block: that is
    # the fragment gathered so far or, if nothing has been gathered
    # yet, the current scanner buffer. Looking at the scanner buffer
    # once a fragment has been gathered would mistake an 'a' that
    # merely continues a split line for a block start.
    preceding = leading_frag.empty? ? s.string : leading_frag
    pat = preceding.end_with?("\n") ? BLOCK_START : /\n(?=a)/
    frag = next_scanner.scan_until(pat)
    if frag
      # got block start
      leading_frag << frag
      break
    end
    # no block start in this chunk; keep all of it and read on
    leading_frag << next_chunk
  end
  return leading_frag, next_scanner, next_chunk_start
end

#parse_block_data ⇒ Block

Parse a Block from the current position. Requires that #s and #chunk_start be set correctly.


298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# File 'lib/bio/maf/parser.rb', line 298

# Parse a Block from the current scanner position. Requires that #s and
# #chunk_start be set correctly.
#
# Consumes the 'a' line and its variables, then every line up to the
# next line starting with 'a' (or to the end of the buffer), leaving
# the scanner positioned there.
#
# @return [Block] built from the block variables, the parsed sequences,
#   the block offset (chunk_start plus scanner position), the number of
#   scanner positions consumed and a flag telling whether any sequence
#   was dropped by the sequence filter.
# @raise [ParseError] (via parse_error) on a missing 'a' line, on an
#   'i' or 'q' line that does not belong to the preceding sequence, and
#   on unexpected lines when opts[:strict] is set.
def parse_block_data
  block_start_pos = s.pos
  block_offset = chunk_start + block_start_pos
  s.scan(/^a\s*/) || parse_error("bad a line")
  block_vars = parse_maf_vars
  seqs = []
  payload = s.scan_until(/^(?=a)/)
  unless payload
    # no further block in this buffer: take everything that is left
    payload = s.rest
    s.terminate # jump to EOS
  end
  filtered = false
  # True while the most recent 's' line was rejected by the sequence
  # filter; the 'i' and 'q' lines that follow describe that dropped
  # sequence and must not be checked against (or attached to) the
  # previously kept one.
  seq_dropped = false
  payload.split("\n").each do |line|
    first = line.getbyte(0)
    if first == S
      seq = parse_seq_line(line, sequence_filter)
      if seq
        seqs << seq
        seq_dropped = false
      else
        filtered = true
        seq_dropped = true
      end
    elsif first == E && parse_empty
      e_seq = parse_empty_line(line, sequence_filter)
      if e_seq
        seqs << e_seq
        seq_dropped = false
      else
        filtered = true
      end
    elsif first == I && parse_extended
      next if seq_dropped
      parts = line.split
      last_seq = seqs.last
      # last_seq is nil when no sequence precedes this line; report
      # that as a parse error rather than failing on nil.
      parse_error("wrong i source #{parts[1]}!") unless last_seq && last_seq.source == parts[1]
      last_seq.i_data = parts.slice(2..6)
    elsif first == Q && parse_extended
      next if seq_dropped
      _, src, quality = line.split
      last_seq = seqs.last
      parse_error("wrong q source #{src}!") unless last_seq && last_seq.source == src
      last_seq.quality = quality
    elsif [I, E, Q, COMMENT, nil].include? first
      # line types that are not being parsed, comments and blank lines
      next
    else
      if opts[:strict]
        parse_error "unexpected line: '#{line}'"
      else
        LOG.warn "Ignoring invalid MAF line: '#{line}'"
      end
    end
  end
  b = Block.new(block_vars,
                seqs,
                block_offset,
                s.pos - block_start_pos,
                filtered)
  if opts[:retain_text]
    b.orig_text = s.string.slice(block_start_pos...(s.pos))
  end
  return b
end

#parse_empty_line(line, filter) ⇒ EmptySequence

Parse an 'e' line.


376
377
378
379
380
381
382
383
384
385
386
387
388
389
# File 'lib/bio/maf/parser.rb', line 376

# Parse an 'e' line.
#
# @param line [String] the whitespace-separated 'e' line
# @param filter [Hash, nil] sequence filter; when given, the source is
#   checked with seq_filter_ok?
# @return [EmptySequence, nil] the parsed record, or nil if the source
#   was rejected by the filter
# @raise [ParseError] (via parse_error) when building the record raises
#   KeyError, e.g. for a strand that is not in STRAND_SYM
def parse_empty_line(line, filter)
  fields = line.split
  src = fields[1]
  return nil if filter && !seq_filter_ok?(src, filter)

  begin
    strand_sym = STRAND_SYM.fetch(fields[4])
    EmptySequence.new(src, fields[2].to_i, fields[3].to_i, strand_sym, fields[5].to_i, fields[6])
  rescue KeyError
    parse_error "invalid empty sequence line: #{line}"
  end
end

#parse_error(msg) ⇒ Object

Raise a ParseError, indicating position within the MAF file and the chunk as well as the text surrounding the current scanner position.

Raises: ParseError


274
275
276
277
278
279
280
281
282
283
284
285
286
# File 'lib/bio/maf/parser.rb', line 274

# Raise a ParseError, indicating position within the MAF file and the
# chunk as well as the text surrounding the current scanner position.
#
# The message shows up to 10 characters on either side of the scanner
# position, separated by a '>><<' marker, followed by the position in
# the chunk, the position offset by chunk_start, and last_block_pos.
#
# @param msg [String] description of the problem
# @raise [ParseError] always
def parse_error(msg)
  pos = s.pos
  text = s.string
  s_start = [pos - 10, 0].max
  s_end = [pos + 10, text.length].min
  # Exclusive ranges keep both sides at no more than 10 characters and
  # yield '' at the buffer edges, so text before a position close to the
  # start of the buffer is still shown.
  left = text[s_start...pos]
  right = text[pos...s_end]
  extra = "pos #{pos} [#{chunk_start + pos}], last #{last_block_pos}"

  raise ParseError, "#{msg} at: '#{left}>><<#{right}' (#{extra})"
end

#parse_maf_vars ⇒ Hash

Parse key-value pairs from the MAF header or an 'a' line.


405
406
407
408
409
410
411
# File 'lib/bio/maf/parser.rb', line 405

# Parse key-value pairs from the MAF header or an 'a' line.
#
# Repeatedly consumes `key=value` followed by whitespace at the scanner
# position and stops at the first text that does not have that shape.
#
# @return [Hash{Symbol => String}] keys as symbols mapped to their raw
#   string values
def parse_maf_vars
  pairs = {}
  loop do
    break unless s.scan(/(\w+)=(\S*)\s+/)

    pairs[s[1].to_sym] = s[2]
  end
  pairs
end

#parse_seq_line(line, filter) ⇒ Sequence

Parse an 's' line.


359
360
361
362
363
364
365
366
367
368
369
370
371
372
# File 'lib/bio/maf/parser.rb', line 359

# Parse an 's' line.
#
# @param line [String] the whitespace-separated 's' line
# @param filter [Hash, nil] sequence filter; when given, the source is
#   checked with seq_filter_ok?
# @return [Sequence, nil] the parsed record, or nil if the source was
#   rejected by the filter
# @raise [ParseError] (via parse_error) when building the record raises
#   KeyError, e.g. for a strand that is not in STRAND_SYM
def parse_seq_line(line, filter)
  fields = line.split
  src = fields[1]
  return nil if filter && !seq_filter_ok?(src, filter)

  begin
    strand_sym = STRAND_SYM.fetch(fields[4])
    Sequence.new(src, fields[2].to_i, fields[3].to_i, strand_sym, fields[5].to_i, fields[6])
  rescue KeyError
    parse_error "invalid sequence line: #{line}"
  end
end

#parse_trailing_fragment ⇒ Block

Join the trailing fragment of the current chunk with the leading fragment of the next chunk and parse the resulting block.


248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# File 'lib/bio/maf/parser.rb', line 248

# Join the trailing fragment of the current chunk with the leading
# fragment of the next chunk and parse the resulting block.
#
# Afterwards @s and @chunk_start refer to the scanner and start
# position returned by gather_leading_fragment, and the last block
# position is recomputed unless @at_end is set.
#
# @return [Block] the block parsed from the joined text
# @raise [ParseError] (via parse_error) if the joined text cannot be
#   parsed; the message includes both fragments
def parse_trailing_fragment
  head_of_next, upcoming_scanner, upcoming_start = gather_leading_fragment
  # The remainder of the current chunk must be captured before @s is
  # replaced; the temporary scanner covers exactly the joined block, so
  # its chunk start is the position of the trailing fragment.
  tail_of_current = s.rest
  @chunk_start = chunk_start + s.pos
  @s = StringScanner.new(tail_of_current + head_of_next)
  parsed =
    begin
      parse_block_data
    rescue ParseError => pe
      parse_error "Could not parse joined fragments: #{pe}\nTRAILING: #{tail_of_current}\nLEADING: #{head_of_next}"
    end
  # Continue with the next chunk.
  @s = upcoming_scanner
  @chunk_start = upcoming_start
  set_last_block_pos! unless @at_end
  parsed
end

#seq_filter_ok?(src, filter) ⇒ Boolean

Indicates whether the given sequence source should be parsed, given the current sequence filters.


393
394
395
396
397
398
399
400
401
# File 'lib/bio/maf/parser.rb', line 393

# Indicates whether the given sequence source should be parsed, given
# the current sequence filters.
#
# @param src [String] sequence source; the part before the first '.' is
#   compared against the filter
# @param filter [Hash] filter options; only :only_species is consulted
# @return [Object] true when no :only_species filter is present,
#   otherwise the matching :only_species entry or nil if none matches
def seq_filter_ok?(src, filter)
  wanted = filter[:only_species]
  return true unless wanted

  species = src.split('.', 2).first
  wanted.find { |candidate| species == candidate }
end

#set_last_block_pos! ⇒ Object


176
177
178
# File 'lib/bio/maf/parser.rb', line 176

# Record in @last_block_pos the index of the last position in the
# scanner's buffer that matches BLOCK_START.
#
# @return [Integer, nil] the recorded index, or nil if the buffer
#   contains no match
def set_last_block_pos!
  buffer = s.string
  @last_block_pos = buffer.rindex(BLOCK_START)
end

#trailing_nl?(string) ⇒ Boolean

Does string have a trailing newline?


414
415
416
417
418
419
420
# File 'lib/bio/maf/parser.rb', line 414

# Does string have a trailing newline?
#
# @param string [String] the text to inspect
# @return [Boolean] true if the last character of string is a newline;
#   false otherwise, including for an empty string
def trailing_nl?(string)
  # Inspect the argument itself rather than the current scanner buffer.
  string.end_with?("\n")
end