Class: Bio::Fastq

Inherits:
Object show all
Defined in:
lib/bio/db/fastq.rb

Overview

Bio::Fastq is a parser for FASTQ format.

Defined Under Namespace

Classes: Error, FormatData

Constant Summary

FormatNames =

Available format names.

{
  "fastq-sanger"   => FormatData::FASTQ_SANGER,
  "fastq-solexa"   => FormatData::FASTQ_SOLEXA,
  "fastq-illumina" => FormatData::FASTQ_ILLUMINA
}.freeze
Formats =

Available format name symbols.

{
  :fastq_sanger   => FormatData::FASTQ_SANGER,
  :fastq_solexa   => FormatData::FASTQ_SOLEXA,
  :fastq_illumina => FormatData::FASTQ_ILLUMINA
}.freeze
DefaultFormatName =

Default format name

'fastq-sanger'.freeze
FLATFILE_SPLITTER =

Splitter for Bio::FlatFile

Bio::FlatFile::Splitter::LineOriented

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str = nil) ⇒ Fastq

Creates a new Fastq object from formatted text string.

The format of the quality scores should be specified later by using the format= method.


Arguments:

  • str: Formatted string (String)



383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# File 'lib/bio/db/fastq.rb', line 383

# Creates a new Fastq object from a formatted text string.
#
# The text is read line by line. Lines are first offered to
# add_header_line until it returns a false value, and then to add_line
# until it returns a false value. A rejected line is pushed back onto the
# scanner, and whatever text is left at the end is kept in @entry_overrun.
#
# When str is nil (or false) nothing is parsed and no state is set.
#
# ---
# *Arguments*:
# * _str_: Formatted string (String)
def initialize(str = nil)
  return unless str

  scanner = StringScanner.new(str)

  # Header lines come first, entry lines second; both phases share the
  # same "scan a line, hand it over, push it back on refusal" protocol.
  # NOTE(review): "." also matches "\r", so lines appear to be split only
  # at "\n" -- confirm whether CR-only input is expected here.
  [:add_header_line, :add_line].each do |consumer|
    until scanner.eos?
      chunk = scanner.scan(/.*(?:\n|\r|\r\n)?/)
      break unless chunk
      next if __send__(consumer, chunk)

      # The consumer refused the line: leave it for the next phase
      # (or for @entry_overrun).
      scanner.unscan
      break
    end
  end

  @entry_overrun = scanner.rest
end

Instance Attribute Details

#definitionObject (readonly)

definition; ID line (begins with @)



402
403
404
# File 'lib/bio/db/fastq.rb', line 402

# The definition (ID) line of the entry as stored in @definition.
# add_line stores it without the leading "@".
# Returns nil when no definition has been stored yet.
def definition
  defined?(@definition) ? @definition : nil
end

#entry_overrunObject (readonly)

entry_overrun: the text that remained unconsumed after this entry was parsed



373
374
375
# File 'lib/bio/db/fastq.rb', line 373

# The text stored in @entry_overrun, i.e. whatever the constructor left
# unconsumed after it stopped reading lines for this entry.
# Returns nil when nothing has been stored.
def entry_overrun
  defined?(@entry_overrun) ? @entry_overrun : nil
end

#headerObject (readonly)

misc lines before the entry (String or nil)



335
336
337
# File 'lib/bio/db/fastq.rb', line 335

# Miscellaneous lines collected before the entry (String), as accumulated
# in @header by add_header_line. Returns nil when no header was stored.
def header
  defined?(@header) ? @header : nil
end

#quality_stringObject (readonly)

quality as a string



405
406
407
# File 'lib/bio/db/fastq.rb', line 405

# The quality data as a raw String (the contents of @quality_string).
# Returns nil when no quality line has been stored.
def quality_string
  defined?(@quality_string) ? @quality_string : nil
end

#sequence_stringObject (readonly)

raw sequence data as a String object



408
409
410
# File 'lib/bio/db/fastq.rb', line 408

# The raw sequence data as a String (the contents of @sequence_string).
# Returns nil when no sequence line has been stored.
def sequence_string
  defined?(@sequence_string) ? @sequence_string : nil
end

Instance Method Details

#add_header_line(line) ⇒ Object

Adds a header line if the header data is not yet given and the given line is suitable for a header. Returns self if the header line was added successfully. Otherwise, returns false (the line is not added).



324
325
326
327
328
329
330
331
332
# File 'lib/bio/db/fastq.rb', line 324

# Appends the given line to the header text unless the line begins with
# "@". @header is initialized to an empty string on first use, even when
# the line is rejected.
#
# ---
# *Arguments*:
# * _line_: one line of input (String)
# *Returns*:: self when the line was appended, false when it was rejected
def add_header_line(line)
  @header ||= ""
  # A leading "@" is not header material, so refuse the line.
  return false if line[0, 1] == "@"

  @header << line
  self
end

#add_line(line) ⇒ Object

Adds a line to the entry if the given line is regarded as a part of the current entry.



339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
# File 'lib/bio/db/fastq.rb', line 339

# Adds a line to the entry if the line is regarded as part of the current
# entry. The line terminator is removed before the line is stored.
#
# Lines are interpreted according to what has been seen so far:
# 1. No definition yet: the line becomes the definition. A leading "@"
#    is stripped; when it is missing, the whole line is used and an
#    Error::No_atmark is recorded in @parse_errors.
# 2. No "+" line yet: a line starting with "+" is stored (without the
#    "+") as the second definition; any other line is appended to the
#    sequence.
# 3. After the "+" line: the line is appended to the quality string,
#    unless it starts with "@" while the quality string is already at
#    least as long as the sequence -- such a line is refused.
#
# ---
# *Arguments*:
# * _line_: one line of input (String)
# *Returns*:: self when the line was consumed, false when it was refused
def add_line(line)
  text = line.chomp

  # Stage 1: identifier line.
  unless defined? @definition
    if text[0, 1] == "@"
      @definition = text[1..-1]
    else
      @definition = text
      (@parse_errors ||= []).push Error::No_atmark.new
    end
    return self
  end

  # Stage 2: sequence lines, terminated by the "+" line.
  unless defined? @definition2
    @sequence_string ||= ''
    if text[0, 1] == '+'
      @definition2 = text[1..-1]
    else
      @sequence_string.concat text
    end
    return self
  end

  # Stage 3: quality lines. "@" is a valid quality character, so a leading
  # "@" is only refused once enough quality characters have been collected.
  @quality_string ||= ''
  if text[0, 1] == "@" && @quality_string.size >= @sequence_string.size
    return false
  end

  @quality_string.concat text
  self
end

#entry_idObject

Identifier of the entry. Normally, the first word of the ID line.



432
433
434
435
436
437
438
# File 'lib/bio/db/fastq.rb', line 432

# Identifier of the entry: the first whitespace-separated word of the
# definition line. When the definition contains no word at all (empty or
# only whitespace), the definition itself is returned.
#
# The value is computed once and cached in @entry_id.
def entry_id
  return @entry_id if defined? @entry_id

  first_word = @definition.strip.split(/\s+/).first
  @entry_id = first_word || @definition
end

#error_probabilitiesObject

Estimated probability of error for each base.


Returns

(Array containing Float) error probability values



515
516
517
518
519
520
521
522
# File 'lib/bio/db/fastq.rb', line 515

# Estimated probability of error for each base.
#
# When no format has been chosen yet, the class's DefaultFormatName is
# selected first. The quality scores are then converted by the format
# object's q2p method, and the result is cached in @error_probabilities.
#
# ---
# *Returns*:: the value returned by q2p (documented as an Array of Float)
def error_probabilities
  return @error_probabilities if defined? @error_probabilities

  self.format = self.class::DefaultFormatName unless self.format
  @error_probabilities = @format.q2p(quality_scores)
end

#formatObject

Format name. One of “fastq-sanger”, “fastq-solexa”, “fastq-illumina”, or nil (when not specified).


Returns

(String or nil) format name



483
484
485
# File 'lib/bio/db/fastq.rb', line 483

# Format name of the currently selected format.
#
# ---
# *Returns*:: the name reported by the format object in @format, or nil
#             when no format has been selected
def format
  return nil unless @format

  @format.name
end

#format=(name) ⇒ Object

Specify the format. If the format is not found, raises RuntimeError.

Available formats are:

"fastq-sanger" or :fastq_sanger
"fastq-solexa" or :fastq_solexa
"fastq-illumina" or :fastq_illumina

Arguments:

  • (required) name: format name (String or Symbol).

Returns

(String) format name



462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
# File 'lib/bio/db/fastq.rb', line 462

# Specifies the format. The name is looked up first in FormatNames and
# then in Formats; when neither knows it, RuntimeError is raised.
#
# Available formats are:
#   "fastq-sanger" or :fastq_sanger
#   "fastq-solexa" or :fastq_solexa
#   "fastq-illumina" or :fastq_illumina
#
# reset_state is called whenever the format is set successfully and also
# when nil (or false) is given. Giving nil does not itself assign @format.
#
# ---
# *Arguments*:
# * (required) _name_: format name (String or Symbol), or nil
# *Returns*:: the format name as reported by #format, or nil when nil
#             was given
def format=(name)
  unless name
    reset_state
    return nil
  end

  format_class = FormatNames[name] || Formats[name]
  # Unknown names are rejected before any state is touched.
  raise "unknown format" unless format_class

  reset_state
  @format = format_class.instance
  self.format
end

#mask(threshold, mask_char = 'n') ⇒ Object

Masks low quality sequence regions. For each sequence position, if the quality score is smaller than the threshold, the sequence in the position is replaced with mask_char.

Note: This method does not take quality_score_type into account.


Arguments:

  • (required) threshold : (Numeric) threshold

  • (optional) mask_char : (String) character used for masking

Returns

Bio::Sequence object



654
655
656
# File 'lib/bio/db/fastq.rb', line 654

# Masks low quality sequence regions by delegating to
# mask_with_quality_score on the object returned by to_biosequence.
#
# ---
# *Arguments*:
# * (required) _threshold_: (Numeric) threshold
# * (optional) _mask_char_: (String) character used for masking
# *Returns*:: whatever mask_with_quality_score returns (documented as a
#             Bio::Sequence object)
def mask(threshold, mask_char = 'n')
  biosequence = to_biosequence
  biosequence.mask_with_quality_score(threshold, mask_char)
end

#nalenObject

length of naseq



419
420
421
# File 'lib/bio/db/fastq.rb', line 419

# Length of the nucleic acid sequence returned by naseq.
def nalen
  nucleotides = naseq
  nucleotides.length
end

#naseqObject

returns Bio::Sequence::NA



411
412
413
414
415
416
# File 'lib/bio/db/fastq.rb', line 411

# Returns the sequence as a Bio::Sequence::NA object built from the raw
# sequence string. The object is created on first use and cached in
# @naseq, so later calls return the same object.
def naseq
  return @naseq if defined? @naseq

  @naseq = Bio::Sequence::NA.new(@sequence_string)
end

#quality_score_typeObject

The meaning of the quality scores. It may be one of :phred, :solexa, or nil.



490
491
492
493
# File 'lib/bio/db/fastq.rb', line 490

# The meaning of the quality scores, as reported by the format object
# (documented as :phred or :solexa).
#
# When no format has been chosen yet, the class's DefaultFormatName is
# selected first.
def quality_score_type
  self.format = self.class::DefaultFormatName unless self.format
  @format.quality_score_type
end

#quality_scoresObject Also known as: qualities

Quality score for each base. For “fastq-sanger” or “fastq-illumina”, it is PHRED score. For “fastq-solexa”, it is Solexa score.


Returns

(Array containing Integer) quality score values



501
502
503
504
505
506
507
508
# File 'lib/bio/db/fastq.rb', line 501

# Quality score for each base, obtained by passing the quality string to
# the format object's str2scores method.
#
# When no format has been chosen yet, the class's DefaultFormatName is
# selected first. The result is cached in @quality_scores.
#
# ---
# *Returns*:: the value returned by str2scores (documented as an Array of
#             Integer)
def quality_scores
  return @quality_scores if defined? @quality_scores

  self.format = self.class::DefaultFormatName unless self.format
  @quality_scores = @format.str2scores(@quality_string)
end

#seqObject

returns Bio::Sequence::Generic



424
425
426
427
428
429
# File 'lib/bio/db/fastq.rb', line 424

# Returns the sequence as a Bio::Sequence::Generic object built from the
# raw sequence string. The object is created on first use and cached in
# @seq, so later calls return the same object.
def seq
  return @seq if defined? @seq

  @seq = Bio::Sequence::Generic.new(@sequence_string)
end

#to_biosequenceObject

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this Fastq object might also be changed (but is not always changed), because data may be shared for efficiency.



639
640
641
# File 'lib/bio/db/fastq.rb', line 639

# Returns the sequence as a Bio::Sequence object, created by handing this
# object and the Fastq adapter module to Bio::Sequence.adapter.
def to_biosequence
  sequence_class = Bio::Sequence
  sequence_class.adapter(self, Bio::Sequence::Adapter::Fastq)
end

#validate_format(errors = nil) ⇒ Object

Format validation.

If an array is given as the argument, error objects are pushed to the array when errors are found. Currently, the following errors may be added to the array. (All errors are under the Bio::Fastq namespace, for example, Bio::Fastq::Error::Diff_ids).

  • Error::Diff_ids – the identifiers in the two lines are different
  • Error::Long_qual – length of quality is longer than the sequence
  • Error::Short_qual – length of quality is shorter than the sequence
  • Error::No_qual – no quality characters found
  • Error::No_seq – no sequence found
  • Error::Qual_char – invalid character in the quality
  • Error::Seq_char – invalid character in the sequence
  • Error::Qual_range – quality score value out of range
  • Error::No_ids – sequence identifier not found
  • Error::No_atmark – the first identifier does not begin with “@”
  • Error::Skipped_unformatted_lines – the parser skipped unformatted lines that could not be recognized as FASTQ format


Arguments:

  • (optional) errors: (Array or nil) an array for pushing error messages. The array should be empty.

Returns

true: no errors; false: one or more errors were found.



548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
# File 'lib/bio/db/fastq.rb', line 548

# Format validation.
#
# Runs the following checks in order and collects one error object per
# finding:
# * non-blank header text before the entry (Skipped_unformatted_lines)
# * errors recorded earlier in @parse_errors
# * missing identifier (No_ids), or a non-empty second identifier that
#   differs from the first (Diff_ids)
# * missing sequence (No_seq) / missing quality (No_qual)
# * quality shorter (Short_qual) or longer (Long_qual) than the sequence
# * invalid characters in the sequence (Seq_char, one per character,
#   carrying the character's offset)
# * invalid characters in the quality (Qual_char, one per character,
#   carrying the character's offset); the accepted range depends on the
#   selected format
#
# ---
# *Arguments*:
# * (optional) _errors_: (Array or nil) when given, the collected error
#   objects are appended to it
# *Returns*:: true when no error was found, false otherwise
def validate_format(errors = nil)
  found = []

  # Non-blank text ahead of the entry means something was skipped.
  if defined?(@header) && @header && !@header.strip.empty?
    found << Error::Skipped_unformatted_lines.new
  end

  # Carry over whatever was recorded while the entry was being read.
  found.concat(@parse_errors) if defined?(@parse_errors) && @parse_errors

  # The identifier must exist; a non-empty second identifier must repeat it.
  id_given = defined?(@definition) && @definition
  if !id_given
    found << Error::No_ids.new
  elsif defined?(@definition2) &&
        !@definition2.to_s.empty? &&
        @definition != @definition2
    found << Error::Diff_ids.new
  end

  # Presence of sequence and quality data.
  seq_given = (defined?(@sequence_string) && @sequence_string) ? true : false
  found << Error::No_seq.new unless seq_given

  qual_given = (defined?(@quality_string) && @quality_string) ? true : false
  found << Error::No_qual.new unless qual_given

  # Sequence and quality must have the same length.
  if seq_given && qual_given
    case @sequence_string.length <=> @quality_string.length
    when 1  then found << Error::Short_qual.new
    when -1 then found << Error::Long_qual.new
    end
  end

  # Sequence character check: report the offset of every offending character.
  if seq_given
    seq_scanner = StringScanner.new(@sequence_string)
    while seq_scanner.scan_until(/[ \x00-\x1f\x7f-\xff]/n)
      found << Error::Seq_char.new(seq_scanner.pos - seq_scanner.matched_size)
    end
  end

  # Quality character check: the set of invalid characters depends on the
  # selected format; without a format only blanks, control characters and
  # bytes from 0x7f upwards are rejected.
  if qual_given
    format_name = (defined?(@format) && @format) ? @format.name : nil
    invalid_qual =
      case format_name
      when 'fastq-sanger'   then /[^\x21-\x7e]/n
      when 'fastq-solexa'   then /[^\x3b-\x7e]/n
      when 'fastq-illumina' then /[^\x40-\x7e]/n
      else /[ \x00-\x1f\x7f-\xff]/n
      end
    qual_scanner = StringScanner.new(@quality_string)
    while qual_scanner.scan_until(invalid_qual)
      found << Error::Qual_char.new(qual_scanner.pos - qual_scanner.matched_size)
    end
  end

  # Hand the findings to the caller's array when one was supplied.
  errors.concat(found) if errors
  found.empty?
end