Class: Bio::GFFbrowser::Block::GffBlockParser

Inherits:
Object
  • Object
show all
Includes:
FastLineParser
Defined in:
lib/bio/db/gff/block/gffblockparser.rb

Overview

The block parser simplifies parsing by assuming the GFF3 file is organised into blocks of consecutive records that share the same seqid. All relevant information is resolved one block at a time.

Instance Method Summary collapse

Methods included from FastLineParser

#parse_attributes_fast, #parse_line_fast

Methods included from Helpers::Logger

#debug, #error, #info, #log_sys_info, #warn

Constructor Details

#initialize(filename, options) ⇒ GffBlockParser

Returns a new instance of GffBlockParser.



13
14
15
16
17
18
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 13

# Set up a block parser for a GFF3 file.
#
# @param filename [String] path handed to the GFF3 file iterator
# @param options [Hash] parser options; +parse+ reads +:fasta_filename+ from it
def initialize(filename, options)
  info "Starting block parser"
  @filename, @options = filename, options
  # The iterator is created once here and reused by +parse+
  @iter = Bio::GFF::GFF3::FileIterator.new(filename)
end

Instance Method Details

#each_CDS_seq ⇒ Object



86
87
88
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 86

# Yield the id and sequence of every record that +each_seq+ reports for
# the 'cds' feature type.
#
# @yield [id, seq] record id and its sequence
def each_CDS_seq
  each_seq('cds') do |id, seq|
    yield(id, seq)
  end
end

#each_exon_seq ⇒ Object



82
83
84
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 82

# Yield the id and sequence of every record that +each_seq+ reports for
# the 'exon' feature type.
#
# @yield [id, seq] record id and its sequence
def each_exon_seq
  each_seq('exon') do |id, seq|
    yield(id, seq)
  end
end

#each_gene_seq ⇒ Object



74
75
76
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 74

# Yield the id and sequence of every record that +each_seq+ reports for
# the 'gene' feature type.
#
# @yield [id, seq] record id and its sequence
def each_gene_seq
  each_seq('gene') do |id, seq|
    yield(id, seq)
  end
end

#each_mRNA_seq ⇒ Object



78
79
80
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 78

# Yield the id and sequence of every record that +each_seq+ reports for
# the 'mrna' feature type.
#
# @yield [id, seq] record id and its sequence
def each_mRNA_seq
  each_seq('mrna') do |id, seq|
    yield(id, seq)
  end
end

#each_seq(gfftype) ⇒ Object



70
71
72
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 70

# Run +parse+ for the given feature type and pass every (id, sequence)
# pair it produces on to the caller's block.
#
# @param gfftype [String] feature type handed unchanged to +parse+
# @yield [id, seq] record id and its sequence
def each_seq(gfftype)
  parse(gfftype) do |id, seq|
    yield(id, seq)
  end
end

#parse(gfftype) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 20

# Walk the GFF3 records one seqid block at a time and hand every block to
# +parse_block+, forwarding whatever that yields to the caller.
#
# Sequences are loaded up front: from the file named by
# +@options[:fasta_filename]+ when that option is set, otherwise from the
# sequences the iterator exposes through +each_sequence+.
#
# @param gfftype [String] feature type passed through to +parse_block+
# @yield [id, seq] pairs produced by +parse_block+
# @raise [RuntimeError] when a seqid shows up again after its block has
#   already ended (records are not grouped by seqid)
def parse(gfftype)
  @inseqidlist = {}
  @sequencelist = {}
  fasta_path = @options[:fasta_filename]
  if fasta_path
    # Sequences live in a separate FASTA file
    File.open(fasta_path) do |io|
      Bio::GFF::FastaReader.new(io).each { |id, fastarec| @sequencelist[id] = fastarec }
    end
  else
    # Sequences embedded in the GFF3 input
    @iter.each_sequence { |id, bioseq| @sequencelist[id] = bioseq.to_s }
  end

  current = nil   # seqid of the block being collected
  pending = []    # records collected for that block
  @iter.each_rec do |_fpos, line|
    rec = FastLineRecord.new(parse_line_fast(line))
    rec_seqid = rec.seqid
    if current != rec_seqid
      # A different seqid starts a new block; seeing a seqid a second
      # time means the input is not organised in blocks.
      if @inseqidlist[rec_seqid]
        error "GFF3 file not sorted, falling back to line parser"
        raise "ERROR, bailing out"
      end
      if current
        parse_block(gfftype, pending, @sequencelist[current]) { |id, seq| yield id, seq }
      end
      pending = []
      current = rec_seqid
      @inseqidlist[current] = true
    end
    pending << rec
  end
  return unless current

  # Flush the final block
  parse_block(gfftype, pending, @sequencelist[current]) { |id, seq| yield id, seq }
end

#parse_block(gfftype, recs, sequence) ⇒ Object

Parse the records sharing the same seqid and yield each record of type gfftype as an id, seq pair.



62
63
64
65
66
67
68
# File 'lib/bio/db/gff/block/gffblockparser.rb', line 62

# Yield the id and sub-sequence of every record in +recs+ whose feature
# type equals +gfftype+.
#
# The comparison ignores case on both sides, so 'mRNA', 'mrna' and 'MRNA'
# all select the same records.
#
# @param gfftype [String, Symbol] feature type to select
# @param recs [Array] records of one block; each must respond to +id+,
#   +feature_type+, +start+ and +end+
# @param sequence [#[]] sequence of the block's seqid, sliced with the
#   range (start-1)..(end-1) — presumably start/end are 1-based inclusive
#   GFF3 coordinates; confirm against the record class
# @yield [id, seq] record id and the slice of +sequence+ it covers
def parse_block(gfftype, recs, sequence)
  # Normalize the requested type once instead of per record
  wanted = gfftype.to_s.downcase
  recs.each do |rec|
    next unless rec.feature_type.downcase == wanted

    yield rec.id, sequence[(rec.start - 1)..(rec.end - 1)]
  end
end