Module: Bio::GFFbrowser::Digest::Parser

Includes:
Helpers, Helpers::Error, Helpers::Validate, Gff3Component, Gff3Features
Included in:
InMemory, NoCache
Defined in:
lib/bio/db/gff/digest/gffparser.rb

Overview

This Parser module is shared by both the in-memory and the no-cache full-digest parsers.

Instance Method Summary collapse

Methods included from Helpers::Error

#debug, #error, #info, #warn

Methods included from Helpers::Validate

#validate_cdss, #validate_mrnas

Instance Method Details

#each_CDS ⇒ Object

Yield the id, recs, and containing component



89
90
91
92
# File 'lib/bio/db/gff/digest/gffparser.rb', line 89

# Walks the CDS list and yields, for every entry, the id, its records and
# the containing component (exactly what each_item hands to its block).
#
# parse is invoked first when @cdslist has not been set yet; both parse and
# each_item are defined outside this method.
def each_CDS
  parse unless @cdslist
  each_item(@cdslist) do |cds_id, records, container|
    yield cds_id, records, container
  end
end

#each_CDS_seq ⇒ Object

Yield a unique description and the sequence



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/bio/db/gff/digest/gffparser.rb', line 131

# Yields a description and the assembled sequence for every CDS that has a
# containing component.
#
# The component's seqname is looked up in @sequencelist. When nothing is
# found a warning is issued and the CDS is skipped. The sequence is assembled
# with @options plus :codonize => true. A length that is not a multiple of
# three is reported (the records are printed and a warning is issued) but
# the sequence is still yielded.
def each_CDS_seq
  each_CDS do |cds_id, records, container|
    next unless container

    reference = @sequencelist[container.seqname]
    next warn("No sequence information for", cds_id) unless reference

    cds_seq = assemble(reference, container.start, records, @options.merge(:codonize => true))
    unless (cds_seq.size % 3).zero?
      p records # leave this in
      warn "CDS size is not a multiple of 3", cds_id
    end
    yield description(cds_id, container, records), cds_seq
  end
end

#each_exon ⇒ Object

Yield the id, recs, and containing component



95
96
97
98
# File 'lib/bio/db/gff/digest/gffparser.rb', line 95

# Walks the exon list and yields, for every entry, the id, its records and
# the containing component (exactly what each_item hands to its block).
#
# parse is invoked first when @exonlist has not been set yet; both parse and
# each_item are defined outside this method.
def each_exon
  parse unless @exonlist
  each_item(@exonlist) do |exon_id, records, container|
    yield exon_id, records, container
  end
end

#each_exon_seq ⇒ Object

Yield a unique description and the sequence



152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/bio/db/gff/digest/gffparser.rb', line 152

# Yields a description and the assembled sequence for every exon that has a
# containing component.
#
# The component's seqname is looked up in @sequencelist. When nothing is
# found a warning is issued and the exon is skipped.
def each_exon_seq
  each_exon do |exon_id, records, container|
    next unless container

    reference = @sequencelist[container.seqname]
    next warn("No sequence information for", exon_id) unless reference

    # The sequence is assembled before the description is built.
    exon_seq = assemble(reference, container.start, records)
    yield description(exon_id, container, records), exon_seq
  end
end

#each_gene ⇒ Object

Yield the id, recs, and containing component of each gene



77
78
79
80
# File 'lib/bio/db/gff/digest/gffparser.rb', line 77

# Walks the gene (ORF) list and yields, for every entry, the id, its records
# and the containing component (exactly what each_item hands to its block).
#
# parse is invoked first when @orflist has not been set yet; both parse and
# each_item are defined outside this method.
def each_gene
  parse unless @orflist
  each_item(@orflist) do |gene_id, records, container|
    yield gene_id, records, container
  end
end

#each_gene_seq ⇒ Object

Yield a unique description and the sequence



101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/bio/db/gff/digest/gffparser.rb', line 101

# Yields a description and the assembled sequence for every gene that has a
# containing component.
#
# The component's seqname is looked up in @sequencelist. When nothing is
# found a warning is issued and the gene is skipped.
def each_gene_seq
  each_gene do |gene_id, records, container|
    next unless container

    reference = @sequencelist[container.seqname]
    next warn("No sequence information for", gene_id) unless reference

    # The description is built before the sequence is assembled.
    header = description(gene_id, container, records)
    yield header, assemble(reference, container.start, records)
  end
end

#each_mRNA ⇒ Object

Yield the id, recs, and containing component of each mRNA



83
84
85
86
# File 'lib/bio/db/gff/digest/gffparser.rb', line 83

# Walks the mRNA list and yields, for every entry, the id, its records and
# the containing component (exactly what each_item hands to its block).
#
# parse is invoked first when @mrnalist has not been set yet; both parse and
# each_item are defined outside this method.
def each_mRNA
  parse unless @mrnalist
  each_item(@mrnalist) do |mrna_id, records, container|
    yield mrna_id, records, container
  end
end

#each_mRNA_seq ⇒ Object

Yield a unique description and the sequence



116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/bio/db/gff/digest/gffparser.rb', line 116

# Yields a description and the assembled sequence for every mRNA that has a
# containing component.
#
# The component's seqname is looked up in @sequencelist. When nothing is
# found a warning is issued and the mRNA is skipped.
def each_mRNA_seq
  each_mRNA do |mrna_id, records, container|
    next unless container

    reference = @sequencelist[container.seqname]
    next warn("No sequence information for", mrna_id) unless reference

    # The description is built before the sequence is assembled.
    header = description(mrna_id, container, records)
    yield header, assemble(reference, container.start, records)
  end
end

#read_fasta ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/bio/db/gff/digest/gffparser.rb', line 63

# Loads the FASTA file named by @options[:fasta_filename], if any, and
# stores every record the reader yields in @sequencelist under its id.
#
# Does nothing (and returns nil) when no filename is configured. The file is
# opened in block form, so it is closed again when reading finishes.
def read_fasta
  fasta_path = @options[:fasta_filename]
  return unless fasta_path

  File.open(fasta_path) do |io|
    Bio::GFF::FastaReader.new(io).each do |seq_id, record|
      @sequencelist[seq_id] = record
    end
  end
end

#show_unrecognized_features ⇒ Object



57
58
59
60
61
# File 'lib/bio/db/gff/digest/gffparser.rb', line 57

# Issues a "Feature has no match" warning for every key recorded in
# @unrecognized_features, skipping nil/false keys.
def show_unrecognized_features
  feature_types = @unrecognized_features.keys
  feature_types.each do |feature_type|
    next unless feature_type

    warn "Feature has no match", feature_type
  end
end

#store_record(rec) ⇒ Object

Takes a parsed record rec and stores items in the relevant lists/tables



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/bio/db/gff/digest/gffparser.rb', line 28

# Takes a parsed record +rec+ and stores it in the relevant lists/tables.
#
# Comment records are skipped entirely. For every other record the formatted
# ID and the seqname are counted. Records whose feature type is listed in
# COMPONENT_TYPES are registered in @componentlist, with a warning when the
# record carries no ID of its own. Gene, mRNA, CDS and exon records, named
# either by their plain feature type or by the Sequence Ontology accession
# listed next to it, are added to their respective lists. Any other feature
# type is remembered in @unrecognized_features, unless it is a component or
# is listed in IGNORE_FEATURES.
#
# rec must respond to comment, id, seqname and feature_type.
def store_record(rec)
  return if rec.comment # skip GFF comments

  id = Helpers::Record::formatID(rec)
  @count_ids.add(id)
  @count_seqnames.add(rec.seqname)

  feature_type = rec.feature_type
  is_component = COMPONENT_TYPES.include?(feature_type)
  if is_component
    # check for container ID
    warn("Container <#{feature_type}> has no ID, so using sequence name instead", id) if rec.id.nil?
    @componentlist[id] = rec
    info "Added #{feature_type} with component ID #{id}"
  end

  # NOTE: alternatives must be comma-separated. The earlier form
  # `when 'gene' || 'SO:0000704'` only ever compared against 'gene', so the
  # SO accessions were never recognized.
  case feature_type
  when 'gene', 'SO:0000704'
    @orflist.add(id, rec)
  when 'mRNA', 'SO:0000234'
    @mrnalist.add(id, rec)
  when 'CDS', 'SO:0000316'
    @cdslist.add(id, rec)
  when 'exon', 'SO:0000147'
    @exonlist.add(id, rec)
  else
    if !is_component && !IGNORE_FEATURES.include?(feature_type)
      @unrecognized_features[feature_type] = true
    end
  end
end