Class: FastaReader

Inherits:
Object
  • Object
show all
Includes:
Indexer
Defined in:
lib/bigbio/db/fasta/fastareader.rb,
lib/bigbio/db/fasta/fastareader.rb

Overview

The following is actually a module/trait implementation without state

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Indexer

#indexer_get, #indexer_get_by_index, #indexer_set, #indexer_use

Constructor Details

#initialize(fn, opts = {}) ⇒ FastaReader

Initialize the reader of FASTA file fn. Options can be :regex and :index (true/false)



12
13
14
15
16
17
18
# File 'lib/bigbio/db/fasta/fastareader.rb', line 12

# Open the FASTA file +fn+ and prepare the reader.
#
# fn   - path handed to File.open; the handle is kept in @f.
# opts - option hash:
#        :regex - pattern used to extract the id from a header; when the
#                 value is nil the default '^(\S+)' is used.
#        :index - passed straight to indexer_use.
def initialize(fn, opts = {})
  @f = File.open(fn)
  @fread_once = false
  pattern = opts[:regex]
  @regex = pattern.nil? ? '^(\S+)' : pattern
  indexer_use(opts[:index])
end

Class Method Details

.emit(getbuf_func) {|id, descr, seq| ... } ⇒ Object

getbuf_func is called repeatedly and returns the next buffer of FASTA text; a falsy return value ends the input. Every time a record is parsed it is yielded.

Yields:

  • (id, descr, seq)


141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/bigbio/db/fasta/fastareader.rb', line 141

# Parse FASTA text that arrives in buffers and yield every record.
#
# getbuf_func - a callable; it is invoked repeatedly and each truthy return
#               value is split on newlines and parsed. A falsy return value
#               ends the input.
#
# Lines are split per buffer and never joined across two buffers, so each
# buffer is expected to end on a line boundary.
#
# The id is the first run of non-whitespace characters of the header text
# that follows '>'.
#
# Yields id, descr, seq. A record that is followed by another header is
# yielded even when its sequence is empty; the final record is only yielded
# when its sequence is non-empty.
#
# Raises RuntimeError when a header line carries no id (nothing but
# whitespace after '>').
def FastaReader::emit getbuf_func
  seq = ''.dup
  id = nil
  descr = nil
  while (buf = getbuf_func.call)
    buf.split(/\n/).each do |line|
      if line =~ /^>/
        yield id, descr, seq if descr
        descr = line[1..-1].strip
        matched = /^(\S+)/.match(descr)
        # An empty header would otherwise fail with NoMethodError on nil.
        raise "Can not digest '#{line}' using '^(\\S+)'" unless matched
        id = matched[0]
        # Fresh buffer per record: the string just yielded is never touched again.
        seq = ''.dup
      else
        # Append in place; rebuilding the string per line is quadratic in
        # the sequence length.
        seq << line.strip
      end
    end
  end
  yield id, descr, seq if descr and seq.size > 0
end

.emit_fastarecord(getbuf_func) ⇒ Object



161
162
163
164
165
# File 'lib/bigbio/db/fasta/fastareader.rb', line 161

# Same as emit, except that each parsed record is wrapped in a FastaRecord
# before it is yielded to the caller's block.
def FastaReader.emit_fastarecord(getbuf_func)
  emit(getbuf_func) { |rec_id, rec_descr, rec_seq| yield FastaRecord.new(rec_id, rec_descr, rec_seq) }
end

Instance Method Details

#close ⇒ Object



117
118
119
# File 'lib/bigbio/db/fasta/fastareader.rb', line 117

# Close the file handle held in @f.
def close
  @f.close
end

#digest_tag(tag) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/bigbio/db/fasta/fastareader.rb', line 97

# Split a FASTA header line into its id and description.
#
# tag - the header line; it must contain '>' at the start of a line.
#
# The description is everything after that '>' with surrounding whitespace
# stripped. The id is the first capture group of @regex matched against the
# description (nil when the pattern matches but has no capture group).
#
# Returns the pair id, descr.
# Raises RuntimeError when tag has no leading '>' or @regex does not match.
def digest_tag tag
  header = /^>/.match(tag)
  if header
    descr = header.post_match.strip
    hit = /#{@regex}/.match(descr)
    return hit[1], descr if hit
    p descr  # do not remove these
    p @regex
  end
  # Interpolation instead of String#+: @regex is not guaranteed to be a
  # String, and String + Regexp raises TypeError, which would hide this error.
  raise "Can not digest '#{tag}' using '#{@regex}'"
end

#each ⇒ Object

returns a FastaRecord for every item (invokes parse_each)



55
56
57
# File 'lib/bigbio/db/fasta/fastareader.rb', line 55

# Iterate over the file via parse_each, yielding one FastaRecord per record.
def each
  parse_each do |rec_id, rec_descr, rec_seq|
    yield FastaRecord.new(rec_id, rec_descr, rec_seq)
  end
end

#first ⇒ Object



59
60
61
62
63
# File 'lib/bigbio/db/fasta/fastareader.rb', line 59

# Return the first record of the file as a FastaRecord.
# Parsing is abandoned as soon as parse_each yields its first record.
def first
  parse_each do |rec_id, rec_descr, rec_seq|
    return FastaRecord.new(rec_id, rec_descr, rec_seq)
  end
end

#get(id) ⇒ Object

Return a record by its id, nil when not found



66
67
68
69
70
71
72
73
# File 'lib/bigbio/db/fasta/fastareader.rb', line 66

# Look up a record by its id.
#
# Calls indexed? first, then asks the indexer for the file position of id.
# Returns the record read by get_rec at that position, or nil when the
# indexer returns a falsy value.
def get(id)
  indexed?
  position = indexer_get(id)
  return get_rec(position) if position
  nil
end

#get_by_index(idx) ⇒ Object



88
89
90
91
92
93
94
95
# File 'lib/bigbio/db/fasta/fastareader.rb', line 88

# Fetch a record by its position in the index.
#
# Calls indexed? first, then takes element [1] of the entry returned by
# indexer_get_by_index(idx) as the file position of the record.
#
# Returns the record read by get_rec, or nil when there is no entry or the
# entry has no file position.
def get_by_index idx
  indexed?
  entry = indexer_get_by_index(idx)
  # Guard against a missing entry so the lookup returns nil instead of
  # failing on nil[1]. NOTE(review): assumes the indexer returns nil for an
  # unknown idx - confirm against Indexer.
  fpos = entry && entry[1]
  fpos ? get_rec(fpos) : nil
end

#get_rec(fpos) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/bigbio/db/fasta/fastareader.rb', line 75

# Read the record whose header line starts at file offset fpos.
#
# fpos - offset passed to @f.seek; the line read there is treated as the
#        header and handed to digest_tag.
#
# Sequence lines are stripped and concatenated until the next line that
# starts with '>' or the end of the file.
#
# Returns a FastaRecord built from id, descr and the sequence.
def get_rec fpos
  @f.seek fpos
  tag = @f.gets
  seq = ''.dup
  # gets returns nil at end of file, so a header with no sequence lines after
  # it (e.g. the last record) gives an empty sequence instead of failing on
  # nil.strip.
  while (line = @f.gets)
    break if line =~ /^>/
    # Append in place; rebuilding the string per line is quadratic.
    seq << line.strip
  end
  id, descr = digest_tag(tag)
  FastaRecord.new(id, descr, seq)
end

#parse_each ⇒ Object

Parse the FASTA file and yield id, descr, sequence. When the indexer is on it will index the records the first time. Note that, with indexing, when you don’t complete parsing there will be an error the second time. This is a trade-off, otherwise one would always have to index the file and read it twice.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/bigbio/db/fasta/fastareader.rb', line 25

# Parse the whole FASTA file from the start and yield id, descr, seq for
# every record.
#
# The file is rewound first. Each header line is split by digest_tag; the
# following lines are stripped and concatenated until the next line that
# starts with '>' or the end of the file. A record without sequence lines is
# yielded with an empty sequence, including when it is the last record. An
# empty file yields nothing.
#
# @count is reset to 0 and incremented per record. While @indexer is set and
# no complete pass has finished yet (@fread_once is false), every record's
# id and header offset are passed to indexer_set. @fread_once only becomes
# true after a complete pass, so a pass abandoned by the caller's block
# leaves indexing active for the next pass.
#
# Raises whatever digest_tag raises for a line that is not a valid header.
def parse_each
  @f.seek 0    # force file rewind
  @rec_fpos = 0
  @rec_line = @f.gets
  fpos = 0
  @count = 0
  # @rec_line always holds the header of the record about to be parsed; it
  # is nil for an empty file and after the last record.
  while @rec_line
    # digest id from record description
    id, descr = digest_tag(@rec_line)
    id_fpos = @rec_fpos
    # parse the sequence
    seq = ''.dup
    line = nil
    loop do
      fpos = @f.tell
      line = @f.gets
      # nil means end of file; a '>' line is the next record's header.
      break if line.nil? || line =~ /^>/
      # Append in place; rebuilding the string per line is quadratic.
      seq << line.strip
    end
    # new record
    @count += 1
    @rec_fpos = fpos
    @rec_line = line
    indexer_set(id, id_fpos) if @indexer && !@fread_once
    yield id, descr, seq
  end
  @fread_once = true
end

#size ⇒ Object

Returns the size of the dataset - as read. After the final record the size represents the number of items in the FASTA file



113
114
115
# File 'lib/bigbio/db/fasta/fastareader.rb', line 113

# Return @count, the record counter; nil while @count has not been assigned.
def size
  @count
end