Class: Bio::FastaFormat

Inherits:
DB show all
Defined in:
lib/bio/db/fasta.rb

Overview

Treats a FASTA formatted entry, such as:

>id and/or some comments                    <== comment line
ATGCATGCATGCATGCATGCATGCATGCATGCATGC        <== sequence lines
ATGCATGCATGCATGCATGCATGCATGCATGCATGC
ATGCATGCATGC

The leading ‘>’ can be omitted, and a trailing ‘>’ will be removed automatically.

Examples

# Example: parse the first entry of a two-entry FASTA string and print
# the values returned by the main Bio::FastaFormat accessors.
f_str = <<END_OF_STRING
>sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG
VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME
GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL
KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC
IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP
QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
>sce:YBR274W  CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
KTGDPLEWRRLFKKISTICRDIILIPN
END_OF_STRING

# Only the first entry of f_str is used by the constructor.
f = Bio::FastaFormat.new(f_str)
puts "### FastaFormat"
puts "# entry"
puts f.entry
puts "# entry_id"
p f.entry_id
puts "# definition"
p f.definition
puts "# data"
p f.data
puts "# seq"
p f.seq
# Object#type no longer exists in Ruby 1.9 and later; use #class instead.
puts "# seq.class"
p f.seq.class
puts "# length"
p f.length
puts "# aaseq"
p f.aaseq
puts "# aaseq.class"
p f.aaseq.class
puts "# aaseq.composition"
p f.aaseq.composition
puts "# aalen"
p f.aalen

References

Direct Known Subclasses

FastaNumericFormat

Constant Summary collapse

DELIMITER =

Entry delimiter in flatfile text.

RS = "\n>"
DELIMITER_OVERRUN =

(Integer) excess read size included in DELIMITER.

1

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Constructor Details

#initialize(str) ⇒ FastaFormat

Stores the comment and sequence information from one entry of the FASTA format string. If the argument contains more than one entry, only the first entry is used.



119
120
121
122
123
124
# File 'lib/bio/db/fasta.rb', line 119

# Splits one FASTA entry into its definition line and its sequence text.
#
# str:: FASTA formatted text. Only the first entry is kept.
#
# Sets:
# * @definition    -- the first line, without a leading '>' and stripped
# * @data          -- everything after the first line, up to (but not
#                     including) the next line that starts with '>'
# * @entry_overrun -- the text removed from @data (from that next '>'
#                     line to the end of str), or nil when there is none
def initialize(str)
  header = str[/.*/]
  @definition = header.sub(/^>/, '').strip
  @data = str.sub(/.*/, '')
  # Cut off any following entries and keep what was cut.
  @entry_overrun = @data.slice!(/^>.*/m)
end

Instance Attribute Details

#dataObject

The sequence lines as text.



112
113
114
# File 'lib/bio/db/fasta.rb', line 112

# Reader for @data, the sequence lines of the entry as raw text.
def data; @data; end

#definitionObject

The comment line of the FASTA formatted data.



109
110
111
# File 'lib/bio/db/fasta.rb', line 109

# Reader for @definition, the comment (first) line of the entry.
def definition; @definition; end

#entry_overrunObject (readonly)

Returns the value of attribute entry_overrun.



114
115
116
# File 'lib/bio/db/fasta.rb', line 114

# Reader for @entry_overrun.
def entry_overrun; @entry_overrun; end

Instance Method Details

#aalenObject

Returns the length of the Bio::Sequence::AA.



209
210
211
# File 'lib/bio/db/fasta.rb', line 209

# Length of the object returned by #aaseq.
def aalen
  aaseq.length
end

#aaseqObject

Returns the sequence as a Bio::Sequence::AA.



204
205
206
# File 'lib/bio/db/fasta.rb', line 204

# Builds a new Sequence::AA from the value of #seq.
def aaseq
  residues = seq
  Sequence::AA.new(residues)
end

#acc_versionObject

Returns accession number with version.



265
266
267
# File 'lib/bio/db/fasta.rb', line 265

# Delegates to #identifiers and returns its acc_version.
def acc_version
  defline = identifiers
  defline.acc_version
end

#accessionObject

Returns an accession number.



253
254
255
# File 'lib/bio/db/fasta.rb', line 253

# Delegates to #identifiers and returns its accession.
def accession
  defline = identifiers
  defline.accession
end

#accessionsObject

Parses the FASTA defline (using the #identifiers method) and returns the accession numbers as an array of strings.



260
261
262
# File 'lib/bio/db/fasta.rb', line 260

# Delegates to #identifiers and returns its accessions.
def accessions
  defline = identifiers
  defline.accessions
end

#commentObject

Returns comments.



183
184
185
186
# File 'lib/bio/db/fasta.rb', line 183

# Returns @comment after making sure #seq has been called.
#
# #seq is invoked purely for its side effect; the result is nil when
# @comment has not been assigned.
def comment
  seq
  defined?(@comment) ? @comment : nil
end

#entryObject Also known as: to_s

Returns the stored entry in FASTA format (same as to_s).



127
128
129
# File 'lib/bio/db/fasta.rb', line 127

# Rebuilds the entry as FASTA text: a '>' definition line followed by
# the stripped sequence text and a final newline. The string is also
# stored in @entry.
def entry
  body = @data.strip
  @entry = ">#{@definition}\n#{body}\n"
end

#entry_idObject

Parses the FASTA defline (using the #identifiers method) and returns a possibly unique identifier as a string.



239
240
241
# File 'lib/bio/db/fasta.rb', line 239

# Delegates to #identifiers and returns its entry_id.
def entry_id
  defline = identifiers
  defline.entry_id
end

#giObject

Parses the FASTA defline (using the #identifiers method) and returns the GI/locus/accession/accession with version number. If an entry has two or more such IDs, only the first ID is returned. It returns a string or nil.



248
249
250
# File 'lib/bio/db/fasta.rb', line 248

# Delegates to #identifiers and returns its gi.
def gi
  defline = identifiers
  defline.gi
end

#identifiersObject

Parses the FASTA defline and extracts IDs. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs. It returns a Bio::FastaDefline instance.



229
230
231
232
233
234
# File 'lib/bio/db/fasta.rb', line 229

# Returns the FastaDefline built from @definition.
#
# The object is created on the first call and stored in @ids; later
# calls return that same object.
def identifiers
  return @ids if defined?(@ids)

  @ids = FastaDefline.new(@definition)
end

#lengthObject

Returns sequence length.



189
190
191
# File 'lib/bio/db/fasta.rb', line 189

# Length of the object returned by #seq.
def length
  sequence = seq
  sequence.length
end

#locusObject

Returns locus.



270
271
272
# File 'lib/bio/db/fasta.rb', line 270

# Delegates to #identifiers and returns its locus.
def locus
  defline = identifiers
  defline.locus
end

#nalenObject

Returns the length of the Bio::Sequence::NA.



199
200
201
# File 'lib/bio/db/fasta.rb', line 199

# Length of the object returned by #naseq.
def nalen
  naseq.length
end

#naseqObject

Returns the sequence as a Bio::Sequence::NA.



194
195
196
# File 'lib/bio/db/fasta.rb', line 194

# Builds a new Sequence::NA from the value of #seq.
def naseq
  bases = seq
  Sequence::NA.new(bases)
end

#query(factory) ⇒ Object Also known as: fasta, blast

Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast factory object.

#!/usr/bin/env ruby
# Example: run a search for every entry of a FASTA file and print the hits.
require 'bio'

# Factory object that performs the searches.
# NOTE(review): presumably 'fasta34' is the search program and
# 'db/swissprot.f' the target database -- confirm against Bio::Fasta.local.
factory = Bio::Fasta.local('fasta34', 'db/swissprot.f')
# Read 'queries.f' as a series of Bio::FastaFormat entries.
flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f')
flatfile.each do |entry|
  p entry.definition
  # #fasta is an alias of Bio::FastaFormat#query.
  result = entry.fasta(factory)
  # One output line per hit: query id, e-value, target id, then lap_at.
  result.each do |hit|
    print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at "
    p hit.lap_at
  end
end


150
151
152
# File 'lib/bio/db/fasta.rb', line 150

# Passes this entry, as FASTA text from #entry, to factory.query and
# returns whatever that call returns.
#
# factory:: any object that responds to #query
def query(factory)
  fasta_text = entry
  factory.query(fasta_text)
end

#seqObject

Returns a joined sequence line as a String.



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/bio/db/fasta.rb', line 157

# Returns the sequence with spaces, tabs, CR/LF and digits removed,
# wrapped in a Sequence::Generic. The result is cached in @seq.
#
# When @data begins (after optional whitespace) with a '#' line, lines
# starting with '#' are treated as comments: they are left out of the
# sequence and collected in @comment, a Hash that maps the number of
# cleaned sequence characters seen so far to the comment text. Comments
# recorded at the same position are joined with "\n".
def seq
  return @seq if defined?(@seq)

  if /\A\s*^\#/ =~ @data
    comments = {}
    residues = []
    offset = 0
    @data.split(/(^\#.*$)/).each do |chunk|
      if /^# ?(.*)$/ =~ chunk
        text = $1
        if comments[offset]
          comments[offset] << "\n" << text
        else
          comments[offset] = text
        end
      else
        chunk.tr!(" \t\r\n0-9", '') # lazy clean up
        offset += chunk.length
        residues << chunk
      end
    end
    @comment = comments
    @seq = Bio::Sequence::Generic.new(residues.join(''))
  else
    @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up
  end
  @seq
end

#to_biosequenceObject Also known as: to_seq

Returns sequence as a Bio::Sequence object.

Note: If you modify the returned Bio::Sequence object, the sequence or definition in this FastaFormat object might also be changed (but not always), for efficiency reasons.



220
221
222
# File 'lib/bio/db/fasta.rb', line 220

# Returns the result of Bio::Sequence.adapter called with this object
# and the Bio::Sequence::Adapter::FastaFormat module.
def to_biosequence
  adapter_module = Bio::Sequence::Adapter::FastaFormat
  Bio::Sequence.adapter(self, adapter_module)
end