Class: Bio::FastaDefline

Inherits:
Object show all
Defined in:
lib/bio/db/fasta/defline.rb

Overview

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Constant Summary collapse

# Maps a database tag (the first token of an ID group such as "gi" or "sp")
# to the ordered list of field labels that follow that tag.
# The label at position i names the value stored at index i+1 of an ID entry
# in @list_ids (this is how get, get_by_type and get_all_by_type index it).
NSIDs =
{
  # NCBI and WU-BLAST
  'gi'  => [ 'gi' ],                      # NCBI GI
  'gb'  => [ 'acc_version', 'locus' ],      # GenBank
  'emb' => [ 'acc_version', 'locus' ],      # EMBL
  'dbj' => [ 'acc_version', 'locus' ],      # DDBJ
  'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
  'tr'  => [ 'accession', 'entry_id' ],   # TREMBL
  'pdb' => [ 'entry_id', 'chain' ],       # PDB
  'bbs' => [ 'number' ],                  # GenInfo Backbone Id
  'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
  'ref' => [ 'acc_version' , 'locus' ],     # NCBI Reference Sequence
  'lcl' => [ 'entry_id' ],                # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => [ 'accession', 'entry_id' ],   # PIR
  'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
  'pat' => [ 'country', 'number', 'serial' ], # Patents

  # WU-BLAST only
  'bbm' => [ 'number' ],      # NCBI GenInfo Backbone database identifier
  'gim' => [ 'number' ],      # NCBI GenInfo Import identifier
  'gp'  => [ 'acc_version', 'locus' ],      # GenPept
  'oth' => [ 'accession', 'name', 'release' ],  # Other (user-definable) identifier
  'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
  'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
  'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

  # Original
  'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
}
# Lower-case list of common, uninformative words (articles, prepositions,
# generic biology terms).
# NOTE(review): presumably the source used to populate KillWordsHash, which
# the words method consults to drop such words -- confirm where that happens.
KillWords =
[
  'an', 'the', 'this', 'that',
  'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
  'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
  'from', 'and', 'or', 'not',
  'dna', 'rna', 'mrna', 'cdna', 'orf',
  'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
  'similar', 'involved', 'identical', 'identity',
  'cds', 'clone', 'library', 'contig', 'contigs',
  'homolog', 'homologue', 'homologs', 'homologous',
  'protein', 'proteins', 'gene', 'genes',
  'product', 'products', 'sequence', 'sequences', 
  'strain', 'strains', 'region', 'regions',
]
# Default lookup table for the words method: a word is discarded when its
# down-cased form is a key with a truthy value in this hash.
# NOTE(review): shown empty here; presumably filled from KillWords elsewhere
# in the file -- confirm.
KillWordsHash =
{}
# Default patterns for the words method: a word matching any of these is
# discarded.
#   1. one to three digits, optionally followed by "%"
#   2. starts with an upper-case letter and contains a digit that is followed
#      by at least one more letter, digit or underscore
#   3. upper-case letters/digits containing an underscore
# Patterns 2 and 3 are the same expressions id_strings uses to pick ID-like
# words out of the descriptions.
KillRegexpArray =
[
  /\A\d{1,3}\%?\z/,
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str) ⇒ FastaDefline

Parses given string.



180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/bio/db/fasta/defline.rb', line 180

# Builds a new object from a raw defline string.
#
# The string may hold several deflines joined by the "\x01" (Ctrl-A)
# character; each piece is handed to add_defline in order.
#
# str:: defline text (String)
def initialize(str)
  # Start from an empty state; add_defline fills these in.
  @entry_id = nil
  @list_ids = []
  @info = {}
  @deflines = []

  str.split("\x01").each { |line| add_defline(line) }
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object



521
522
523
524
525
526
527
528
529
# File 'lib/bio/db/fasta/defline.rb', line 521

# Treats an unknown method name as a database tag and looks it up with get,
# so that e.g. obj.emb behaves like obj.get('emb').
#
# Returns the value found by get. When nothing is found but the name is a
# key of NSIDs, the falsy result of get is returned instead of raising.
# Otherwise raises a RuntimeError whose message starts with "NameError:".
#
# Note: the arguments are forwarded to get unchanged; get as defined in this
# class accepts a single argument, so extra arguments fail inside that call.
def method_missing(name, *args)
  found = get(name, *args)
  return found if found
  # Known database tag that is simply absent from this defline.
  return found if self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end

Instance Attribute Details

#entry_idObject (readonly)

Shows a possibly unique identifier. Returns a string.



177
178
179
# File 'lib/bio/db/fasta/defline.rb', line 177

# Reader for @entry_id.
# @entry_id is initialised to nil by the constructor and set by add_defline
# to the ID of the first defline parsed (later deflines do not overwrite it).
def entry_id
  @entry_id
end

#list_idsObject (readonly)

Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.



173
174
175
# File 'lib/bio/db/fasta/defline.rb', line 173

# Reader for @list_ids, the array of ID entries collected while parsing.
# Returns the internal array itself (not a copy).
def list_ids
  @list_ids
end

Instance Method Details

#acc_versionObject

Shows accession with version number. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.



492
493
494
495
496
497
# File 'lib/bio/db/fasta/defline.rb', line 492

# Returns the first 'acc_version' field reported by get_by_type, or nil.
# The result (including nil) is computed once and cached in @acc_version.
def acc_version
  @acc_version = get_by_type('acc_version') unless defined?(@acc_version)
  @acc_version
end

#accessionObject

Shows an accession number.



510
511
512
513
514
515
516
517
518
519
# File 'lib/bio/db/fasta/defline.rb', line 510

# Returns one accession number (without version suffix), or nil.
#
# If acc_version is available, the part before its first "." is used;
# otherwise the first element of accessions. The result is cached in
# @accession.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end

#accessionsObject

Shows accession numbers. Returns an array of strings.



501
502
503
504
505
506
507
# File 'lib/bio/db/fasta/defline.rb', line 501

# Returns all accession numbers as an array of strings.
#
# Collects every 'accession' and 'acc_version' field via get_all_by_type and
# strips everything from the first "." onwards (the version suffix).
# The array is cached in @accessions.
def accessions
  return @accessions if defined?(@accessions)

  ids = get_all_by_type('accession', 'acc_version')
  ids.map! { |id| id.sub(/\..*\z/, '') }
  @accessions = ids
end

#add_defline(str) ⇒ Object

Parses given string and adds parsed data.



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/bio/db/fasta/defline.rb', line 194

# Parses one defline string and appends the parsed record to @deflines.
#
# Three input shapes are recognised, tried in this order:
#   1. "|"-separated NSIDs    (">gi|123|sp|Q9UWG2|NAME description")
#   2. ":"-separated IDs      (">sce:YBR160W description")
#   3. a single bare ID       (">ABC12345 description")
# Anything else is stored as-is with an empty description.
#
# Each record pushed to @deflines has the form
#   [ separator, id_group, ..., description ]
# which is the layout to_s, description and descriptions rely on.
# @entry_id is set from the first defline only.
#
# The helpers parse_NSIDs, parse_ColonSepID, parse_square_brackets and
# match_EC are defined elsewhere in the file; comments about them below are
# inferred from how they are used here and should be confirmed.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    i = $1   # the ID part (no whitespace, contains at least one "|")
    d = $2   # the rest of the line: description
    tks = i.split('|')
    # String#split drops trailing empty fields, so restore one when the
    # ID part ends with "|".
    tks << '' if i[-1,1] == '|'
    # presumably returns an array of ID groups and consumes the tokens it
    # recognised from tks -- TODO confirm against parse_NSIDs
    a = parse_NSIDs(tks)
    # the first ID group, re-joined, is this line's entry ID candidate
    i = a[0].join('|')
    a.unshift('|')
    # tokens left in tks are put back in front of the description
    d = tks.join('|') + ' ' + d unless tks.empty?
    a << d
    this_line = a
    match_EC(d)
    parse_square_brackets(d).each do |x|
      # a bracketed item that is not matched by match_EC and starts with an
      # upper-case letter is recorded as an ID-like string; the first such
      # item is also stored as the organism
      if !match_EC(x, false) and x =~ /\A[A-Z]/ then
        di = [  x ]
        @list_ids << di
        @info['organism'] = x unless @info['organism']
      end
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # ":"-separated ID
    # examples:
    # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    i = $1
    d = $2
    a = parse_ColonSepID(i)
    i = a.join(':')
    this_line = [ ':', a , d ]
    match_EC(d)
    parse_square_brackets(d).each do |x|
      if !match_EC(x, false) and x =~ /:/ then
        # bracketed "db:id" item: parsed as another ":"-separated ID
        parse_ColonSepID(x)
      elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
        # bracketed upper-case token such as [X80034]: kept as an ID-like string
        @list_ids << [ $1 ]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # single bare ID followed by an optional description
    # examples:
    # >ABC12345 this is test
    i = $1
    d = $2.to_s   # $2 is nil when there is no description
    # a trailing "." is removed from the ID stored in @list_ids
    # (but not from the ID kept in this_line)
    @list_ids << [ i.chomp('.') ]
    this_line = [  '', [ i ], d ]
    match_EC(d)
  else
    # str matched none of the patterns above: keep the whole string as the
    # ID with an empty description
    i = str
    d = ''
    match_EC(i)
    this_line = [ '', [ i ], d ]
  end

  @deflines << this_line
  # only the first defline parsed determines the entry ID
  @entry_id = i unless @entry_id
end

#descriptionObject

Shows description.



335
336
337
# File 'lib/bio/db/fasta/defline.rb', line 335

# Returns the description of the first defline, i.e. the last element of
# the first record in @deflines. Returns nil when no defline was parsed.
def description
  first_record = @deflines.first
  first_record.to_a.last
end

#descriptionsObject

Returns descriptions.



340
341
342
343
344
# File 'lib/bio/db/fasta/defline.rb', line 340

# Returns the descriptions of all deflines, in parsing order, as an array
# (the last element of every record in @deflines).
def descriptions
  @deflines.map(&:last)
end

#get(dbname) ⇒ Object

Returns an identifier by a database name.



416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# File 'lib/bio/db/fasta/defline.rb', line 416

# Returns the identifier associated with a database name, or nil.
#
# dbname:: database tag or info key (String or Symbol; converted with to_s)
#
# Lookup order:
# 1. a value already stored in @info under that name;
# 2. the first entry of @list_ids whose first element equals the name:
#    * entries with at most two elements yield their last element;
#    * longer entries yield the first non-nil field among 'acc_version',
#      'entry_id', 'locus', 'accession', 'number' (positions taken from
#      NSIDs), falling back to the first non-nil field of the entry.
# A value found in step 2 is cached in @info.
#
# An entry whose tag is not a key of NSIDs no longer raises NoMethodError;
# it simply uses the "first non-nil field" fallback.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  if di.size <= 2
    r = di[-1]
  else
    # Unknown tags have no label list; treat that as "no labelled fields".
    labels = self.class::NSIDs[db] || []
    r = nil
    %w[acc_version entry_id locus accession number].each do |x|
      i = labels.index(x)
      next unless i
      r = di[i + 1]
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end
  @info[db] = r if r
  r
end

#get_all_by_type(*type_strarg) ⇒ Object

Returns identifiers by given type.



452
453
454
455
456
457
458
459
460
461
462
463
464
# File 'lib/bio/db/fasta/defline.rb', line 452

# Returns every non-nil field of the given type(s) found in @list_ids.
#
# type_strarg:: one or more field labels (e.g. 'accession', 'acc_version')
#
# Entries are visited in order; within an entry the fields are collected in
# the order the labels were passed. Entries whose tag is not a key of NSIDs
# are ignored. Returns an array (empty when nothing matches).
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |entry, found|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      value = entry[pos + 1]
      found << value if value
    end
  end
end

#get_by_type(type_str) ⇒ Object

Returns an identifier by given type.



440
441
442
443
444
445
446
447
448
449
# File 'lib/bio/db/fasta/defline.rb', line 440

# Returns the first identifier of the given type found in @list_ids, or nil.
#
# type_str:: field label such as 'gi', 'acc_version' or 'locus'
#
# Entries whose tag is not a key of NSIDs are ignored. An entry that has a
# slot for the requested type but no value in it (nil) is skipped, so a
# value present in a later entry is still found; this matches
# get_all_by_type, which also ignores empty slots.
def get_by_type(type_str)
  @list_ids.each do |x|
    labels = self.class::NSIDs[x[0]]
    next unless labels

    i = labels.index(type_str)
    next unless i

    value = x[i + 1]
    return value if value
  end
  nil
end

#giObject

Shows GI. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.



481
482
483
484
485
486
# File 'lib/bio/db/fasta/defline.rb', line 481

# Returns the first 'gi' field reported by get_by_type, or nil.
# The result (including nil) is computed once and cached in @gi.
def gi
  @gi = get_by_type('gi') unless defined?(@gi)
  @gi
end

#id_stringsObject

Shows ID-like strings. Returns an array of strings.



348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# File 'lib/bio/db/fasta/defline.rb', line 348

# Returns an array of ID-like strings.
#
# The result consists of, in order:
# * for every entry of @list_ids with two or more elements, its non-nil
#   fields (the leading tag is left out);
# * for shorter entries, the single element when it is non-empty and made
#   only of letters, digits, ".", "-" and "_";
# * the case-sensitive words of the descriptions (words(true, [])) that
#   look like identifiers according to the two patterns below.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end

#locusObject

Shows locus. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.



470
471
472
473
474
475
# File 'lib/bio/db/fasta/defline.rb', line 470

# Returns the first 'locus' field reported by get_by_type, or nil.
# The result (including nil) is computed once and cached in @locus.
def locus
  @locus = get_by_type('locus') unless defined?(@locus)
  @locus
end

#to_sObject

Shows the original string. Note that the result of this method may differ from the original string given to the FastaDefline.new method.



327
328
329
330
331
332
# File 'lib/bio/db/fasta/defline.rb', line 327

def to_s
  @deflines.collect { |a|
    s = a[0]
    (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
  }.join("\x01")
end

#words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) ⇒ Object

Shows words used in the defline. Returns an Array.



390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
# File 'lib/bio/db/fasta/defline.rb', line 390

# Returns the sorted, de-duplicated list of words used in the descriptions.
#
# case_sensitive:: when falsy (default) the words are down-cased
# kill_regexp::    collection of patterns; a word matching any is dropped
# kwhash::         hash of stop words; a word is dropped when its
#                  down-cased form has a truthy value in it
#
# All descriptions are joined and split on punctuation, whitespace and
# control characters. Leading "$", "*", "-", "+" and trailing "$", "*",
# "-", "=" characters are stripped from each piece, and pieces of one
# character or less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  pieces = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\# \x00-\x1f\x7f]+/)
  trimmed = pieces.map do |piece|
    piece.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
  end
  kept = trimmed.reject do |word|
    word.size <= 1 ||
      kwhash[word.downcase] ||
      kill_regexp.any? { |expr| expr =~ word }
  end
  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end