Class: Bio::FastaDefline

Inherits:
Object show all
Defined in:
lib/bio/db/fasta/defline.rb

Overview

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Constant Summary collapse

# Known database tags of "|"-separated sequence identifiers.
#
# Each key is a database tag; its value lists, in order, the names of the
# fields that follow the tag. The lookup methods of this class use the
# position of a field name in this list (plus one, to skip the tag itself)
# to pick the matching element out of a parsed ID entry.
NSIDs = {
  # NCBI and WU-BLAST
  'gi'  => %w[gi],                        # NCBI GI
  'gb'  => %w[acc_version locus],         # GenBank
  'emb' => %w[acc_version locus],         # EMBL
  'dbj' => %w[acc_version locus],         # DDBJ
  'sp'  => %w[accession entry_id],        # SWISS-PROT
  'tr'  => %w[accession entry_id],        # TREMBL
  'pdb' => %w[entry_id chain],            # PDB
  'bbs' => %w[number],                    # GenInfo Backbone Id
  'gnl' => %w[database entry_id],         # General database identifier
  'ref' => %w[acc_version locus],         # NCBI Reference Sequence
  'lcl' => %w[entry_id],                  # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => %w[accession entry_id],        # PIR
  'prf' => %w[accession entry_id],        # Protein Research Foundation
  'pat' => %w[country number serial],     # Patents

  # WU-BLAST only
  'bbm' => %w[number],                    # NCBI GenInfo Backbone database identifier
  'gim' => %w[number],                    # NCBI GenInfo Import identifier
  'gp'  => %w[acc_version locus],         # GenPept
  'oth' => %w[accession name release],    # Other (user-definable) identifier
  'tpd' => %w[accession name],            # Third party annotation, DDBJ
  'tpe' => %w[accession name],            # Third party annotation, EMBL
  'tpg' => %w[accession name],            # Third party annotation, GenBank

  # Original
  'ri'  => %w[entry_id rearray_id len],   # RIKEN FANTOM DB
}
# Common lower-case words (articles, prepositions, generic sequence
# vocabulary such as "protein" or "gene") kept as a plain list of strings.
KillWords = %w[
  an the this that
  is are were was be can may might
  as at by for in of on to with
  from and or not
  dna rna mrna cdna orf
  aa nt pct id ec sp subsp
  similar involved identical identity
  cds clone library contig contigs
  homolog homologue homologs homologous
  protein proteins gene genes
  product products sequence sequences
  strain strains region regions
]
# Lookup table used as the default +kwhash+ argument of #words, which drops
# any word whose down-cased form has a truthy entry here.
# NOTE(review): shown as an empty hash at this point; presumably it is filled
# from KillWords elsewhere in the file -- confirm before relying on it.
KillWordsHash =
{}
# Default +kill_regexp+ argument of #words: a word matching any of these
# patterns (each anchored to the whole string) is dropped from the result.
KillRegexpArray =
[
  # one to three digits, optionally followed by a percent sign
  /\A\d{1,3}\%?\z/,
  # starts with an upper-case letter and contains a digit followed by at
  # least one more word character (same pattern #id_strings treats as ID-like)
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
  # upper-case letters/digits containing an underscore (same pattern
  # #id_strings treats as ID-like)
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str) ⇒ FastaDefline

Parses given string.



181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/bio/db/fasta/defline.rb', line 181

# Creates a new object from the given defline string.
#
# +str+ may contain several deflines joined with the "\x01" (Ctrl-A)
# character; every piece is passed to #add_defline in its original order.
def initialize(str)
  # Reset all parsed state before feeding in the deflines.
  @entry_id = nil
  @list_ids = []
  @info     = {}
  @deflines = []

  str.split("\x01").each { |piece| add_defline(piece) }
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object



523
524
525
526
527
528
529
530
531
# File 'lib/bio/db/fasta/defline.rb', line 523

# Dynamic accessor: an unknown method name is treated as a database name
# and looked up with #get (e.g. +obj.emb+ behaves like +obj.get('emb')+).
#
# Any extra arguments are forwarded to #get unchanged; no argument-count
# check is done here.
#
# Returns the value found by #get. When nothing is found but +name+ is a
# key of NSIDs, the falsy result of #get is returned instead of raising.
# Raises RuntimeError (message starting with "NameError:") for every
# other name.
def method_missing(name, *args)
  found = get(name, *args)
  return found if found
  # A known database tag with no value is a legitimate "no such ID" answer.
  return found if self.class::NSIDs[name.to_s]
  raise "NameError: undefined method `#{name.inspect}'"
end

# Companion of #method_missing so that +respond_to?+ reports the dynamic
# accessors truthfully.
#
# Without this hook Ruby's implicit-conversion probes (for example the
# +to_ary+ check made by Array#flatten or Kernel#puts) are routed into
# #method_missing, whose RuntimeError is not swallowed the way a
# NoMethodError would be. Answering false here keeps those probes from
# reaching #method_missing at all.
#
# Returns true exactly when #method_missing would not raise for +name+
# (called without extra arguments): +name+ is a key of NSIDs, or #get
# finds a value for it. Otherwise defers to the default implementation.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs[key]
  return true if get(key)
  super
end

Instance Attribute Details

#entry_idObject (readonly)

Shows a possibly unique identifier. Returns a string.



178
179
180
# File 'lib/bio/db/fasta/defline.rb', line 178

# Reader for @entry_id, which #add_defline sets once, to the ID of the
# first defline it parses. Returns nil if nothing has been parsed yet.
def entry_id
  @entry_id
end

#list_idsObject (readonly)

Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.



174
175
176
# File 'lib/bio/db/fasta/defline.rb', line 174

# Reader for @list_ids, the array of ID entries collected while parsing.
# Each entry is itself an array; the array returned is the internal one,
# not a copy.
def list_ids
  @list_ids
end

Instance Method Details

#acc_versionObject

Shows the accession with version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



494
495
496
497
498
499
# File 'lib/bio/db/fasta/defline.rb', line 494

# Accession with version number, taken from the first ID entry that has an
# 'acc_version' field. The result (including nil) is memoized.
def acc_version
  return @acc_version if defined?(@acc_version)
  @acc_version = get_by_type('acc_version')
end

#accessionObject

Shows an accession number.



512
513
514
515
516
517
518
519
520
521
# File 'lib/bio/db/fasta/defline.rb', line 512

# Accession number without version suffix.
#
# Uses the part of #acc_version before the first "." when a versioned
# accession exists, otherwise the first element of #accessions.
# The result (including nil) is memoized.
def accession
  return @accession if defined?(@accession)
  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end

#accessionsObject

Shows accession numbers. Returns an array of strings.



503
504
505
506
507
508
509
# File 'lib/bio/db/fasta/defline.rb', line 503

# All accession numbers of the entry, with any ".version" suffix removed.
# Collected from the 'accession' and 'acc_version' fields; memoized.
def accessions
  return @accessions if defined?(@accessions)
  collected = get_all_by_type('accession', 'acc_version')
  # Strip everything from the first dot to the end of each ID, in place.
  @accessions = collected.map! { |id| id.sub(/\..*\z/, '') }
end

#add_defline(str) ⇒ Object

Parses given string and adds parsed data.



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/bio/db/fasta/defline.rb', line 195

# Parses one defline +str+ and appends the parsed data to this object.
#
# Three input shapes are recognised by the +case+ below, tried in order:
# "|"-separated IDs, a "db:id" style ID, and a bare first word; anything
# else is stored as-is. In every branch +i+ ends up as the ID string and
# +this_line+ as the record pushed onto @deflines, whose first element is
# the separator and whose last element is the description.
# The first ID seen becomes @entry_id.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    i = $1
    d = $2
    tks = i.split('|')
    # String#split drops a trailing empty field; put it back when the ID
    # string ends with the separator.
    tks << '' if i[-1,1] == '|'
    a = parse_NSIDs(tks)
    # NOTE(review): if parse_NSIDs can return an empty array (e.g. the first
    # token is not a known tag), a[0] is nil and this line raises -- confirm.
    i = a[0].join('|')
    a.unshift('|')
    # Whatever is still in tks is put in front of the description.
    # (presumably parse_NSIDs removes the tokens it consumed -- confirm)
    d = tks.join('|') + ' ' + d unless tks.empty?
    a << d
    this_line = a
    match_EC(d)
    # Bracketed text that is not matched by match_EC and starts with an
    # upper-case letter is kept as an ID-like entry; the first such text is
    # also recorded as the organism.
    parse_square_brackets(d).each do |x|
      if !match_EC(x, false) and x =~ /\A[A-Z]/ then
        di = [  x ]
        @list_ids << di
        @info['organism'] = x unless @info['organism']
      end
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # examples:
    # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    i = $1
    d = $2
    a = parse_ColonSepID(i)
    i = a.join(':')
    this_line = [ ':', a , d ]
    match_EC(d)
    # Bracketed text containing ":" goes through parse_ColonSepID; other
    # bracketed text is kept when it is a single upper-case ID-like token.
    parse_square_brackets(d).each do |x|
      if !match_EC(x, false) and x =~ /:/ then
        parse_ColonSepID(x)
      elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
        @list_ids << [ $1 ]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # examples:
    # >ABC12345 this is test
    i = $1
    # $2 is nil when there is no description part.
    d = $2.to_s
    # The ID is listed without a trailing "." but stored unchanged in
    # this_line.
    @list_ids << [ i.chomp('.') ]
    this_line = [  '', [ i ], d ]
    match_EC(d)
  else
    # No pattern matched: the whole string is used as the ID, with an
    # empty description.
    i = str
    d = ''
    match_EC(i)
    this_line = [ '', [ i ], d ]
  end

  @deflines << this_line
  # Only the first parsed defline determines the entry ID.
  @entry_id = i unless @entry_id
end

#descriptionObject

Shows description.



337
338
339
# File 'lib/bio/db/fasta/defline.rb', line 337

# Description of the first defline (the last element of its parsed
# record), or nil when no defline has been parsed.
def description
  first_record = @deflines.first
  first_record.to_a.last
end

#descriptionsObject

Returns descriptions.



342
343
344
345
346
# File 'lib/bio/db/fasta/defline.rb', line 342

# Descriptions of all parsed deflines, in parsing order: the last element
# of each parsed record.
def descriptions
  @deflines.map { |record| record.last }
end

#get(dbname) ⇒ Object

Returns an identifier for the given database name.



418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
# File 'lib/bio/db/fasta/defline.rb', line 418

# Returns an identifier for the database tag +dbname+ (String or Symbol),
# or a falsy value when none is known.
#
# Lookup order:
# 1. a value already stored in @info under that name;
# 2. the first entry of @list_ids whose tag equals the name:
#    * with at most two elements, its last element;
#    * otherwise the first non-nil field among 'acc_version', 'entry_id',
#      'locus', 'accession', 'number' (positions taken from NSIDs), and
#      failing that the first non-nil element after the tag.
#
# A value found in step 2 is cached in @info.
def get(dbname)
  db = dbname.to_s
  r = @info[db]
  return r if r

  di = @list_ids.find { |x| x[0] == db }
  if di
    if di.size <= 2
      r = di[-1]
    else
      # A tag that is not listed in NSIDs has no field names; treat it as
      # having none so the generic fallback below is used instead of
      # calling a method on nil.
      labels = self.class::NSIDs[db] || []
      ['acc_version', 'entry_id', 'locus', 'accession', 'number'].each do |label|
        pos = labels.index(label)
        next unless pos
        r = di[pos + 1]
        break if r
      end
      r ||= di[1..-1].find { |x| x }
    end
  end
  @info[db] = r if r
  r
end

#get_all_by_type(*type_strarg) ⇒ Object

Returns identifiers by given type.



454
455
456
457
458
459
460
461
462
463
464
465
466
# File 'lib/bio/db/fasta/defline.rb', line 454

# Collects every non-nil field whose name is one of +type_strarg+ from all
# ID entries with a tag listed in NSIDs. Results keep the order of the
# entries and, within one entry, the order of the requested field names.
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |id_fields|
    field_names = self.class::NSIDs[id_fields[0]]
    next unless field_names
    type_strarg.each do |wanted|
      pos = field_names.index(wanted)
      next unless pos
      # Field values start right after the tag, hence the +1.
      value = id_fields[pos + 1]
      found << value if value
    end
  end
  found
end

#get_by_type(type_str) ⇒ Object

Returns an identifier by given type.



442
443
444
445
446
447
448
449
450
451
# File 'lib/bio/db/fasta/defline.rb', line 442

# Returns the +type_str+ field of the first ID entry whose tag (looked up
# in NSIDs) declares such a field; the value may itself be nil.
# Returns nil when no entry declares the field.
def get_by_type(type_str)
  @list_ids.each do |id_fields|
    field_names = self.class::NSIDs[id_fields[0]]
    next unless field_names
    pos = field_names.index(type_str)
    # Field values start right after the tag, hence the +1.
    return id_fields[pos + 1] if pos
  end
  nil
end

#giObject

Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



483
484
485
486
487
488
# File 'lib/bio/db/fasta/defline.rb', line 483

# GI number, taken from the first ID entry that has a 'gi' field.
# The result (including nil) is memoized.
def gi
  return @gi if defined?(@gi)
  @gi = get_by_type('gi')
end

#id_stringsObject

Shows ID-like strings. Returns an array of strings.



350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/bio/db/fasta/defline.rb', line 350

# ID-like strings of the entry, as an array of strings:
# * every non-nil field of each multi-element ID entry (tag excluded);
# * the sole element of a shorter entry when it consists only of letters,
#   digits, ".", "-" and "_";
# * case-preserved description words (from #words with no kill patterns)
#   that look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |fields|
    if fields.size >= 2
      fields[1..-1].each { |value| ids << value if value }
    elsif fields[0].to_s.size > 0 && fields[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << fields[0]
    end
  end

  id_like_words = words(true, []).find_all do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end

#locusObject

Shows the locus. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



472
473
474
475
476
477
# File 'lib/bio/db/fasta/defline.rb', line 472

# Locus name, taken from the first ID entry that has a 'locus' field.
# The result (including nil) is memoized.
def locus
  return @locus if defined?(@locus)
  @locus = get_by_type('locus')
end

#to_sObject

Shows the original string. Note that the result of this method may differ from the original string that was given to FastaDefline.new.



329
330
331
332
333
334
# File 'lib/bio/db/fasta/defline.rb', line 329

def to_s
  @deflines.collect { |a|
    s = a[0]
    (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
  }.join("\x01")
end

#words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) ⇒ Object

Shows words used in the defline. Returns an Array.



392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# File 'lib/bio/db/fasta/defline.rb', line 392

# Words used in the descriptions, as a sorted array without duplicates.
#
# case_sensitive:: when falsy (default) the words are down-cased.
# kill_regexp::    patterns; a word matching any of them is dropped.
# kwhash::         a word is dropped when its down-cased form has a truthy
#                  entry here.
#
# Words are obtained by splitting the joined descriptions on punctuation,
# whitespace and control characters; leading "$", "*", "-", "+" and
# trailing "$", "*", "-", "=" are removed, and words of one character or
# less are discarded.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
  kept = []
  tokens.each do |token|
    word = token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
    next if word.size <= 1
    next if kwhash[word.downcase]
    next if kill_regexp.find { |expr| expr =~ word }
    kept << word
  end
  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end