Class: Bio::FastaDefline

Inherits:
Object show all
Defined in:
lib/bio/db/fasta/defline.rb

Overview

Parses a FASTA defline (definition line) and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id       ==> 'gi|671595'
rub.get('emb')     ==> 'CAA85678.1'
rub.emb            ==> 'CAA85678.1'
rub.gi             ==> '671595'
rub.accession      ==> 'CAA85678'
rub.accessions     ==> [ 'CAA85678' ]
rub.acc_version    ==> 'CAA85678.1'
rub.locus          ==> nil
rub.list_ids       ==> [["gi", "671595"],
                        ["emb", "CAA85678.1", nil],
                        ["Perovskia abrotanoides"]]

ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id      ==> "gi|2495000"
ckr.sp            ==> "CCKR_CAVPO"
ckr.pir           ==> "I51898"
ckr.gb            ==> "AAB29504.1"
ckr.gi            ==> "2495000"
ckr.accession     ==> "AAB29504"
ckr.accessions    ==> ["Q63931", "AAB29504"]
ckr.acc_version   ==> "AAB29504.1"
ckr.locus         ==> nil
ckr.description   ==>
  "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions  ==>
  ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
   "cholecystokinin A receptor - guinea pig",
   "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words         ==> 
  ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
   "receptor", "type"]
ckr.id_strings    ==>
  ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
   "544724", "AAB29504.1", "Cavia"]
ckr.list_ids      ==>
  [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
   ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
   ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Constant Summary collapse

# Maps each NSID database tag to the ordered list of field types that follow
# the tag in a '|'-separated identifier (e.g. "sp|<accession>|<entry_id>").
NSIDs = {
  # NCBI and WU-BLAST
  'gi'  => %w[gi],                        # NCBI GI
  'gb'  => %w[acc_version locus],         # GenBank
  'emb' => %w[acc_version locus],         # EMBL
  'dbj' => %w[acc_version locus],         # DDBJ
  'sp'  => %w[accession entry_id],        # SWISS-PROT
  'pdb' => %w[entry_id chain],            # PDB
  'bbs' => %w[number],                    # GenInfo Backbone Id
  'gnl' => %w[database entry_id],         # General database identifier
  'ref' => %w[acc_version locus],         # NCBI Reference Sequence
  'lcl' => %w[entry_id],                  # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => %w[accession entry_id],        # PIR
  'prf' => %w[accession entry_id],        # Protein Research Foundation
  'pat' => %w[country number serial],     # Patents

  # WU-BLAST only
  'bbm' => %w[number],                    # NCBI GenInfo Backbone database identifier
  'gim' => %w[number],                    # NCBI GenInfo Import identifier
  'gp'  => %w[acc_version locus],         # GenPept
  'oth' => %w[accession name release],    # Other (user-definable) identifier
  'tpd' => %w[accession name],            # Third party annotation, DDBJ
  'tpe' => %w[accession name],            # Third party annotation, EMBL
  'tpg' => %w[accession name],            # Third party annotation, GenBank

  # Original
  'ri'  => %w[entry_id rearray_id len]    # RIKEN FANTOM DB
}
# Common words (lower case) that carry no identifying information in a
# description line.
KillWords = %w[
  an the this that
  is are were was be can may might
  as at by for in of on to with
  from and or not
  dna rna mrna cdna orf
  aa nt pct id ec sp subsp
  similar involved identical identity
  cds clone library contig contigs
  homolog homologue homologs homologous
  protein proteins gene genes
  product products sequence sequences
  strain strains region regions
]
# Lookup table consulted by #words (as its default +kwhash+): a word whose
# lower-cased form maps to a true value here is dropped from the result.
# It is created empty at this point.
# NOTE(review): presumably populated from KillWords elsewhere in the file --
# confirm.
KillWordsHash =
{}
# Patterns used by #words (as its default +kill_regexp+): a token matching
# any of them is dropped from the word list.
KillRegexpArray =
[
  # one to three digits, optionally followed by '%' (e.g. "98%")
  /\A\d{1,3}\%?\z/,
  # starts with an upper-case letter and contains a digit that is followed
  # by at least one more word character (e.g. "AAB29504")
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
  # upper-case letters/digits containing an underscore (e.g. "CCKR_CAVPO")
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str) ⇒ FastaDefline

Parses given string.



176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/bio/db/fasta/defline.rb', line 176

# Parses the given string.
#
# +str+ may contain several deflines joined by the "\x01" (Ctrl-A)
# character; each piece is passed to #add_defline in order.
def initialize(str)
  @entry_id = nil
  @deflines = []
  @info     = {}
  @list_ids = []

  str.split("\x01").each { |defline| add_defline(defline) }
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args) ⇒ Object



518
519
520
521
522
523
524
525
526
# File 'lib/bio/db/fasta/defline.rb', line 518

# Dynamic accessor: +obj.emb+ is answered like +obj.get(:emb)+.
#
# +name+ and any extra +args+ are forwarded to #get unchanged (no arity
# check is done here). The result of #get is returned; a nil/false result
# is still returned as-is when +name+ is a key of NSIDs. For any other
# name a RuntimeError is raised whose message begins with
# "NameError: undefined method".
def method_missing(name, *args)
  value = get(name, *args)
  return value if value
  return value if self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end

Instance Attribute Details

#entry_idObject (readonly)

Shows a possibly unique identifier. Returns a string.



173
174
175
# File 'lib/bio/db/fasta/defline.rb', line 173

# Reader for @entry_id, which #add_defline sets from the first defline it
# parses (it is only assigned while still unset).
def entry_id
  @entry_id
end

#list_idsObject (readonly)

Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.



169
170
171
# File 'lib/bio/db/fasta/defline.rb', line 169

# Reader for @list_ids, the array of ID arrays collected while parsing.
# The array itself is returned, not a copy.
def list_ids
  @list_ids
end

Instance Method Details

#acc_versionObject

Shows accession with version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



489
490
491
492
493
494
# File 'lib/bio/db/fasta/defline.rb', line 489

# Accession with version number (type 'acc_version'), looked up once via
# #get_by_type and then cached in @acc_version -- a nil result is cached
# too. Returns a string or nil.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end

#accessionObject

Shows an accession number.



507
508
509
510
511
512
513
514
515
516
# File 'lib/bio/db/fasta/defline.rb', line 507

# Shows an accession number (without version).
#
# Uses the part of #acc_version before the first '.', or, when there is no
# versioned accession, the first element of #accessions. The result is
# cached in @accession.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end

#accessionsObject

Shows accession numbers. Returns an array of strings.



498
499
500
501
502
503
504
# File 'lib/bio/db/fasta/defline.rb', line 498

# Shows accession numbers. Returns an array of strings.
#
# Collects every 'accession' and 'acc_version' value via #get_all_by_type
# and removes everything from the first '.' onwards (the version part).
# The array is cached in @accessions.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version').map! do |id|
    id.sub(/\..*\z/, '')
  end
end

#add_defline(str) ⇒ Object

Parses given string and adds parsed data.



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/bio/db/fasta/defline.rb', line 190

# Parses one defline string and records the result.
#
# A record is appended to @deflines; its first element is the separator
# ('|', ':' or ''), its last element is the description text, and the
# elements in between are arrays of ID tokens (this is the layout #to_s and
# #descriptions read back). @entry_id is set from this line only if it has
# not been set before.
#
# The helpers parse_NSIDs, parse_ColonSepID, match_EC and
# parse_square_brackets are defined elsewhere in the file; remarks about
# them below are assumptions to be confirmed.
def add_defline(str)
  case str
  when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
    # NSIDs ('|'-separated identifiers)
    # examples:
    # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
    #
    # note: regexp (?:) means grouping without backreferences
    i = $1
    d = $2
    tks = i.split('|')
    # String#split drops trailing empty fields; restore one when the ID part
    # ends with '|'.
    tks << '' if i[-1,1] == '|'
    # presumably parse_NSIDs consumes tokens from tks and returns an array
    # of ID arrays -- confirm.
    a = parse_NSIDs(tks)
    # NOTE(review): raises NoMethodError if parse_NSIDs returns an empty
    # array (a[0] would be nil) -- confirm that cannot happen.
    i = a[0].join('|')
    a.unshift('|')
    # Tokens still left in tks are put back in front of the description.
    d = tks.join('|') + ' ' + d unless tks.empty?
    a << d
    this_line = a
    match_EC(d)
    parse_square_brackets(d).each do |x|
      # An item for which match_EC(x, false) is false and that starts with
      # an upper-case letter is kept as an ID-like string; the first such
      # item becomes @info['organism'].
      if !match_EC(x, false) and x =~ /\A[A-Z]/ then
        di = [  x ]
        @list_ids << di
        @info['organism'] = x unless @info['organism']
      end
    end

  when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
    # ':'-separated identifiers
    # examples:
    # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
    # >emb:CACDC28 [X80034] C.albicans CDC28 gene
    i = $1
    d = $2
    a = parse_ColonSepID(i)
    i = a.join(':')
    this_line = [ ':', a , d ]
    match_EC(d)
    parse_square_brackets(d).each do |x|
      if !match_EC(x, false) and x =~ /:/ then
        parse_ColonSepID(x)
      elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
        # a bare upper-case token such as [X80034] is kept as an ID-like
        # string
        @list_ids << [ $1 ]
      end
    end

  when /^\>?\s*(\S+)(?:\s+(.+))?$/
    # a single word followed by an optional description
    # examples:
    # >ABC12345 this is test
    i = $1
    d = $2.to_s
    # the ID is stored without one trailing '.', while the record keeps the
    # word as written
    @list_ids << [ i.chomp('.') ]
    this_line = [  '', [ i ], d ]
    match_EC(d)
  else
    # nothing matched: the whole string is used as the ID, with an empty
    # description
    i = str
    d = ''
    match_EC(i)
    this_line = [ '', [ i ], d ]
  end

  @deflines << this_line
  @entry_id = i unless @entry_id
end

#descriptionObject

Shows description.



332
333
334
# File 'lib/bio/db/fasta/defline.rb', line 332

# Shows the description of the first defline, i.e. the last element of the
# first record in @deflines. Returns nil when nothing has been parsed.
def description
  first_record = @deflines.first
  first_record.to_a.last
end

#descriptionsObject

Returns descriptions.



337
338
339
340
341
# File 'lib/bio/db/fasta/defline.rb', line 337

# Returns the descriptions of all deflines, one per record in @deflines
# (the last element of each record), as a new array.
def descriptions
  @deflines.map { |record| record.last }
end

#get(dbname) ⇒ Object

Returns the identifier for the given database name.



413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
# File 'lib/bio/db/fasta/defline.rb', line 413

# Returns the identifier stored for database +dbname+ (String or Symbol),
# or nil when there is none.
#
# A value already present in @info wins. Otherwise the first entry of
# @list_ids whose tag equals the name is used:
# * entries with at most two elements yield their last element;
# * longer entries are searched by field type in the order acc_version,
#   entry_id, locus, accession, number (positions taken from NSIDs), and
#   failing that the first non-nil field is used.
# A found value is cached in @info under the database name.
def get(dbname)
  key = dbname.to_s
  found = @info[key]
  return found if found

  entry = @list_ids.find { |ids| ids[0] == key }
  if entry
    if entry.size <= 2
      found = entry[-1]
    else
      labels = self.class::NSIDs[key]
      %w[acc_version entry_id locus accession number].each do |type|
        pos = labels.index(type)
        next unless pos

        found = entry[pos + 1]
        break if found
      end
      # none of the preferred field types had a value: take any field
      found ||= entry[1..-1].find { |field| field }
    end
  end

  @info[key] = found if found
  found
end

#get_all_by_type(*type_strarg) ⇒ Object

Returns identifiers by given type.



449
450
451
452
453
454
455
456
457
458
459
460
461
# File 'lib/bio/db/fasta/defline.rb', line 449

# Returns all identifiers of the given field types (e.g. 'accession',
# 'acc_version') as an array of strings.
#
# Entries of @list_ids are visited in order; for each entry whose tag is a
# key of NSIDs, the requested types are looked up in the order given and
# every non-nil field is collected. Entries with unknown tags are skipped.
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |ids, collected|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      field = ids[pos + 1]
      collected << field if field
    end
  end
end

#get_by_type(type_str) ⇒ Object

Returns an identifier by given type.



437
438
439
440
441
442
443
444
445
446
# File 'lib/bio/db/fasta/defline.rb', line 437

# Returns the first identifier of the given field type (e.g. 'gi',
# 'acc_version', 'locus'), or nil when no entry carries one.
#
# Entries of @list_ids are scanned in order. An entry is considered when
# its tag is a key of NSIDs and that database defines +type_str+ as one of
# its fields. Entries whose field for that type is empty (nil) are skipped,
# so that an ID such as "gb|ACC.1|" with no locus does not hide a locus
# given by a later entry; this matches #get_all_by_type, which also ignores
# nil fields.
def get_by_type(type_str)
  @list_ids.each do |ids|
    labels = self.class::NSIDs[ids[0]]
    next unless labels

    pos = labels.index(type_str)
    next unless pos

    # field values sit one position after their label index because the
    # entry starts with the database tag
    value = ids[pos + 1]
    return value if value
  end
  nil
end

#giObject

Shows GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



478
479
480
481
482
483
# File 'lib/bio/db/fasta/defline.rb', line 478

# GI number (type 'gi'), looked up once via #get_by_type and then cached in
# @gi -- a nil result is cached too. Returns a string or nil.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end

#id_stringsObject

Shows ID-like strings. Returns an array of strings.



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# File 'lib/bio/db/fasta/defline.rb', line 345

# Shows ID-like strings. Returns an array of strings (duplicates are not
# removed).
#
# Sources, in this order:
# 1. every non-nil field of each @list_ids entry that has a tag plus at
#    least one field;
# 2. for single-element entries, the element itself when it consists only
#    of letters, digits, '.', '-' and '_';
# 3. case-sensitive words of the descriptions (no kill patterns applied)
#    that look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end

#locusObject

Shows locus. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.



467
468
469
470
471
472
# File 'lib/bio/db/fasta/defline.rb', line 467

# Locus name (type 'locus'), looked up once via #get_by_type and then
# cached in @locus -- a nil result is cached too. Returns a string or nil.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end

#to_sObject

Shows the original string. Note that the result may differ from the string originally given to FastaDefline.new.



324
325
326
327
328
329
# File 'lib/bio/db/fasta/defline.rb', line 324

def to_s
  @deflines.collect { |a|
    s = a[0]
    (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
  }.join("\x01")
end

#words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray, kwhash = self.class::KillWordsHash) ⇒ Object

Shows words used in the defline. Returns an Array.



387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# File 'lib/bio/db/fasta/defline.rb', line 387

# Shows words used in the descriptions. Returns a sorted Array without
# duplicates.
#
# case_sensitive:: when false/nil (default) the words are lower-cased
# kill_regexp::    array of patterns; a word matching any of them is dropped
# kwhash::         lookup table; a word whose lower-cased form has a true
#                  value in it is dropped
#
# The descriptions are split on punctuation, whitespace and control
# characters. Leading runs of '$', '*', '-', '+' and trailing runs of
# '$', '*', '-', '=' are removed from each token, and tokens of one
# character or less are discarded.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)

  kept = tokens.map do |token|
    word = token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
    next nil if word.size <= 1 || kwhash[word.downcase]
    next nil if kill_regexp.any? { |expr| expr =~ word }

    word
  end.compact

  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end