Class: Bio::EMBL

Inherits:

EMBLDB

Object
DB
EMBLDB
Bio::EMBL

show all

Includes:: Bio::EMBLDB::Common

Defined in:: lib/bio/db/embl/embl.rb

Constant Summary

Constants included from Bio::EMBLDB::Common

Bio::EMBLDB::Common::DELIMITER, Bio::EMBLDB::Common::RS, Bio::EMBLDB::Common::TAGSIZE

Instance Method Summary collapse

#cc ⇒ Object (also: #comment)

returns comment text in the comments (CC) line.
#data_class ⇒ Object
#date_created ⇒ Object

created date.
#date_modified ⇒ Object

modified date.
#dblinks ⇒ Object

database references (DR).
#division ⇒ Object

returns DIVISION in the ID line.
#dt(key = nil) ⇒ Object

returns contents in the date (DT) line.
#each_cds ⇒ Object

iterates on CDS features in the FT lines.
#each_gene ⇒ Object

iterates on gene features in the FT lines.
#entry ⇒ Object (also: #entry_name, #entry_id)

returns ENTRY_NAME in the ID line.
#entry_version ⇒ Object

entry version number numbered by EMBL.
#fh ⇒ Object

returns feature table header (String) in the feature header (FH) line.
#ft ⇒ Object (also: #features)

returns contents in the feature table (FT) lines.
#id_line(key = nil) ⇒ Object

returns contents in the ID line.
#molecule ⇒ Object (also: #molecule_type)

returns MOLECULE_TYPE in the ID line.
#os(num = nil) ⇒ Object

returns contents in the OS line.
#release_created ⇒ Object

release number when created.
#release_modified ⇒ Object

release number when last updated.
#seq ⇒ Object (also: #naseq, #ntseq)

returns the nucleotide sequence in this entry.
#sequence_length ⇒ Object (also: #seqlen)

returns SEQUENCE_LENGTH in the ID line.
#species ⇒ Object

species.
#sq(base = nil) ⇒ Object

returns sequence header information in the sequence header (SQ) line.
#sv ⇒ Object

returns the version information in the sequence version (SV) line.
#to_biosequence ⇒ Object
converts the entry to Bio::Sequence object — Arguments

Returns

Bio::Sequence object.
#topology ⇒ Object
#version ⇒ Object

Instance Method Details

#cc ⇒ `Object` Also known as: comment

returns comment text in the comments (CC) line.

CC Line; comments or notes (>=0)



402
403
404

# File 'lib/bio/db/embl/embl.rb', line 402

# Returns the text of the comment (CC) lines with the leading "CC   " tag
# removed from the start of every line.
def cc
  comment_text = get('CC').to_s
  comment_text.gsub(/^CC   /, '')
end

#data_class ⇒ `Object`



130
131
132

# File 'lib/bio/db/embl/embl.rb', line 130

# Returns the 'DATA_CLASS' value parsed from the ID line.
def data_class
  id_line['DATA_CLASS']
end

#date_created ⇒ `Object`

created date. Returns Date object, String or nil.



462
463
464

# File 'lib/bio/db/embl/embl.rb', line 462

# Returns the 'created' entry of the DT data after passing it through
# parse_date.
def date_created
  created = dt('created')
  parse_date(created)
end

#date_modified ⇒ `Object`

modified date. Returns Date object, String or nil.



457
458
459

# File 'lib/bio/db/embl/embl.rb', line 457

# Returns the 'updated' entry of the DT data after passing it through
# parse_date.
def date_modified
  updated = dt('updated')
  parse_date(updated)
end

#dblinks ⇒ `Object`

database references (DR). Returns an array of Bio::Sequence::DBLink objects.

# File 'lib/bio/db/embl/embl.rb', line 512

# Returns an array with one element per DR line, each produced by
# Bio::Sequence::DBLink.parse_embl_DR_line.
def dblinks
  dr_lines = get('DR').split(/\n/)
  dr_lines.map do |dr_line|
    Bio::Sequence::DBLink.parse_embl_DR_line(dr_line)
  end
end

#division ⇒ `Object`

returns DIVISION in the ID line.

Bio::EMBL#division -> String



140
141
142

# File 'lib/bio/db/embl/embl.rb', line 140

# Returns the 'DIVISION' value parsed from the ID line.
def division
  id_line['DIVISION']
end

#dt(key = nil) ⇒ `Object`

returns contents in the date (DT) line.

Bio::EMBL#dt -> <DT Hash>

where <DT Hash> is:

{}

Bio::EMBL#dt(key) -> String

keys: ‘created’ and ‘updated’

DT Line; date (2/entry)

# File 'lib/bio/db/embl/embl.rb', line 182

# Returns the contents of the date (DT) lines.
#
# On first call the DT text is split into lines; the first line is stored
# under 'created' and the second under 'updated', each with its leading
# two-character tag and three spaces removed and surrounding whitespace
# stripped. The resulting Hash is cached in @data['DT'].
#
# When the entry has fewer than two DT lines, the missing value is stored
# as nil instead of raising NoMethodError.
#
# key:: optional 'created' or 'updated'
#
# Returns the String (or nil) for +key+ when given, otherwise the whole Hash.
def dt(key = nil)
  unless @data['DT']
    created_line, updated_line = get('DT').to_s.split(/\n/)
    # Drops the line tag; a missing line stays nil.
    strip_tag = lambda { |line| line && line.sub(/\w{2}   /, '').strip }
    @data['DT'] = {
      'created' => strip_tag.call(created_line),
      'updated' => strip_tag.call(updated_line)
    }
  end
  key ? @data['DT'][key] : @data['DT']
end

#each_cds ⇒ `Object`

iterates on CDS features in the FT lines.

# File 'lib/bio/db/embl/embl.rb', line 381

# Yields every element of ft whose #feature equals 'CDS'.
def each_cds
  ft.each { |candidate| yield candidate if candidate.feature == 'CDS' }
end

#each_gene ⇒ `Object`

iterates on gene features in the FT lines.

# File 'lib/bio/db/embl/embl.rb', line 390

# Yields every element of ft whose #feature equals 'gene'.
def each_gene
  ft.each { |candidate| yield candidate if candidate.feature == 'gene' }
end

#entry ⇒ `Object` Also known as: entry_name, entry_id

returns ENTRY_NAME in the ID line.

Bio::EMBL#entry -> String



117
118
119

# File 'lib/bio/db/embl/embl.rb', line 117

# Returns the 'ENTRY_NAME' value parsed from the ID line.
def entry
  id_line['ENTRY_NAME']
end

#entry_version ⇒ `Object`

entry version number numbered by EMBL



477
478
479

# File 'lib/bio/db/embl/embl.rb', line 477

# Returns the element at index 1 of parse_release_version applied to the
# 'updated' DT value.
def entry_version
  release_info = parse_release_version(dt('updated'))
  release_info[1]
end

#fh ⇒ `Object`

returns feature table header (String) in the feature header (FH) line.

FH Line; feature table header (0 or 2)



326
327
328

# File 'lib/bio/db/embl/embl.rb', line 326

# Returns the result of fetch('FH'), i.e. the feature table header text.
def fh
  header = fetch('FH')
  header
end

#ft ⇒ `Object` Also known as: features

returns contents in the feature table (FT) lines.

Bio::EMBL#ft -> Bio::Features
Bio::EMBL#ft {} -> {|Bio::Feature| }

same as features method in bio/db/genbank.rb

FT Line; feature table data (>=0)

# File 'lib/bio/db/embl/embl.rb', line 337

# Returns the contents of the feature table (FT) lines.
#
# On first call, @orig['FT'] is walked line by line and grouped into one
# array per feature: [feature_key, position, qualifier, ...]. Each array
# is then converted by parse_qualifiers (defined elsewhere) and the
# resulting list, extended with Bio::Features::BackwardCompatibility, is
# cached in @data['FT'].
#
# With a block, each element of the cached list is yielded; without a
# block, the cached list itself is returned.
def ft
  unless @data['FT']
    ary = Array.new
    # true while inside a quoted qualifier value that spans several lines
    in_quote = false
    @orig['FT'].each_line do |line|
      next if line =~ /^FEATURES/

      #head = line[0,20].strip  # feature key (source, CDS, ...)
      # NOTE(review): line[20,60] is nil for a line shorter than 20
      # characters, so chomp would raise -- presumably FT lines are always
      # long enough; confirm.
      body = line[20,60].chomp # feature value (position, /qualifier=)
      if line =~ /^FT {3}(\S+)/
        # a feature key right after the "FT   " tag starts a new feature
        ary.push([ $1, body ]) # [ feature, position, /q="data", ... ]
      elsif body =~ /^ \// and not in_quote
        # a body starting with " /" outside a quote starts a new qualifier
        ary.last.push(body)    # /q="data..., /q=data, /q

        # NOTE(review): this pattern requires a space directly after the
        # opening quote (=" ) -- confirm that is intended.
        if body =~ /=" / and body !~ /"$/
          in_quote = true
        end

      else
        # continuation line: appended to the most recent position/qualifier
        ary.last.last << body # ...data..., ...data..."

        # a trailing double quote closes the current quoted value
        if body =~ /"$/
          in_quote = false
        end
      end
    end

    ary.map! do |subary|
      parse_qualifiers(subary)
    end

    @data['FT'] = ary.extend(Bio::Features::BackwardCompatibility)
  end
  if block_given?
    @data['FT'].each do |feature|
      yield feature
    end
  else
    @data['FT']
  end
end

#id_line(key = nil) ⇒ `Object`

returns contents in the ID line.

Bio::EMBL#id_line -> <ID Hash>

where <ID Hash> is:

{'ENTRY_NAME' => String, 'MOLECULE_TYPE' => String, 'DIVISION' => String,
 'SEQUENCE_LENGTH' => Int, 'SEQUENCE_VERSION' => Int}

ID Line

"ID  ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."

DATA_CLASS = [‘standard’]

MOLECULE_TYPE: DNA RNA XXX

Code ( DIVISION )

EST (ESTs)
PHG (Bacteriophage)
FUN (Fungi)
GSS (Genome survey)
HTC (High Throughput cDNAs) 
HTG (HTGs)
HUM (Human)
INV (Invertebrates)
ORG (Organelles)
MAM (Other Mammals)
VRT (Other Vertebrates)
PLN (Plants)
PRO (Prokaryotes)
ROD (Rodents)
SYN (Synthetic)
STS (STSs)
UNC (Unclassified)
VRL (Viruses)

Rel 89- ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.

Primary accession number
Sequence version number
Topology: ‘circular’ or ‘linear’
Molecule type (see note 1 below)
Data class (see section 3.1)
Taxonomic division (see section 3.2)
Sequence length (see note 2 below)

# File 'lib/bio/db/embl/embl.rb', line 89

# Returns the contents of the ID line.
#
# On first call the ID text is split on "; " into fields and parsed into a
# Hash that is cached in @data['ID']. Two layouts are handled:
#
# * when the second field starts with "SV", the fields are read as
#   SEQUENCE_VERSION, TOPOLOGY, MOLECULE_TYPE, DATA_CLASS, DIVISION and
#   SEQUENCE_LENGTH;
# * otherwise the first field holds "ENTRY_NAME DATA_CLASS" and the
#   remaining fields are MOLECULE_TYPE, DIVISION and SEQUENCE_LENGTH.
#
# SEQUENCE_LENGTH is stored as an Integer; the other values are left as
# the field text.
#
# key:: optional Hash key such as 'ENTRY_NAME'
#
# Returns the value for +key+ when given, otherwise the whole Hash.
def id_line(key = nil)
  @data['ID'] ||= begin
    fields = fetch('ID').split(/; +/)
    name, data_class = fields.shift.split(/ +/)
    parsed = { 'ENTRY_NAME' => name, 'DATA_CLASS' => data_class }
    if fields.first =~ /^SV/
      parsed['SEQUENCE_VERSION'] = fields.shift.split(' ').last
      %w[TOPOLOGY MOLECULE_TYPE DATA_CLASS].each do |field_name|
        parsed[field_name] = fields.shift
      end
    else
      parsed['MOLECULE_TYPE'] = fields.shift
    end
    parsed['DIVISION'] = fields.shift
    # the last field looks like "500 BP."; keep only the leading number
    parsed['SEQUENCE_LENGTH'] = fields.shift.strip.split(' ').first.to_i
    parsed
  end

  key ? @data['ID'][key] : @data['ID']
end

#molecule ⇒ `Object` Also known as: molecule_type

returns MOLECULE_TYPE in the ID line.

Bio::EMBL#molecule -> String



125
126
127

# File 'lib/bio/db/embl/embl.rb', line 125

# Returns the 'MOLECULE_TYPE' value parsed from the ID line.
def molecule
  id_line['MOLECULE_TYPE']
end

#os(num = nil) ⇒ `Object`

returns contents in the OS line.

Bio::EMBL#os -> Array of <OS Hash>

where <OS Hash> is:

[{'name'=>'Human', 'os'=>'Homo sapiens'}, 
 {'name'=>'Rat', 'os'=>'Rattus norvegicus'}]

Bio::EMBL#os[0]['name'] => "Human"
Bio::EMBL#os[0] => {'name'=>'Human', 'os'=>'Homo sapiens'}

–

Bio::EMBL#os(0) => “Homo sapiens (Human)”

OS Line; organism species (>=1)

OS   Trifolium repens (white clover)

Typically, OS line shows “Genus species (name)” style:

OS   Genus species (name)

Other examples:

OS   uncultured bacterium
OS   xxxxxx metagenome
OS   Cloning vector xxxxxxxx

Complicated examples:

OS   Poeciliopsis gracilis (Poeciliopsis gracilis (Heckel, 1848))
OS   Etmopterus sp. B Last & Stevens, 1994 (bristled lanternshark)
OS   Galaxias sp. D (Allibone et al., 1996) (Pool Burn galaxias)
OS   Sicydiinae sp. 'Keith et al., 2010'
OS   Acanthopagrus sp. 'Jean & Lee, 2008'
OS   Gaussia princeps (T. Scott, 1894)
OS   Rana sp. 8 Hillis & Wilcox, 2005
OS   Contracaecum rudolphii C D'Amelio et al., 2007
OS   Partula sp. 'Mt. Marau, Tahiti'
OS   Leptocephalus sp. 'type II larva' (Smith, 1989)
OS   Tayloria grandis (D.G.Long) Goffinet & A.J.Shaw, 2002
OS   Non-A, non-B hepatitis virus
OS   Canidae (dog, coyote, wolf, fox)
OS   Salmonella enterica subsp. enterica serovar 4,[5],12:i:-
OS   Yersinia enterocolitica (type O:5,27)
OS   Influenza A virus (A/green-winged teal/OH/72/99(H6N1,4))
OS   Influenza A virus (A/Beijing/352/1989,(highgrowth reassortant NIB26)(H3N2))
OS   Recombinant Hepatitis C virus H77(5'UTR-NS2)/JFH1_V787A,Q1247L

# File 'lib/bio/db/embl/embl.rb', line 266

# Returns the contents of the organism species (OS) line.
#
# On first call the OS text is matched against a "Genus species (name)"
# pattern. When it matches, the organism and the parenthesised name are
# stored separately; otherwise the whole text is stored as the organism
# with a nil name. The one-element Array of Hashes is cached in
# @data['OS'].
#
# num:: optional Integer index into the cached Array
#
# Without +num+, returns the Array of {'name' => ..., 'os' => ...} Hashes.
# With +num+, returns the selected item formatted as "os (name)", or just
# "os" when no name was recognised, or nil when there is no item at that
# index. (Previously the formatted string was built with a broken
# interpolation and then discarded, so the Array was always returned.)
def os(num = nil)
  unless @data['OS']
    os_text = fetch('OS')
    organism =
      if /([A-Z][a-z]* *[\w \:\'\+\-]+\w) *\(([\w ]+)\)\s*\z/ =~ os_text
        { 'name' => $2, 'os' => $1 }
      else
        { 'name' => nil, 'os' => os_text }
      end
    @data['OS'] = [organism]
  end
  return @data['OS'] unless num

  selected = @data['OS'][num]
  return nil unless selected

  # EX. "Trifolium repens (white clover)"
  if selected['name']
    "#{selected['os']} (#{selected['name']})"
  else
    selected['os']
  end
end

#release_created ⇒ `Object`

release number when created



472
473
474

# File 'lib/bio/db/embl/embl.rb', line 472

# Returns the element at index 0 of parse_release_version applied to the
# 'created' DT value.
def release_created
  release_info = parse_release_version(dt('created'))
  release_info[0]
end

#release_modified ⇒ `Object`

release number when last updated



467
468
469

# File 'lib/bio/db/embl/embl.rb', line 467

# Returns the element at index 0 of parse_release_version applied to the
# 'updated' DT value.
def release_modified
  release_info = parse_release_version(dt('updated'))
  release_info[0]
end

#seq ⇒ `Object` Also known as: naseq, ntseq

returns the nucleotide sequence in this entry.

Bio::EMBL#seq -> Bio::Sequence::NA

@orig as sequence bb Line; (blanks) sequence data (>=1)



446
447
448

# File 'lib/bio/db/embl/embl.rb', line 446

# Returns a Bio::Sequence::NA built from the sequence data lines (fetched
# with an empty tag) after removing every space and every digit.
def seq
  raw_lines = fetch('')
  residues = raw_lines.delete(' 0-9')
  Bio::Sequence::NA.new(residues)
end

#sequence_length ⇒ `Object` Also known as: seqlen

returns SEQUENCE_LENGTH in the ID line.

Bio::EMBL#sequence_length -> Integer



146
147
148

# File 'lib/bio/db/embl/embl.rb', line 146

# Returns the 'SEQUENCE_LENGTH' value parsed from the ID line.
def sequence_length
  id_line['SEQUENCE_LENGTH']
end

#species ⇒ `Object`

species



519
520
521

# File 'lib/bio/db/embl/embl.rb', line 519

# Returns the result of fetch('OS'), i.e. the organism species text.
def species
  organism = fetch('OS')
  organism
end

#sq(base = nil) ⇒ `Object`

returns sequence header information in the sequence header (SQ) line.

Bio::EMBL#sq -> <SQ Hash>

where <SQ Hash> is:

{'ntlen' => Int, 'other' => Int,
 'a' => Int, 'c' => Int, 'g' => Int, 't' => Int}

Bio::EMBL#sq(base) -> <base content in Int>
Bio::EMBL#sq[base] -> <base content in Int>

SQ Line; sequence header (1/entry)

SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;

# File 'lib/bio/db/embl/embl.rb', line 423

# Returns the sequence header (SQ) line information.
#
# On first call the SQ text is matched for the total length in BP and the
# A, C, G, T and "other" counts, which are stored as Integers in a Hash
# cached in @data['SQ']. When the text does not match, every count is 0.
#
# base:: optional key such as 'a' or 'ntlen'; it is downcased before lookup
#
# Returns the Integer for +base+ when given, otherwise the whole Hash.
def sq(base = nil)
  @data['SQ'] ||= begin
    pattern = /(\d+) BP\; (\d+) A; (\d+) C; (\d+) G; (\d+) T; (\d+) other;/
    found = pattern.match(fetch('SQ'))
    counts = found ? found.captures.map { |digits| digits.to_i } : Array.new(6, 0)
    total, a_count, c_count, g_count, t_count, other_count = counts
    { 'ntlen' => total, 'other' => other_count,
      'a' => a_count, 'c' => c_count, 'g' => g_count, 't' => t_count }
  end

  base ? @data['SQ'][base.downcase] : @data['SQ']
end

#sv ⇒ `Object`

returns the version information in the sequence version (SV) line.

Bio::EMBL#sv -> Accession.Version in String
Bio::EMBL#version -> version number in Int

SV Line; sequence version (1/entry)

SV    Accession.Version

# File 'lib/bio/db/embl/embl.rb', line 162

# Returns the sequence version as a String.
#
# The SV field, with its first ";" removed, is returned when it is not
# empty. Otherwise the value is assembled from the ID line as
# "ENTRY_NAME.SEQUENCE_VERSION".
def sv
  version_field = field_fetch('SV').sub(/;/, '')
  return version_field unless version_field.empty?

  id_parts = [id_line['ENTRY_NAME'], id_line['SEQUENCE_VERSION']]
  id_parts.join('.')
end

#to_biosequence ⇒ `Object`

converts the entry to Bio::Sequence object

Arguments
Returns: Bio::Sequence object



531
532
533

# File 'lib/bio/db/embl/embl.rb', line 531

# Converts this entry by passing it to Bio::Sequence.adapter together with
# the Bio::Sequence::Adapter::EMBL module, and returns the result.
def to_biosequence
  adapter_module = Bio::Sequence::Adapter::EMBL
  Bio::Sequence.adapter(self, adapter_module)
end

#topology ⇒ `Object`



134
135
136

# File 'lib/bio/db/embl/embl.rb', line 134

# Returns the 'TOPOLOGY' value parsed from the ID line.
def topology
  id_line['TOPOLOGY']
end

#version ⇒ `Object`



169
170
171

# File 'lib/bio/db/embl/embl.rb', line 169

# Returns the sequence version number as an Integer.
#
# The part of sv after the first "." is used when present; otherwise the
# 'SEQUENCE_VERSION' value from the ID line is used.
def version
  _accession, sv_version = sv.split(".")
  (sv_version || id_line['SEQUENCE_VERSION']).to_i
end

Class: Bio::EMBL

Constant Summary

Constants included from Bio::EMBLDB::Common

Instance Method Summary collapse

Methods included from Bio::EMBLDB::Common

Methods inherited from EMBLDB

Methods inherited from DB

Instance Method Details

#cc ⇒ Object Also known as: comment

#data_class ⇒ Object

#date_created ⇒ Object

#date_modified ⇒ Object

#dblinks ⇒ Object

#division ⇒ Object

#dt(key = nil) ⇒ Object

#each_cds ⇒ Object

#each_gene ⇒ Object

#entry ⇒ Object Also known as: entry_name, entry_id

#entry_version ⇒ Object

#fh ⇒ Object

#ft ⇒ Object Also known as: features

#id_line(key = nil) ⇒ Object

#molecule ⇒ Object Also known as: molecule_type

#os(num = nil) ⇒ Object

#release_created ⇒ Object

#release_modified ⇒ Object

#seq ⇒ Object Also known as: naseq, ntseq

#sequence_length ⇒ Object Also known as: seqlen

#species ⇒ Object

#sq(base = nil) ⇒ Object

#sv ⇒ Object

#to_biosequence ⇒ Object

#topology ⇒ Object

#version ⇒ Object