Class: Bio::PROSITE

Inherits:
EMBLDB show all
Defined in:
lib/bio/db/prosite.rb

Constant Summary collapse

DELIMITER =

Delimiter

"\n//\n"
RS =

Delimiter

DELIMITER
TAGSIZE =

Bio::DB API

5

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from DB

#exists?, #fetch, #get, open, #tags

Constructor Details

#initialize(entry) ⇒ PROSITE

Returns a new instance of PROSITE


26
27
28
# File 'lib/bio/db/prosite.rb', line 26

# Creates a PROSITE entry object.
#
# The work is delegated to the superclass constructor, which receives the
# entry together with this class's TAGSIZE constant.
#
# entry:: passed through unchanged to +super+.
def initialize(entry)
  super(entry, TAGSIZE)
end

Class Method Details

.pa2re(pattern) ⇒ Object

prosite pattern to regular expression

prosite/prosuser.txt:

The PA (PAttern) lines contains the definition of a PROSITE pattern. The patterns are described using the following conventions:

0) The standard IUPAC one-letter codes for the amino acids are used. 0) Ambiguities are indicated by listing the acceptable amino acids for a

given position, between square parentheses `[ ]'. For example: [ALT]
stands for Ala or Leu or Thr.

1) A period ends the pattern. 2) When a pattern is restricted to either the N- or C-terminal of a

sequence, that pattern either starts with a `<' symbol or respectively
ends with a `>' symbol.

3) Ambiguities are also indicated by listing between a pair of curly

brackets `{ }' the amino acids that are not accepted at a given
position. For example: {AM} stands for any amino acid except Ala and
Met.

4) Repetition of an element of the pattern can be indicated by following

that element with a numerical value or a numerical range between
parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
x-x or x-x-x or x-x-x-x.

5) The symbol `x' is used for a position where any amino acid is accepted. 6) Each element in a pattern is separated from its neighbor by a `-'.

Examples:

PA [AC]-x-V-x(4)-{ED}.

This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp}

PA <A-x-[ST](2)-x(0,1)-V.

This pattern, which must be in the N-terminal of the sequence (`<'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val


467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
# File 'lib/bio/db/prosite.rb', line 467

# Converts a PROSITE pattern (the content of a PA line) into a Regexp.
#
# The numbered steps follow the conventions listed in prosite/prosuser.txt.
# The conversion is performed on a copy, so the string passed in is never
# modified; frozen strings and cached field values are therefore safe to
# pass.
#
# pattern:: String holding the PROSITE pattern, e.g. "[AC]-x-V-x(4)-{ED}."
#
# Returns a case-insensitive Regexp.
def self.pa2re(pattern)
  source = pattern.
    gsub(/\s/, '').                  # remove white spaces
    sub(/\.$/, '').                  # (1) remove trailing '.'
    sub(/^</, '^').                  # (2) restricted to the N-terminal : `<'
    sub(/>$/, '$').                  # (2) restricted to the C-terminal : `>'
    gsub(/\{(\w+)\}/, '[^\1]').      # (3) not accepted at a given position : '{}'
    gsub(/\(([\d,]+)\)/, '{\1}').    # (4) repetition of an element : (n), (n,m)
    tr('x', '.').                    # (5) any amino acid is accepted : 'x'
    delete('-')                      # (6) each element is separated by a '-'
  Regexp.new(source, Regexp::IGNORECASE)
end

Instance Method Details

#acObject Also known as: entry_id

AC Accession number (1 per entry)

AC   PSnnnnn;

Returns


57
58
59
60
61
62
# File 'lib/bio/db/prosite.rb', line 57

# AC line: accession number of the entry.
#
# The value is taken from fetch('AC') with one trailing ';' removed and is
# cached in @data['AC'], so later calls return the cached value.
def ac
  @data['AC'] ||= fetch('AC').chomp(';')
end

#ccObject Also known as: comment

CC Comments (>=0 per entry)

CC   /QUALIFIER=data; /QUALIFIER=data; .......

/TAXO-RANGE Taxonomic range. /MAX-REPEAT Maximum known number of repetitions of the pattern in a

single protein.

/SITE Indication of an `interesting' site in the pattern. /SKIP-FLAG Indication of an entry that can be, in some cases, ignored

by a program (because it is too unspecific).

Returns


273
274
275
276
277
278
279
280
281
282
# File 'lib/bio/db/prosite.rb', line 273

# CC line: comments, written as "/QUALIFIER=data;" items.
#
# Every "/KEY=value;" item found in fetch('CC') becomes one entry of a Hash
# (key and value are both Strings; a repeated key keeps its last value).
# The Hash is cached in @data['CC'] and returned.
def cc
  @data['CC'] ||= begin
    pairs = fetch('CC').scan(%r{/(\S+)=([^;]+);})
    pairs.each_with_object({}) { |(qualifier, value), table| table[qualifier] = value }
  end
end

#deObject Also known as: definition

DE Short description (1 per entry)

DE   Description.

Returns


84
85
86
# File 'lib/bio/db/prosite.rb', line 84

# DE line: short description of the entry.
#
# Returns the result of field_fetch('DE'); field_fetch is defined outside
# this class listing.
def de
  field_fetch('DE')
end

#divisionObject

Returns


44
45
46
47
48
49
# File 'lib/bio/db/prosite.rb', line 44

# Entry type taken from the ID line.
#
# @data['TYPE'] is filled in by #name, so #name is invoked first when the
# type has not been stored yet. Returns @data['TYPE'].
def division
  name unless @data['TYPE']
  @data['TYPE']
end

#drObject Also known as: sp_xref

DR Cross-references to SWISS-PROT (>=0 per entry)

DR   AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C;
  • `AC_NB' is the SWISS-PROT primary accession number of the entry to which reference is being made.

  • `ENTRY_NAME' is the SWISS-PROT entry name.

  • `C' is a one character flag that can be one of the following:

T For a true positive. N For a false negative; a sequence which belongs to the set under

consideration, but which has not been picked up by the pattern or
profile.

P For a `potential' hit; a sequence that belongs to the set under

consideration, but which was not picked up because the region(s) that
are used as a 'fingerprint' (pattern or profile) is not yet available
in the data bank (partial sequence).

? For an unknown; a sequence which possibly could belong to the set under

consideration.

F For a false positive; a sequence which does not belong to the set in

consideration.

Returns


349
350
351
352
353
354
355
356
357
358
359
360
# File 'lib/bio/db/prosite.rb', line 349

# DR line: cross-references to SWISS-PROT.
#
# Each "AC_NB, ENTRY_NAME, C;" item becomes one Hash entry of the form
#   accession => [entry_name, flag]
# where flag is the one-character code from the DR line (T, N, P, ? or F).
# When fetch('DR') returns nil the Hash is empty. The Hash is cached in
# @data['DR'] and returned.
def dr
  unless @data['DR']
    xrefs = {}
    raw = fetch('DR')                 # fetch once; reused for the nil check and the scan
    if raw
      raw.scan(/(\w+)\s*, (\w+)\s*, (.);/) do |accession, entry_name, flag|
        xrefs[accession] = [entry_name, flag]   # SWISS-PROT : accession, entry, true/false
      end
    end
    @data['DR'] = xrefs
  end
  @data['DR']
end

#dtObject Also known as: date

DT Date (1 per entry)

DT   MMM-YYYY (CREATED); MMM-YYYY (DATA UPDATE); MMM-YYYY (INFO UPDATE).

Returns


72
73
74
# File 'lib/bio/db/prosite.rb', line 72

# DT line: creation / update dates of the entry.
#
# Returns the result of field_fetch('DT'); field_fetch is defined outside
# this class listing.
def dt
  field_fetch('DT')
end

#false_negObject Also known as: false_negative_hits

Returns


250
251
252
# File 'lib/bio/db/prosite.rb', line 250

# /FALSE_NEG value of the NR line (number of known missed hits).
#
# Returns statistics['FALSE_NEG'], or nil when the NR data has no such key.
def false_neg
  statistics['FALSE_NEG']
end

#false_posObject

Returns


235
236
237
# File 'lib/bio/db/prosite.rb', line 235

# /FALSE_POS value of the NR line (false hits on unrelated proteins).
#
# Returns statistics['FALSE_POS']. #nr stores values written as "n(m)" as
# a two-element Array [n, m].
def false_pos
  statistics['FALSE_POS']
end

#false_positive_hitsObject

Returns


240
241
242
# File 'lib/bio/db/prosite.rb', line 240

# First element of #false_pos (the hit count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #false_pos is nil or not an Array.
def false_positive_hits
  false_pos.first
end

#false_positive_sequencesObject

Returns


245
246
247
# File 'lib/bio/db/prosite.rb', line 245

# Last element of #false_pos (the sequence count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #false_pos is nil or not an Array.
def false_positive_sequences
  false_pos.last
end

#list_falsenegative(by_name = nil) ⇒ Object

Returns


386
387
388
# File 'lib/bio/db/prosite.rb', line 386

# Lists the SWISS-PROT cross-references flagged as false negatives.
#
# In the DR line a false negative carries the flag 'N' ('F' marks a false
# positive), so 'N' is the flag handed to #list_xref.
#
# by_name:: when true, entry names are returned instead of accession
#           numbers.
def list_falsenegative(by_name = nil)
  list_xref('N', by_name)
end

#list_falsepositive(by_name = nil) ⇒ Object

Returns


391
392
393
# File 'lib/bio/db/prosite.rb', line 391

# Lists the SWISS-PROT cross-references flagged as false positives.
#
# In the DR line a false positive carries the flag 'F' ('P' marks a
# potential hit), so 'F' is the flag handed to #list_xref.
#
# by_name:: when true, entry names are returned instead of accession
#           numbers.
def list_falsepositive(by_name = nil)
  list_xref('F', by_name)
end

#list_potentialhit(by_name = nil) ⇒ Object

Returns


396
397
398
# File 'lib/bio/db/prosite.rb', line 396

# Lists the SWISS-PROT cross-references flagged 'P' (potential hit) in the
# DR line.
#
# by_name:: forwarded to #list_xref; when true, entry names are returned
#           instead of accession numbers.
def list_potentialhit(by_name = nil)
  list_xref('P', by_name)
end

#list_truepositive(by_name = nil) ⇒ Object

Returns


381
382
383
# File 'lib/bio/db/prosite.rb', line 381

# Lists the SWISS-PROT cross-references flagged 'T' (true positive) in the
# DR line.
#
# by_name:: forwarded to #list_xref; when true, entry names are returned
#           instead of accession numbers.
def list_truepositive(by_name = nil)
  list_xref('T', by_name)
end

#list_unknown(by_name = nil) ⇒ Object

Returns


401
402
403
# File 'lib/bio/db/prosite.rb', line 401

# Lists the SWISS-PROT cross-references flagged '?' (unknown) in the DR
# line.
#
# by_name:: forwarded to #list_xref; when true, entry names are returned
#           instead of accession numbers.
def list_unknown(by_name = nil)
  list_xref('?', by_name)
end

#list_xref(flag, by_name = nil) ⇒ Object

Returns


365
366
367
368
369
370
371
372
373
374
375
376
377
378
# File 'lib/bio/db/prosite.rb', line 365

# Collects the SWISS-PROT cross-references whose flag equals +flag+.
#
# sp_xref maps accession => [entry_name, flag].
#
# flag::    one-character DR flag to select.
# by_name:: when true, entry names are collected; otherwise accession
#           numbers.
#
# Returns an Array of Strings (empty when nothing matches).
def list_xref(flag, by_name = nil)
  selected = sp_xref.select { |_accession, info| info[1] == flag }
  selected.map { |accession, info| by_name ? info[0] : accession }
end

#maObject Also known as: profile

MA Matrix/profile (>=0 per entry)

see - ma2re method

Returns


111
112
113
# File 'lib/bio/db/prosite.rb', line 111

# MA line: matrix/profile of the entry.
#
# Returns the result of field_fetch('MA'); field_fetch is defined outside
# this class listing.
def ma
  field_fetch('MA')
end

#ma2re(matrix) ⇒ Object

prosite profile to regular expression

prosite/profile.txt:

Returns

Raises:

  • (NotImplementedError)

497
498
499
# File 'lib/bio/db/prosite.rb', line 497

# Placeholder for converting a PROSITE profile into a regular expression.
#
# matrix:: currently unused.
#
# Always raises NotImplementedError.
def ma2re(matrix)
  raise NotImplementedError
end

#max_repeatObject

Returns


306
307
308
# File 'lib/bio/db/prosite.rb', line 306

# /MAX-REPEAT qualifier of the CC line, converted with to_i.
#
# Returns an Integer; 0 when the qualifier is absent (nil.to_i).
def max_repeat
  comment['MAX-REPEAT'].to_i
end

#nameObject

ID Identification (Begins each entry; 1 per entry)

ID   ENTRY_NAME; ENTRY_TYPE.  (ENTRY_TYPE : PATTERN, MATRIX, RULE)

Returns


36
37
38
39
40
41
# File 'lib/bio/db/prosite.rb', line 36

# ID line: "ENTRY_NAME; ENTRY_TYPE."
#
# On first use the ID line is stripped of its trailing '.' and split on
# "; "; the two parts are stored in @data['ID'] and @data['TYPE'].
# Returns the entry name (@data['ID']).
def name
  return @data['ID'] if @data['ID']

  entry_name, entry_type = fetch('ID').chomp('.').split('; ')
  @data['ID'] = entry_name
  @data['TYPE'] = entry_type
  @data['ID']
end

#nrObject Also known as: statistics

NR Numerical results (>=0 per entry)

- SWISS-PROT scan statistics of true and false positives/negatives

/RELEASE SWISS-PROT release number and total number of sequence

entries in that release.

/TOTAL Total number of hits in SWISS-PROT. /POSITIVE Number of hits on proteins that are known to belong to the

set in consideration.

/UNKNOWN Number of hits on proteins that could possibly belong to

the set in consideration.

/FALSE_POS Number of false hits (on unrelated proteins). /FALSE_NEG Number of known missed hits. /PARTIAL Number of partial sequences which belong to the set in

consideration, but  which  are  not  hit  by the pattern or
profile because they are partial (fragment) sequences.

Returns


150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/bio/db/prosite.rb', line 150

# NR line: numerical results of the SWISS-PROT scan.
#
# Every "/KEY=value;" item of fetch('NR') becomes one Hash entry. The value
# is converted according to its shape:
#   "n(m)"        -> [n, m]         (hits, sequences; both Integers)
#   "rel,count"   -> [rel, count]   (release as String, count as Integer)
#   anything else -> value.to_i
# The Hash is cached in @data['NR'] and returned.
def nr
  @data['NR'] ||= begin
    items = fetch('NR').scan(%r{/(\S+)=([^;]+);})
    items.each_with_object({}) do |(key, raw), stats|
      stats[key] =
        case raw
        when /^(\d+)\((\d+)\)$/
          [$1.to_i, $2.to_i]          # number of hits, number of sequences
        when /([\d\.]+),(\d+)/
          [$1, $2.to_i]               # SWISS-PROT release, number of sequences
        else
          raw.to_i
        end
    end
  end
end

#paObject Also known as: pattern

PA Pattern (>=0 per entry)

see - pa2re method

Returns


96
97
98
99
100
101
# File 'lib/bio/db/prosite.rb', line 96

# PA line: the PROSITE pattern (see pa2re for the conversion to a Regexp).
#
# field_fetch('PA') is called first; if @data['PA'] is still unset after
# that, it is filled from fetch('PA'). All whitespace is then removed from
# the stored string in place, and that string is returned.
def pa
  field_fetch('PA')
  @data['PA'] ||= fetch('PA')
  stored = @data['PA']
  stored.gsub!(/\s+/, '') if stored
  stored
end

#pa2re(pattern) ⇒ Object


483
484
485
# File 'lib/bio/db/prosite.rb', line 483

# Instance-level shortcut for the class method pa2re.
#
# pattern:: forwarded unchanged to self.class.pa2re.
#
# Returns whatever the class method returns.
def pa2re(pattern)
  self.class.pa2re(pattern)
end

#partialObject

Returns


256
257
258
# File 'lib/bio/db/prosite.rb', line 256

# /PARTIAL value of the NR line (partial sequences not hit by the pattern
# or profile).
#
# Returns statistics['PARTIAL'], or nil when the NR data has no such key.
def partial
  statistics['PARTIAL']
end

#pdb_xrefObject

3D Cross-references to PDB (>=0 per entry)

3D   name; [name2;...]

Returns


411
412
413
414
415
416
# File 'lib/bio/db/prosite.rb', line 411

# 3D line: cross-references to PDB, written as "name; [name2;...]".
#
# fetch('3D') is split on ';' followed by any number of spaces; the
# resulting Array is cached in @data['3D'] and returned.
def pdb_xref
  @data['3D'] ||= fetch('3D').split(/; */)
end

#pdoc_xrefObject

DO Pointer to the documentation file (1 per entry)

DO   PDOCnnnnn;

Returns


424
425
426
# File 'lib/bio/db/prosite.rb', line 424

# DO line: pointer to the documentation entry ("PDOCnnnnn;").
#
# The value is fetch('DO') with one trailing ';' removed. It is cached in
# @data['DO'] on first use, like the other field accessors of this class,
# so repeated calls do not fetch the field again.
def pdoc_xref
  @data['DO'] ||= fetch('DO').chomp(';')
end

#positiveObject

Returns


205
206
207
# File 'lib/bio/db/prosite.rb', line 205

# /POSITIVE value of the NR line (hits on proteins known to belong to the
# set in consideration).
#
# Returns statistics['POSITIVE']. #nr stores values written as "n(m)" as
# a two-element Array [n, m].
def positive
  statistics['POSITIVE']
end

#positive_hitsObject

Returns


210
211
212
# File 'lib/bio/db/prosite.rb', line 210

# First element of #positive (the hit count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #positive is nil or not an Array.
def positive_hits
  positive.first
end

#positive_sequencesObject

Returns


215
216
217
# File 'lib/bio/db/prosite.rb', line 215

# Last element of #positive (the sequence count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #positive is nil or not an Array.
def positive_sequences
  positive.last
end

#reObject


487
488
489
# File 'lib/bio/db/prosite.rb', line 487

# Regular expression built from this entry's PA pattern.
#
# #pa returns the string object cached in @data['PA']. A copy is handed to
# the converter so that an in-place conversion cannot rewrite the cached
# pattern; #pa keeps returning the original PROSITE syntax after #re has
# been called.
#
# Returns whatever the class method pa2re returns for the pattern.
def re
  self.class.pa2re(pa.dup)
end

#releaseObject

Returns


175
176
177
# File 'lib/bio/db/prosite.rb', line 175

# /RELEASE value of the NR line.
#
# Returns statistics['RELEASE']. #nr stores values written as
# "release,count" as [release (String), count (Integer)].
def release
  statistics['RELEASE']
end

#ruObject Also known as: rule

RU Rule (>=0 per entry)

RU   Rule_Description.

The rule is described in ordinary English and is free-format.

Returns


125
126
127
# File 'lib/bio/db/prosite.rb', line 125

# RU line: free-format rule description.
#
# Returns the result of field_fetch('RU'); field_fetch is defined outside
# this class listing.
def ru
  field_fetch('RU')
end

#siteObject

Returns


311
312
313
314
315
316
# File 'lib/bio/db/prosite.rb', line 311

# /SITE qualifier of the CC line, split on ','.
#
# Returns [number, description], where number is the first part converted
# with to_i and description is the second part. When the qualifier is
# absent the result is [0, nil].
def site
  position, description = comment['SITE'].split(',') if comment['SITE']
  [position.to_i, description]
end

#skip_flagObject

Returns


319
320
321
322
323
# File 'lib/bio/db/prosite.rb', line 319

# /SKIP-FLAG qualifier of the CC line.
#
# Returns true when the qualifier's value is exactly 'TRUE', otherwise nil.
def skip_flag
  true if comment['SKIP-FLAG'] == 'TRUE'
end

#swissprot_release_numberObject

Returns


180
181
182
# File 'lib/bio/db/prosite.rb', line 180

# First element of #release (the release number, kept as a String by #nr).
#
# NOTE(review): raises NoMethodError if #release is nil or not an Array.
def swissprot_release_number
  release.first
end

#swissprot_release_sequencesObject

Returns


185
186
187
# File 'lib/bio/db/prosite.rb', line 185

# Last element of #release (the number of sequences in that release,
# stored as an Integer by #nr).
#
# NOTE(review): raises NoMethodError if #release is nil or not an Array.
def swissprot_release_sequences
  release.last
end

#taxon_range(expand = nil) ⇒ Object

Returns


287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# File 'lib/bio/db/prosite.rb', line 287

# /TAXO-RANGE qualifier of the CC line.
#
# expand:: when true, the one-letter codes are translated into names.
#
# Without +expand+ the raw qualifier String is returned (nil when absent).
# With +expand+ an Array of names is returned, one per recognised code
# (A, B, E, P, V) in order of appearance; other characters are skipped.
def taxon_range(expand = nil)
  range = comment['TAXO-RANGE']
  return range unless range && expand

  names = {
    'A' => 'archaebacteria',
    'B' => 'bacteriophages',
    'E' => 'eukaryotes',
    'P' => 'prokaryotes',
    'V' => 'eukaryotic viruses'
  }
  range.scan(/./).map { |code| names[code] }.compact
end

#totalObject

Returns


190
191
192
# File 'lib/bio/db/prosite.rb', line 190

# /TOTAL value of the NR line (total number of hits in SWISS-PROT).
#
# Returns statistics['TOTAL']. #nr stores values written as "n(m)" as a
# two-element Array [n, m].
def total
  statistics['TOTAL']
end

#total_hitsObject

Returns


195
196
197
# File 'lib/bio/db/prosite.rb', line 195

# First element of #total (the hit count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #total is nil or not an Array.
def total_hits
  total.first
end

#total_sequencesObject

Returns


200
201
202
# File 'lib/bio/db/prosite.rb', line 200

# Last element of #total (the sequence count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #total is nil or not an Array.
def total_sequences
  total.last
end

#unknownObject

Returns


220
221
222
# File 'lib/bio/db/prosite.rb', line 220

# /UNKNOWN value of the NR line (hits on proteins that could possibly
# belong to the set in consideration).
#
# Returns statistics['UNKNOWN']. #nr stores values written as "n(m)" as a
# two-element Array [n, m].
def unknown
  statistics['UNKNOWN']
end

#unknown_hitsObject

Returns


225
226
227
# File 'lib/bio/db/prosite.rb', line 225

# First element of #unknown (the hit count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #unknown is nil or not an Array.
def unknown_hits
  unknown.first
end

#unknown_sequencesObject

Returns


230
231
232
# File 'lib/bio/db/prosite.rb', line 230

# Last element of #unknown (the sequence count when the value was "n(m)").
#
# NOTE(review): raises NoMethodError if #unknown is nil or not an Array.
def unknown_sequences
  unknown.last
end