Class: Bioroebe::Sequence

Inherits:
RawSequence show all
Defined in:
lib/bioroebe/sequence/sequence.rb

Constant Summary collapse

SHALL_WE_UPCASE =
#

SHALL_WE_UPCASE

This constant determines whether the given input at hand will be upcased or whether it will not.

Note that the value :do_upcase implies true - so it is equivalent to setting it to true. In my opinion it reads nicer than true or false, so it will be retained as it is.

#
:do_upcase
REMOVE_INVALID_CHARACTERS =
#

REMOVE_INVALID_CHARACTERS

If the following constant is set to true then invalid characters from the given input will be eliminated.

#
true

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from RawSequence

#+, #<<, #[]=, #calculate_levensthein_distance, #chars?, #complement, #composition?, #count, #delete, #delete!, #downcase, #each_char, #empty?, #find_substring_indices, #first_position=, #freeze, #gsub, #gsub!, #include?, #insert_at_this_position, #prepend, #remove_n_characters_from_the_left_side, #reverse, #reverse!, #reverse_complement, #scan, #set_raw_sequence, #shuffle, #split, #start_with?, #strip, #subseq, #to_s, #to_str, #tr!, #upcase!

Constructor Details

#initialize(this_sequence = 'ATCG', &block) ⇒ Sequence

#

initialize

The first argument given to the constructor (.new()) will become the sequence.

Initialization example:

seq = Bioroebe::Sequence.new('ATTGCCG')
#

67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/bioroebe/sequence/sequence.rb', line 67

# Build a new sequence object.
#
# this_sequence - either the sequence itself (anything set_sequence()
#                 accepts) or a Hash of options. The Hash keys handled
#                 below are :file, :desc, one of :alphabet / :type /
#                 :aminoacid (the first one present wins) and :seq or
#                 :sequence (:seq wins).
# block         - optional; if it yields :is_DNA or :is_dna the DNA type
#                 is set, if it yields :aminoacid the protein type is set.
def initialize( # === Bioroebe::Sequence.new()
    this_sequence = 'ATCG',
    &block
  )
  reset
  _ = this_sequence # Keep a copy - shorter to type.
  # ===================================================================== #
  # === Handle Hash as input next
  # ===================================================================== #
  if _.is_a? Hash
    # ===================================================================== #
    # The keys are consumed via .delete() below. Work on a shallow copy so
    # that the Hash owned by the caller is left untouched (and so that a
    # frozen Hash can be passed in as well).
    # ===================================================================== #
    _ = _.dup
    # ===================================================================== #
    # === Handle :file
    # ===================================================================== #
    if _.has_key? :file
      set_save_file(_.delete(:file))
    end
    # ===================================================================== #
    # === Handle :desc
    # ===================================================================== #
    if _.has_key? :desc
      set_description(_.delete(:desc))
    end
    # ===================================================================== #
    # === Handle :alphabet
    #
    # Note that the "alphabet" is treated as synonymous to "type". Note
    # that :type will also be checked.
    # ===================================================================== #
    if _.has_key? :alphabet
      set_type(_.delete(:alphabet))
    # ===================================================================== #
    # === Handle :type
    # ===================================================================== #
    elsif _.has_key? :type
      set_type(_.delete(:type))
    # ===================================================================== #
    # === Handle :aminoacid
    # ===================================================================== #
    elsif _.has_key? :aminoacid
      set_type(_.delete(:aminoacid))
    end
    # ===================================================================== #
    # === Handle :seq
    # ===================================================================== #
    if _.has_key? :seq
      _ = _.delete :seq
    # ===================================================================== #
    # === Handle :sequence
    # ===================================================================== #
    elsif _.has_key? :sequence
      _ = _.delete :sequence
    end
    # NOTE(review): if neither :seq nor :sequence was given, the remaining
    # Hash itself is handed to set_sequence() below - confirm whether this
    # is intended.
  end
  # ======================================================================= #
  # Next, set the main sequence that is to be used.
  # ======================================================================= #
  set_sequence(_)
  # ======================================================================= #
  # === Handle blocks next
  # ======================================================================= #
  if block_given?
    yielded = yield
    case yielded
    # ===================================================================== #
    # === :is_DNA
    # ===================================================================== #
    when :is_DNA,
         :is_dna
      set_DNA_type
    # ===================================================================== #
    # === :aminoacid
    # ===================================================================== #
    when :aminoacid
      set_protein_type
    end
  end
end

Class Method Details

.[](i) ⇒ Object

#

Bioroebe::Sequence[]

Invocation example:

sequence = Bioroebe::Sequence['atgggtgggcccc']
#

643
644
645
# File 'lib/bioroebe/sequence/sequence.rb', line 643

# Shorthand constructor: hands the single argument straight to new().
def self.[](i)
  self.new(i)
end

.sequence_from_file(this_file) ⇒ Object

#

Bioroebe::Sequence.sequence_from_file

This method reads a sequence dataset from a file. The first argument to this method is the path to that file.

Invocation examples:

x = Bioroebe::Sequence.sequence_from_file('/Depot/Temp/Bioroebe/vector_pBR322.fasta')
x = Bioroebe::Sequence.sequence_from_file('/home/x/DATA/PROGRAMMING_LANGUAGES/ruby/src/bioroebe/lib/bioroebe/data/alu_elements.fasta')
#

618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
# File 'lib/bioroebe/sequence/sequence.rb', line 618

# Read a sequence from the file at +this_file+ and return it wrapped in a
# new Bioroebe::Sequence.
#
# Lines starting with '#' are dropped. If the first remaining line starts
# with '>' it is dropped as well; the remaining lines are joined and
# passed to set_sequence() with :do_not_downcase.
#
# An empty file, or a file holding only comment lines, yields an empty
# sequence instead of raising.
#
# If the file does not exist a message is emitted via e() and the result
# of that call is returned.
def self.sequence_from_file(this_file)
  if File.exist? this_file
    _ = ::Bioroebe::Sequence.new
    dataset = File.readlines(this_file).map(&:chomp).reject {|line|
      line.start_with? '#' # Remove ad-hoc comments from such files.
    }
    # dataset.first is nil when nothing is left, so guard before asking
    # whether it is a header line.
    first_line = dataset.first
    if first_line && first_line.start_with?('>') # Chop it off in this case.
      dataset.shift # The first line will be removed, in this case.
    end
    sequence = dataset.join
    _.set_sequence(sequence, :do_not_downcase)
    return _
  else
    e "No file called `#{this_file}` exists."
  end
end

Instance Method Details

#automatic_support_for_nucleotides ⇒ Object

#

automatic_support_for_nucleotides

This adds automatic support for RNA and DNA to this sequence object.

#

556
557
558
559
# File 'lib/bioroebe/sequence/sequence.rb', line 556

# Load the nucleotide module on demand and extend this one object with it.
def automatic_support_for_nucleotides
  require 'bioroebe/sequence/nucleotide_module/nucleotide_module.rb'
  nucleotide_support = Bioroebe::NucleotideModule
  extend(nucleotide_support)
end

#description? ⇒ Object Also known as: desc?

#

description?

Give us back the description of the sequence object at hand.

#

Returns:

  • (Object) — the value stored under :description in the internal hash, or nil if there is none

265
266
267
# File 'lib/bioroebe/sequence/sequence.rb', line 265

# Return whatever is stored under :description in the internal hash.
def description?
  settings = @internal_hash
  settings[:description]
end

#index(i) ⇒ Object

#

index

#

194
195
196
# File 'lib/bioroebe/sequence/sequence.rb', line 194

# Delegate to the index() method of the underlying sequence.
def index(i)
  haystack = @sequence
  haystack.index(i)
end

#infer_type ⇒ Object

#

infer_type

#

431
432
433
434
435
436
437
# File 'lib/bioroebe/sequence/sequence.rb', line 431

# Ask Bioroebe.infer_type_from_this_sequence() about the current sequence
# and store its answer via set_type().
def infer_type
  guessed_type = Bioroebe.infer_type_from_this_sequence(sequence?)
  set_type(guessed_type)
end

#is_a_protein? ⇒ Boolean Also known as: is_protein?

#

is_a_protein?

#

Returns:

  • (Boolean)

399
400
401
# File 'lib/bioroebe/sequence/sequence.rb', line 399

# True if the stored type is :protein.
def is_a_protein?
  current_type = @internal_hash[:type]
  current_type == :protein
end

#is_a_protein_now ⇒ Object

#

is_a_protein_now

This will force the given sequence to “become” a protein - or be assumed to be a protein past this point.

#

409
410
411
# File 'lib/bioroebe/sequence/sequence.rb', line 409

# Mark this sequence as a protein by storing :protein as its type. No
# other state is touched here.
def is_a_protein_now
  @internal_hash.store(:type, :protein)
end

#is_DNA? ⇒ Boolean Also known as: is_dna?

#

is_DNA?

#

Returns:

  • (Boolean)

416
417
418
# File 'lib/bioroebe/sequence/sequence.rb', line 416

# True if the stored type is :dna.
def is_DNA?
  current_type = @internal_hash[:type]
  current_type == :dna
end

#is_RNA? ⇒ Boolean Also known as: is_rna?

#

is_RNA?

#

Returns:

  • (Boolean)

423
424
425
# File 'lib/bioroebe/sequence/sequence.rb', line 423

# True if the stored type is :rna.
def is_RNA?
  current_type = @internal_hash[:type]
  current_type == :rna
end

#map(&block) ⇒ Object

#

map

#

201
202
203
# File 'lib/bioroebe/sequence/sequence.rb', line 201

# Map over the individual characters of the sequence and return the
# resulting Array.
#
# The sequence is stored as a String, and String has no map() of its own,
# so the characters are enumerated explicitly. Without a block an
# Enumerator is returned.
def map(&block)
  @sequence.to_s.each_char.map(&block)
end

#n_uracil? ⇒ Integer

#

n_uracil?

Report how many uracil (U) the sequence contains once every T is read as U, i.e. the case-insensitive count of T and U combined. This is more of an ad-hoc method, though.

#

Returns:

  • (Integer) — the number of T and U characters in the sequence

294
295
296
# File 'lib/bioroebe/sequence/sequence.rb', line 294

# Count uracil as if every T were a U: the number of T and U characters in
# the sequence, ignoring case.
def n_uracil?
  letters = @sequence.to_s.upcase
  letters.count('TU')
end

#randomize(i = { 'A'=>1,'C'=>2,'G'=>3,'T'=>4 }) ⇒ Object

#

randomize

Usage example:

x = Bioroebe::Sequence.new; x.randomize
#

597
598
599
600
601
602
603
604
# File 'lib/bioroebe/sequence/sequence.rb', line 597

# Return a random DNA string of the same size as this sequence, obtained
# from Bioroebe.random_dna().
#
# i - either a String that is passed on unchanged, or a Hash whose keys are
#     repeated as often as their values say and then joined into one
#     String (the default Hash becomes "ACCGGGTTTT").
def randomize(
    i = { 'A'=>1,'C'=>2,'G'=>3,'T'=>4 } 
  )
  pool = i
  if pool.is_a? Hash
    repeated = pool.map { |letter, weight| (letter * weight).to_s }
    pool = repeated.join
  end
  ::Bioroebe.random_dna(size?, pool) # => "GGTAGGGGGGGGTAGGGGGG"
end

#remove_invalid_entries_from_the_dna_sequence(i = sequence?) ⇒ Object

#

remove_invalid_entries_from_the_dna_sequence

#

573
574
575
576
577
# File 'lib/bioroebe/sequence/sequence.rb', line 573

# Return a copy of +i+ that keeps only those characters whose upcased form
# is listed in DNA_NUCLEOTIDES. The stored sequence is not modified.
def remove_invalid_entries_from_the_dna_sequence(i = sequence?)
  valid_letters = i.chars.select do |letter|
    DNA_NUCLEOTIDES.include?(letter.upcase)
  end
  return valid_letters.join
end

#remove_invalid_entries_from_the_dna_sequence!(i = sequence?) ⇒ Object

#

remove_invalid_entries_from_the_dna_sequence!

#

582
583
584
585
586
587
# File 'lib/bioroebe/sequence/sequence.rb', line 582

# Filter +i+ down to the characters whose upcased form is listed in
# DNA_NUCLEOTIDES and store the outcome via set_sequence().
def remove_invalid_entries_from_the_dna_sequence!(i = sequence?)
  valid_letters = i.chars.select do |letter|
    DNA_NUCLEOTIDES.include?(letter.upcase)
  end
  set_sequence(valid_letters.join)
end

#reset ⇒ Object

#

reset (reset tag)

#

148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/bioroebe/sequence/sequence.rb', line 148

# Restore the default state of this object.
def reset
  # ======================================================================= #
  # === @internal_hash
  #
  # :type            - :dna, :rna or :protein - or nil, which is the default.
  # :shall_we_upcase - taken from the constant SHALL_WE_UPCASE.
  # :save_file       - nil at first; set_save_file() assigns it right below.
  # ======================================================================= #
  @internal_hash = {
    type:            nil,
    shall_we_upcase: SHALL_WE_UPCASE,
    save_file:       nil
  }
  # ======================================================================= #
  # Designate where a FASTA file may be stored.
  # ======================================================================= #
  set_save_file
  # ======================================================================= #
  # Start out with the default description, which is nil. Note that no
  # set_dna() call happens here: the type stays nil until it is set.
  # ======================================================================= #
  set_description
end

#return_string_nucleotides_or_aminoacids(type = type? ) ⇒ Object Also known as: nucleotides_or_aminoacids?

#

return_string_nucleotides_or_aminoacids

This will either return the String “nucleotides” or “aminoacids”.

This functionality may be useful in downstream applications that try to display the correct terminology/word.

#

277
278
279
280
281
282
283
284
285
286
# File 'lib/bioroebe/sequence/sequence.rb', line 277

# Return the word that fits the given type: 'nucleotides' for :rna and
# :dna, 'aminoacids' for :protein, and nil for anything else.
def return_string_nucleotides_or_aminoacids(
    type = type?
  )
  if %i[rna dna].include?(type)
    'nucleotides'
  elsif type == :protein
    'aminoacids'
  end
end

#sanitize_dataset(i = type? ) ⇒ Object Also known as: normalize

#

sanitize_dataset

This will sanitize the dataset, in particular for RNA and DNA.

#

324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/bioroebe/sequence/sequence.rb', line 324

# Bring the stored sequence in line with the given type.
#
# i - the type to sanitize for; defaults to the current type.
#
# :protein leaves the sequence alone, :dna turns every U into T and (if
# REMOVE_INVALID_CHARACTERS is set) drops invalid characters, :rna turns
# every T into U. Any other value does nothing. If no sequence has been
# set yet, nothing happens either.
def sanitize_dataset(
    i = type?
  )
  case i
  # ======================================================================= #
  # === :protein
  # ======================================================================= #
  when :protein
    # Do nothing in this case.
  # ======================================================================= #
  # === :dna
  # ======================================================================= #
  when :dna
    # ===================================================================== #
    # If we have DNA, all U must become T. Lowercase letters are covered
    # too, as the sequence may have been stored without upcasing.
    # ===================================================================== #
    sequence?.tr!('Uu','Tt') if sequence?
    # ===================================================================== #
    # We also need to check for a constant. The filter calls .chars on the
    # sequence, so it is skipped while no sequence is set.
    # ===================================================================== #
    if REMOVE_INVALID_CHARACTERS && sequence?
      @sequence = remove_invalid_entries_from_the_dna_sequence
    end
  # ======================================================================= #
  # === :rna
  #
  # This entry point will replace all 'T' with a 'U'.
  # ======================================================================= #
  when :rna
    # ===================================================================== #
    # If we have RNA, all T must become U (in either case).
    # ===================================================================== #
    sequence?.tr!('Tt','Uu') if sequence?
  end
end

#sanitize_rna ⇒ Object

#

sanitize_rna

This method will convert all T into U.

#

378
379
380
# File 'lib/bioroebe/sequence/sequence.rb', line 378

# Run sanitize_dataset() in its :rna mode.
def sanitize_rna
  sanitize_dataset(:rna)
end

#save_sequence_to_this_file(into) ⇒ Object

#

save_sequence_to_this_file

We save to a file but we are silent about this action, unless the directory does not exist.

#

305
306
307
308
309
310
311
312
313
314
315
316
317
# File 'lib/bioroebe/sequence/sequence.rb', line 305

# Write the current sequence into the file +into+ via
# Bioroebe.write_what_into(). If the directory part of +into+ does not
# exist, a message is emitted via e() instead and nothing is written.
def save_sequence_to_this_file(into)
  payload = sequence?
  # ======================================================================= #
  # === The base directory has to exist before anything is written
  # ======================================================================= #
  parent_dir = File.dirname(into)
  unless File.exist?(parent_dir)
    return e("No directory at #{parent_dir} exists, thus we can not save "\
             "the DNA sequence into a file.")
  end
  ::Bioroebe.write_what_into(payload, into)
end

#set_description(i = nil) ⇒ Object Also known as: set_desc, desc=

#

set_description

Set a specific description for the given sequence object at hand.

If it is a DNA sequence then we can “tag” it via a specific name. This may not be hugely necessary, but nonetheless the option is there. Proteins can be named as well, of course.

#

448
449
450
# File 'lib/bioroebe/sequence/sequence.rb', line 448

# Store a description for this sequence object (nil clears it).
#
# The value is written to @internal_hash[:description], which is the place
# description?() reads from. @description is still assigned as well, for
# the benefit of code that reads that variable directly.
def set_description(i = nil)
  @internal_hash ||= {} # Allow calls that happen before reset().
  @internal_hash[:description] = i
  @description = i
end

#set_dna ⇒ Object Also known as: set_dna_type, set_DNA_type, is_DNA_now

#

set_dna

#

564
565
566
# File 'lib/bioroebe/sequence/sequence.rb', line 564

# Switch the type of this sequence to :dna via set_type().
def set_dna
  set_type :dna
end

#set_protein ⇒ Object Also known as: set_protein_type

#

set_protein

#

535
536
537
# File 'lib/bioroebe/sequence/sequence.rb', line 535

# Switch the type of this sequence to :protein via set_type().
def set_protein
  set_type :protein
end

#set_rna ⇒ Object Also known as: set_rna_type, convert_to_rna

#

set_rna

Note that one alias name, the one called .convert_to_rna(), is a more explicit variant for “conversion” into RNA. It just changes one variable, though.

#

546
547
548
# File 'lib/bioroebe/sequence/sequence.rb', line 546

# Switch the type of this sequence to :rna via set_type().
def set_rna
  set_type :rna
end

#set_save_file(i = "#{Bioroebe.log_dir?}default_sequence.fasta") ⇒ Object

#

set_save_file

Set the path where a FASTA file for this sequence may be saved.

The default will be into a file called “default_sequence.fasta”.

#

390
391
392
393
394
# File 'lib/bioroebe/sequence/sequence.rb', line 390

# Remember under :save_file where a FASTA file may be stored. Without an
# argument the file "default_sequence.fasta" below Bioroebe.log_dir? is
# used.
def set_save_file(
    i = "#{Bioroebe.log_dir?}default_sequence.fasta"
  )
  @internal_hash.store(:save_file, i)
end

#set_sequence(i, upcase_downcase_or_make_no_modification = shall_we_upcase? ) ⇒ Object Also known as: set_string, set_input, set_this_sequence

#

set_sequence

This method sets the main sequence, aka DNA string or RNA string or protein string (aminoacids).

#

478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# File 'lib/bioroebe/sequence/sequence.rb', line 478

# Set the main sequence (DNA, RNA or aminoacids) of this object and then
# run sanitize_dataset() on it.
#
# i - the input. An Array is joined with ' ' and stripped; anything that is
#     not a String is converted via .to_s; a String naming an existing
#     regular file is replaced by the content of that file; a String made
#     up of digits only is replaced by n_random_dna(i) if the type is DNA.
#     nil or false yields an empty sequence.
# upcase_downcase_or_make_no_modification - :do_upcase / :default upcases,
#     :do_downcase downcases, :do_not_downcase / :make_no_modification (or
#     any other value) leaves the case alone.
#
# The object passed in by the caller is never modified.
def set_sequence(
    i,
    upcase_downcase_or_make_no_modification = shall_we_upcase?
  )
  if i
    # ===================================================================== #
    # === Handle Arrays next:
    # ===================================================================== #
    if i.is_a? Array
      i = i.join(' ').strip
    end
    # ===================================================================== #
    # We need a String past this point.
    # ===================================================================== #
    i = i.to_s unless i.is_a? String
    # ===================================================================== #
    # Treat the input as a path only if it names a regular file. A plain
    # existence check would also accept directories such as '.', which
    # can not be read.
    # ===================================================================== #
    if !i.empty? && File.file?(i)
      i = File.read(i)
    end
    # ===================================================================== #
    # Handle only numbers given to this method next. This will default
    # to DNA-nucleotides though. The whole input has to consist of digits;
    # a single all-digit line within a longer input does not count.
    # ===================================================================== #
    if i =~ /\A\d+\z/ && is_DNA?
      i = n_random_dna(i)
    end
    case upcase_downcase_or_make_no_modification
    # ===================================================================== #
    # === :do_not_downcase
    # ===================================================================== #
    when :do_not_downcase,
         :make_no_modification
      # Make no modification in this case.
    # ===================================================================== #
    # === :do_upcase
    #
    # This is also presently the default. The non-bang variant is used so
    # that a String owned by the caller is not changed in place.
    # ===================================================================== #
    when :do_upcase,
         :default
      i = i.upcase
    # ===================================================================== #
    # === :do_downcase
    # ===================================================================== #
    when :do_downcase
      i = i.downcase
    end
  end
  @sequence = i.to_s.dup # .dup it to avoid having a frozen String.
  sanitize_dataset
end

#set_type(i = :dna) ⇒ Object Also known as: set_alphabet, set_mode

#

set_type

The type to use. By default, DNA.

#

365
366
367
368
369
370
# File 'lib/bioroebe/sequence/sequence.rb', line 365

# Set the type of this sequence. By default, DNA.
#
# i - a Symbol, stored as it is; a String, stored as its downcased Symbol
#     (the String of the caller is not modified, so frozen Strings are
#     fine); or nil, which clears the type. Anything else is converted
#     via .to_sym.
#
# Setting the type to :rna also runs sanitize_rna().
def set_type(i = :dna)
  unless i.nil?
    i = i.downcase if i.is_a? String
    i = i.to_sym unless i.is_a? Symbol
  end
  @internal_hash[:type] = i # Can be :dna, :rna or :protein (or nil).
  sanitize_rna if i == :rna
end

#shall_we_upcase? ⇒ Object

#

shall_we_upcase?

#

Returns:

  • (Object) — the stored upcase setting; reset initialises it from SHALL_WE_UPCASE (:do_upcase)

256
257
258
# File 'lib/bioroebe/sequence/sequence.rb', line 256

# Return the upcase setting stored under :shall_we_upcase.
def shall_we_upcase?
  return @internal_hash[:shall_we_upcase]
end

#size? ⇒ Integer

#

size?

#

Returns:

  • (Integer) — the size of the stored sequence

208
209
210
# File 'lib/bioroebe/sequence/sequence.rb', line 208

# Return the number of characters in the stored sequence.
def size?
  @sequence.length
end

#to_genbank ⇒ Object

#

to_genbank

Convert into the genbank format.

Usage example:

x = Bioroebe::Sequence.new('aaaatgggggggggggccccgtt'); y = x.to_genbank
#

463
464
465
466
467
468
469
470
# File 'lib/bioroebe/sequence/sequence.rb', line 463

# Convert into the genbank format: the result of string?() is handed to
# Bioroebe::GenbankFlatFileFormatGenerator (its block yields :be_quiet)
# and that generator's string?() is returned. The generator file is
# required on demand if the constant is not known yet.
def to_genbank
  generator_known = ::Bioroebe.const_defined?(:GenbankFlatFileFormatGenerator)
  require 'bioroebe/genbank/genbank_flat_file_format_generator.rb' unless generator_known
  generator = Bioroebe::GenbankFlatFileFormatGenerator.new(string?) { :be_quiet }
  generator.string?
end

#to_regexp ⇒ Object Also known as: to_regex, to_re

#

to_regexp

This method can be used to return a matching regexp-object.

#

217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/bioroebe/sequence/sequence.rb', line 217

# Return a case-insensitive Regexp that matches this sequence.
#
# A, T, C and G stand for themselves, the ambiguity letters listed below
# expand into a character class, and every other character is left out.
def to_regexp
  # Upcased letter => regex fragment.
  fragments = {
    'A' => 'A',
    'T' => 'T',
    'C' => 'C',
    'G' => 'G',
    'B' => '[TGC]',
    'D' => '[ATG]',
    'H' => '[ATC]',
    'K' => '[TG]',
    'M' => '[AC]',
    'N' => '[ATGC]',
    'R' => '[AG]',
    'S' => '[GC]',
    'V' => '[AGC]',
    'W' => '[AT]',
    'Y' => '[TC]'
  }
  pattern = ''.dup
  @sequence.chars.each do |letter|
    # Upcase each character on its own before the lookup.
    fragment = fragments[letter.upcase]
    pattern << fragment if fragment
  end
  Regexp.new(pattern, Regexp::IGNORECASE)
end

#type? ⇒ Symbol? Also known as: type

#

type?

The type can be :dna, :rna or :protein, or nil when no type has been set (reset initialises it to nil).

#

Returns:

  • (Symbol, nil) — the stored type

187
188
189
# File 'lib/bioroebe/sequence/sequence.rb', line 187

# Return the stored type: :dna, :rna, :protein - or nil if none is set.
def type?
  current_type = @internal_hash[:type]
  current_type
end