Class: Bioroebe::Sequence

Inherits:
RawSequence show all
Defined in:
lib/bioroebe/sequence/sequence.rb

Constant Summary collapse

SHALL_WE_UPCASE =
#

SHALL_WE_UPCASE

This constant determines whether the given input at hand will be upcased or whether it will not.

Note that the value :do_upcase implies true - so it is equivalent to setting it to true. In my opinion it reads nicer than true or false, so it will be retained as it is.

#
:do_upcase
REMOVE_INVALID_CHARACTERS =
#

REMOVE_INVALID_CHARACTERS

If the following constant is set to true then invalid characters from the given input will be eliminated.

#
true

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from RawSequence

#+, #<<, #[]=, #calculate_levensthein_distance, #chars?, #complement, #composition?, #count, #delete, #delete!, #downcase, #each_char, #empty?, #find_substring_indices, #first_position=, #freeze, #gsub, #gsub!, #include?, #insert_at_this_position, #prepend, #remove_n_characters_from_the_left_side, #reverse, #reverse!, #reverse_complement, #scan, #set_raw_sequence, #shuffle, #split, #start_with?, #strip, #subseq, #to_s, #to_str, #tr!, #upcase!

Constructor Details

#initialize(this_sequence = 'ATCG', &block) ⇒ Sequence

#

initialize

The first argument given to the constructor (.new()) will become the sequence.

Initialization example:

seq = Bioroebe::Sequence.new('ATTGCCG')
#


69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/bioroebe/sequence/sequence.rb', line 69

# ========================================================================= #
# === initialize
#
# Build a new sequence object.
#
# this_sequence is either the raw sequence or a Hash of options. The
# following Hash keys are recognised:
#
#   :file                          -> passed to set_save_file()
#   :desc                          -> passed to set_description()
#   :alphabet, :type, :aminoacid   -> first one present is passed to set_type()
#   :seq, :sequence                -> first one present becomes the sequence
#
# The type is assigned before the sequence, so that set_sequence() is
# called with the type already in place.
#
# An optional block may return :is_DNA / :is_dna or :aminoacid in order
# to set the type after the sequence has been assigned.
# ========================================================================= #
def initialize( # === Bioroebe::Sequence.new()
    this_sequence = 'ATCG',
    &block
  )
  reset
  input = this_sequence
  # ======================================================================= #
  # === Handle Hash as input next
  # ======================================================================= #
  if input.is_a? Hash
    # ===================================================================== #
    # Work on a shallow copy: the keys are consumed via .delete() below
    # and the Hash owned by the caller must not be emptied by us.
    # ===================================================================== #
    options = input.dup
    # === Handle :file
    set_save_file(options.delete(:file)) if options.has_key? :file
    # === Handle :desc
    set_description(options.delete(:desc)) if options.has_key? :desc
    # ===================================================================== #
    # === Handle :alphabet, :type and :aminoacid
    #
    # "alphabet" is treated as synonymous to "type"; only the first key
    # that is present will be used.
    # ===================================================================== #
    if options.has_key? :alphabet
      set_type(options.delete(:alphabet))
    elsif options.has_key? :type
      set_type(options.delete(:type))
    elsif options.has_key? :aminoacid
      set_type(options.delete(:aminoacid))
    end
    # ===================================================================== #
    # === Handle :seq and :sequence
    #
    # Without any of these two keys there is no sequence at all, so nil
    # is used rather than the String representation of the Hash.
    # ===================================================================== #
    input =
      if options.has_key? :seq
        options.delete :seq
      elsif options.has_key? :sequence
        options.delete :sequence
      end
  end
  # ======================================================================= #
  # Next, set the main sequence that is to be used.
  # ======================================================================= #
  set_sequence(input)
  # ======================================================================= #
  # === Handle blocks next
  # ======================================================================= #
  if block_given?
    case yield
    # === :is_DNA
    when :is_DNA,
         :is_dna
      set_DNA_type
    # === :aminoacid
    when :aminoacid
      set_protein_type
    end
  end
end

Class Method Details

.[](i) ⇒ Object

#

Bioroebe::Sequence[]

Invocation example:

sequence = Bioroebe::Sequence['atgggtgggcccc']
#


654
655
656
# File 'lib/bioroebe/sequence/sequence.rb', line 654

# Shortcut constructor: forwards the single argument to .new(), so that
# Bioroebe::Sequence['atg'] can be written instead of .new('atg').
def self.[](i)
  self.new(i)
end

.sequence_from_file(this_file) ⇒ Object

#

Bioroebe::Sequence.sequence_from_file

This method can be used to read in a dataset from a file. The first argument to this method is the path of that file.

Invocation examples:

x = Bioroebe::Sequence.sequence_from_file('/Depot/Temp/Bioroebe/vector_pBR322.fasta')
x = Bioroebe::Sequence.sequence_from_file('/home/x/DATA/PROGRAMMING_LANGUAGES/ruby/src/bioroebe/lib/bioroebe/data/alu_elements.fasta')
#


629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
# File 'lib/bioroebe/sequence/sequence.rb', line 629

# Read a sequence from this_file and return it as a new
# Bioroebe::Sequence.
#
# Lines starting with '#' are dropped. If the first remaining line
# starts with '>' (a FASTA header) it is dropped as well. All remaining
# lines are joined and stored without changing the case.
#
# If the file does not exist a message is passed to e() and the result
# of that call is returned instead.
def self.sequence_from_file(this_file)
  if File.exist? this_file
    _ = ::Bioroebe::Sequence.new
    dataset = File.readlines(this_file).map(&:chomp).reject {|line|
      line.start_with? '#' # Remove ad-hoc comments from such files.
    }
    # An empty or comment-only file leaves dataset empty, in which case
    # .first is nil - hence the safe navigation operator.
    if dataset.first&.start_with?('>') # Chop it off in this case.
      dataset.shift # The first line will be removed, in this case.
    end
    _.set_sequence(dataset.join, :do_not_downcase)
    return _
  else
    e "No file called `#{this_file}` exists."
  end
end

Instance Method Details

#automatic_support_for_nucleotides ⇒ Object

#

automatic_support_for_nucleotides

This adds automatic support for RNA and DNA to this sequence object.

#


519
520
521
522
# File 'lib/bioroebe/sequence/sequence.rb', line 519

# Mix Bioroebe::NucleotideModule into this one object. extend() only
# affects the receiver, so other instances are left untouched.
def automatic_support_for_nucleotides
  # Loaded on demand, right before the module constant is referenced.
  require 'bioroebe/sequence/nucleotide_module/nucleotide_module.rb'
  extend(Bioroebe::NucleotideModule)
end

#description? ⇒ Boolean Also known as: desc?

#

description?

Give us back the description of the sequence object at hand.

#

Returns:

  • (Boolean)


267
268
269
# File 'lib/bioroebe/sequence/sequence.rb', line 267

# Query method: returns whatever is stored under :description in the
# internal Hash (nil if nothing is stored there).
def description?
  stored_description = @internal_hash[:description]
  stored_description
end

#index(i) ⇒ Object

#

index

#


196
197
198
# File 'lib/bioroebe/sequence/sequence.rb', line 196

# Delegate to @sequence.index(): returns the position of the first
# occurrence of i, or nil when there is none.
def index(i)
  haystack = @sequence
  haystack.index(i)
end

#infer_type ⇒ Object Also known as: try_to_infer_the_type

#

infer_type

This method attempts to determine whether the main sequence is DNA, RNA or protein. Right now this is not very sophisticated, so we have to improve this at a later time (April 2023).

#


609
610
611
612
613
614
615
# File 'lib/bioroebe/sequence/sequence.rb', line 609

# Ask Bioroebe.infer_type_from_this_sequence() for the type of the
# current sequence and hand the answer to set_type().
def infer_type
  guessed_type = Bioroebe.infer_type_from_this_sequence(sequence?)
  set_type(guessed_type)
end

#is_a_protein? ⇒ Boolean Also known as: is_protein?

#

is_a_protein?

#

Returns:

  • (Boolean)


598
599
600
# File 'lib/bioroebe/sequence/sequence.rb', line 598

# Predicate: true if the stored type is :protein, false otherwise.
def is_a_protein?
  current_type = @internal_hash[:type]
  current_type == :protein
end

#is_a_protein_now ⇒ Object

#

is_a_protein_now

This will force the given sequence to “become” a protein - or be assumed to be a protein past this point.

#


384
385
386
# File 'lib/bioroebe/sequence/sequence.rb', line 384

# Mark this sequence as a protein by storing :protein as the type.
# No further processing of the sequence happens here.
def is_a_protein_now
  @internal_hash.store(:type, :protein)
end

#is_DNA? ⇒ Boolean Also known as: is_dna?

#

is_DNA?

#

Returns:

  • (Boolean)


391
392
393
# File 'lib/bioroebe/sequence/sequence.rb', line 391

# Predicate: true if the stored type is :dna. A type of nil (the value
# assigned by reset) yields false.
def is_DNA?
  return @internal_hash[:type] == :dna
end

#is_RNA? ⇒ Boolean Also known as: is_rna?

#

is_RNA?

#

Returns:

  • (Boolean)


398
399
400
# File 'lib/bioroebe/sequence/sequence.rb', line 398

# Predicate: true if the stored type is :rna, false otherwise.
def is_RNA?
  stored_type = @internal_hash[:type]
  stored_type == :rna
end

#map(&block) ⇒ Object

#

map

#


203
204
205
# File 'lib/bioroebe/sequence/sequence.rb', line 203

# Call the block once per element of the sequence and return the results
# as an Array.
#
# set_sequence() stores @sequence as a String, and String has no #map,
# so a String is iterated character by character. Anything that already
# responds to #map is delegated to unchanged.
def map(&block)
  return @sequence.map(&block) if @sequence.respond_to?(:map)
  @sequence.to_s.each_char.map(&block)
end

#n_uracil? ⇒ Boolean

#

n_uracil?

Report how many uracil (U) residues can be found in the sequence. Every T is converted to U before counting, so thymine is included in the result. This is more of an ad-hoc method, though.

#

Returns:

  • (Boolean)


277
278
279
# File 'lib/bioroebe/sequence/sequence.rb', line 277

# Return the number of characters that are U or T, ignoring case.
# T is counted as well because the sequence is treated as if it had
# been converted to RNA first. Despite the trailing '?' the result is
# an Integer.
def n_uracil?
  @sequence.to_s.upcase.count('TU')
end

#randomize(i = { 'A'=>1,'C'=>2,'G'=>3,'T'=>4 }) ⇒ Object

#

randomize

Usage example:

x = Bioroebe::Sequence.new; x.randomize
#


586
587
588
589
590
591
592
593
# File 'lib/bioroebe/sequence/sequence.rb', line 586

# Return a random DNA String as long as the current sequence, obtained
# from Bioroebe.random_dna().
#
# i may be a Hash mapping a nucleotide to its weight; it is expanded to
# a String in which every nucleotide is repeated weight-times (the
# default becomes "ACCGGGTTTT"). Any other value is passed on as it is.
# The sequence stored in this object is not modified.
def randomize(
    i = { 'A'=>1,'C'=>2,'G'=>3,'T'=>4 } 
  )
  i = i.map { |nucleotide, weight| (nucleotide * weight).to_s }.join if i.is_a? Hash
  ::Bioroebe.random_dna(size?, i) # => "GGTAGGGGGGGGTAGGGGGG"
end

#remove_invalid_entries_from_the_dna_sequence(i = sequence?) ⇒ Object

#

remove_invalid_entries_from_the_dna_sequence

#


536
537
538
539
540
# File 'lib/bioroebe/sequence/sequence.rb', line 536

# Return a copy of i that only contains characters whose upcased form
# is listed in DNA_NUCLEOTIDES. The case of the characters that are
# kept is preserved; the object itself is not modified.
def remove_invalid_entries_from_the_dna_sequence(i = sequence?)
  i.each_char.select { |nucleotide| DNA_NUCLEOTIDES.include?(nucleotide.upcase) }.join
end

#remove_invalid_entries_from_the_dna_sequence!(i = sequence?) ⇒ Object

#

remove_invalid_entries_from_the_dna_sequence!

#


545
546
547
548
549
550
# File 'lib/bioroebe/sequence/sequence.rb', line 545

# Like the variant without '!', but the filtered String is stored via
# set_sequence() afterwards. Only characters whose upcased form is
# listed in DNA_NUCLEOTIDES survive.
def remove_invalid_entries_from_the_dna_sequence!(i = sequence?)
  valid_characters = i.each_char.select do |nucleotide|
    DNA_NUCLEOTIDES.include?(nucleotide.upcase)
  end
  set_sequence(valid_characters.join)
end

#reset ⇒ Object

#

reset (reset tag)

#


150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/bioroebe/sequence/sequence.rb', line 150

# Bring the object back into its initial state.
def reset
  # ======================================================================= #
  # === @internal_hash
  #
  # :type            -> :dna, :rna or :protein - or nil, which is the
  #                     default
  # :shall_we_upcase -> taken from the constant SHALL_WE_UPCASE
  # :save_file       -> nil at first; set_save_file() fills it in below
  # ======================================================================= #
  @internal_hash = {
    type:            nil,
    shall_we_upcase: SHALL_WE_UPCASE,
    save_file:       nil
  }
  # ======================================================================= #
  # Designate where a FASTA file may be stored, using the default path
  # of set_save_file().
  # ======================================================================= #
  set_save_file
  # ======================================================================= #
  # Initialize a default description next (to nil). Note that no type is
  # set here: in the past set_dna() was called at this point, but this is
  # no longer enabled by default.
  # ======================================================================= #
  set_description
end

#return_string_nucleotides_or_aminoacids(type = type? ) ⇒ Object Also known as: nucleotides_or_aminoacids?

#

return_string_nucleotides_or_aminoacids

This will either return the String “nucleotides” or “aminoacids”.

This functionality may be useful in downstream applications that try to display the correct terminology/word.

#


560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
# File 'lib/bioroebe/sequence/sequence.rb', line 560

# Map a type onto the word used for its building blocks:
#
#   :rna, :dna -> 'nucleotides'
#   :protein   -> 'aminoacids'
#
# Any other value (including nil) yields nil.
def return_string_nucleotides_or_aminoacids(
    type = type?
  )
  {
    rna:     'nucleotides',
    dna:     'nucleotides',
    protein: 'aminoacids'
  }[type]
end

#sanitize_dataset(i = type? ) ⇒ Object Also known as: normalize

#

sanitize_dataset

This will sanitize the dataset, in particular for RNA and DNA.

#


306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/bioroebe/sequence/sequence.rb', line 306

# Sanitize the stored sequence according to the given type (by default
# the current type of this object):
#
#   :protein -> nothing is changed
#   :dna     -> every U becomes T; afterwards invalid characters are
#               removed if REMOVE_INVALID_CHARACTERS is set
#   :rna     -> every T becomes U
#
# Any other type (including nil) leaves the sequence untouched. The
# U/T replacement preserves the case of the character.
def sanitize_dataset(
    i = type?
  )
  case i
  # ======================================================================= #
  # === :protein
  # ======================================================================= #
  when :protein
    # Do nothing in this case.
  # ======================================================================= #
  # === :dna
  # ======================================================================= #
  when :dna
    # ===================================================================== #
    # If we have DNA, all U must become T. Lowercase characters have to be
    # handled as well: a sequence that was not upcased would otherwise
    # lose every 'u' in the removal step below instead of having it
    # converted into 't'.
    # ===================================================================== #
    sequence?.tr!('Uu', 'Tt') if sequence?
    # ===================================================================== #
    # We also need to check for a constant.
    # ===================================================================== #
    if REMOVE_INVALID_CHARACTERS
      @sequence = remove_invalid_entries_from_the_dna_sequence
    end
  # ======================================================================= #
  # === :rna
  #
  # This entry point will replace all 'T' with a 'U'.
  # ======================================================================= #
  when :rna
    # ===================================================================== #
    # If we have RNA, all T must become U - again for both cases.
    # ===================================================================== #
    sequence?.tr!('Tt', 'Uu') if sequence?
  end
end

#sanitize_rna ⇒ Object

#

sanitize_rna

This method will convert all T into U.

#


360
361
362
# File 'lib/bioroebe/sequence/sequence.rb', line 360

# Run sanitize_dataset() with :rna as the type, no matter which type
# is currently stored in this object.
def sanitize_rna
  sanitize_dataset(:rna)
end

#save_sequence_to_this_file(into) ⇒ Object

#

save_sequence_to_this_file

We save to a file but we are silent about this action, unless the directory does not exist.

#


287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/bioroebe/sequence/sequence.rb', line 287

# Write the current sequence into the file at the path `into`, using
# Bioroebe.write_what_into().
#
# Nothing is written if the directory part of `into` is not an existing
# directory; a message is passed to e() in that case.
def save_sequence_to_this_file(into)
  what = sequence?
  # ======================================================================= #
  # === Must check whether the base directory exists
  #
  # File.directory? is used rather than File.exist?, because a regular
  # file at that path is not a location we could write into either.
  # ======================================================================= #
  base_dir = File.dirname(into)
  if File.directory? base_dir
    ::Bioroebe.write_what_into(what, into)
  else
    e "No directory at #{base_dir} exists, thus we can not save "\
      "the DNA sequence into a file."
  end
end

#set_description(i = nil) ⇒ Object Also known as: set_desc, desc=

#

set_description

Set a specific description for the given sequence object at hand.

If it is a DNA sequence then we can “tag” it via a specific name. This may not be hugely necessary, but nonetheless the option is there. Proteins can be named as well, of course.

#


411
412
413
# File 'lib/bioroebe/sequence/sequence.rb', line 411

# Set the description of this sequence (nil by default) and return it.
#
# description?() reads from @internal_hash[:description], so the value
# has to be stored there in order to be readable again. @description is
# still assigned as before, for any code that reads it directly.
def set_description(i = nil)
  # Guarded, as the Hash only exists once reset() has been called.
  @internal_hash[:description] = i if @internal_hash
  @description = i
end

#set_dna ⇒ Object Also known as: set_dna_type, set_DNA_type, is_DNA_now

#

set_dna

#


527
528
529
# File 'lib/bioroebe/sequence/sequence.rb', line 527

# Declare this sequence to be DNA, by passing :dna to set_type().
def set_dna
  set_type :dna
end

#set_protein ⇒ Object Also known as: set_protein_type

#

set_protein

#


498
499
500
# File 'lib/bioroebe/sequence/sequence.rb', line 498

# Declare this sequence to be a protein, by passing :protein to
# set_type().
def set_protein
  set_type :protein
end

#set_rna ⇒ Object Also known as: set_rna_type, convert_to_rna

#

set_rna

Note that one alias name, the one called .convert_to_rna(), is a more explicit variant for “conversion” into RNA. It just changes one variable, though.

#


509
510
511
# File 'lib/bioroebe/sequence/sequence.rb', line 509

# Declare this sequence to be RNA, by passing :rna to set_type().
def set_rna
  set_type :rna
end

#set_save_file(i = "#{Bioroebe.log_dir?}default_sequence.fasta") ⇒ Object

#

set_save_file

Set the path where a FASTA file for this sequence will be saved.

The default will be into a file called “default_sequence.fasta”.

#


372
373
374
375
376
# File 'lib/bioroebe/sequence/sequence.rb', line 372

# Remember the path where a FASTA file may be stored. Without an
# argument the file "default_sequence.fasta" below Bioroebe.log_dir?
# is used. The path is kept under :save_file in the internal Hash.
def set_save_file(
    i = "#{Bioroebe.log_dir?}default_sequence.fasta"
  )
  @internal_hash.store(:save_file, i)
end

#set_sequence(i, upcase_downcase_or_make_no_modification = shall_we_upcase? ) ⇒ Object Also known as: set_string, set_input, set_this_sequence

#

set_sequence

This method sets the main sequence, aka DNA string or RNA string or protein string (aminoacids).

#


441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# File 'lib/bioroebe/sequence/sequence.rb', line 441

# Set the main sequence (DNA, RNA or aminoacids) and sanitize it.
#
# i may be:
#
#   - an Array, which is joined with ' ' and stripped
#   - the path of an existing file, whose content is read
#   - a String of digits only; if the type is DNA this is handed to
#     n_random_dna()
#   - anything else, which is converted via .to_s
#   - nil / false, which yields an empty sequence
#
# The second argument controls the case of the sequence:
#
#   :do_upcase, :default, true              -> upcase
#   :do_downcase                            -> downcase
#   :do_not_downcase, :make_no_modification -> keep as it is
#
# The object passed in by the caller is never modified. The return
# value is whatever sanitize_dataset() returns.
def set_sequence(
    i,
    upcase_downcase_or_make_no_modification = shall_we_upcase?
  )
  if i
    # ===================================================================== #
    # === Handle Arrays next:
    # ===================================================================== #
    i = i.join(' ').strip if i.is_a? Array
    # ===================================================================== #
    # We need a String past this point.
    # ===================================================================== #
    i = i.to_s unless i.is_a? String
    if !i.empty? and File.exist?(i)
      i = File.read(i)
    end
    # ===================================================================== #
    # Handle only numbers given to this method next. This will default
    # to DNA-nucleotides though.
    #
    # \A and \Z anchor the whole String: ^ and $ would also match a
    # single digit-only line within a multi-line input.
    # ===================================================================== #
    if i.match?(/\A\d+\Z/) and is_DNA?
      i = n_random_dna(i)
    end
    # ===================================================================== #
    # The non-destructive variants are used below, so that neither a
    # String owned by the caller is modified nor a frozen String causes
    # an error.
    # ===================================================================== #
    case upcase_downcase_or_make_no_modification
    # ===================================================================== #
    # === :do_not_downcase
    # ===================================================================== #
    when :do_not_downcase,
         :make_no_modification
      # Make no modification in this case.
    # ===================================================================== #
    # === :do_upcase
    #
    # This is also presently the default. true is accepted as well, as
    # SHALL_WE_UPCASE is documented to treat :do_upcase and true alike.
    # ===================================================================== #
    when :do_upcase,
         :default,
         true
      i = i.upcase
    # ===================================================================== #
    # === :do_downcase
    # ===================================================================== #
    when :do_downcase
      i = i.downcase
    end
  end
  @sequence = i.to_s.dup # .dup it to avoid having a frozen or shared String.
  sanitize_dataset
end

#set_type(i = :dna) ⇒ Object Also known as: set_alphabet, set_mode

#

set_type

The type to use. By default, DNA.

#


347
348
349
350
351
352
# File 'lib/bioroebe/sequence/sequence.rb', line 347

# Set the type of this sequence; :dna by default.
#
# A String is downcased and converted into a Symbol, so 'DNA' and :dna
# are the same. nil is stored as it is (it means "no type set").
# Setting the type to :rna also runs sanitize_rna().
def set_type(i = :dna)
  # A downcased copy is used rather than downcase!: the String of the
  # caller must not be changed, and it may be frozen.
  i = i.downcase if i.is_a? String
  i = i.to_sym unless i.nil? || i.is_a?(Symbol)
  @internal_hash[:type] = i # Can be :dna, :rna or :protein (or nil).
  sanitize_rna if i == :rna
end

#shall_we_upcase? ⇒ Boolean

#

shall_we_upcase?

#

Returns:

  • (Boolean)


258
259
260
# File 'lib/bioroebe/sequence/sequence.rb', line 258

# Query method for the :shall_we_upcase setting of the internal Hash.
# reset() fills it with the constant SHALL_WE_UPCASE, so the value may
# be a Symbol such as :do_upcase rather than true or false.
def shall_we_upcase?
  return @internal_hash[:shall_we_upcase]
end

#size? ⇒ Boolean

#

size?

#

Returns:

  • (Boolean)


210
211
212
# File 'lib/bioroebe/sequence/sequence.rb', line 210

# Return how many characters the stored sequence has. Despite the
# trailing '?' the result is an Integer.
def size?
  @sequence.length
end

#to_genbank ⇒ Object

#

to_genbank

Convert into the genbank format.

Usage example:

x = Bioroebe::Sequence.new('aaaatgggggggggggccccgtt'); y = x.to_genbank
#


426
427
428
429
430
431
432
433
# File 'lib/bioroebe/sequence/sequence.rb', line 426

# Return the sequence in the genbank format, as produced by
# Bioroebe::GenbankFlatFileFormatGenerator. The generator is created
# with a block that returns :be_quiet.
def to_genbank
  # The generator is only required if its constant is not known yet.
  generator_is_known = ::Bioroebe.const_defined?(:GenbankFlatFileFormatGenerator)
  require 'bioroebe/genbank/genbank_flat_file_format_generator.rb' unless generator_is_known
  Bioroebe::GenbankFlatFileFormatGenerator.new(string?) { :be_quiet }.string?
end

#to_regexp ⇒ Object Also known as: to_regex, to_re

#

to_regexp

This method can be used to return a matching regexp-object.

#


219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/bioroebe/sequence/sequence.rb', line 219

# Build a case-insensitive Regexp from the sequence.
#
# A, T, C and G stand for themselves; the ambiguity codes B, D, H, K,
# M, N, R, S, V, W and Y are expanded into the matching character
# class. Characters are upcased before the lookup, and any character
# that is not listed below contributes nothing to the Regexp.
def to_regexp
  pattern_for = {
    'A' => 'A',
    'T' => 'T',
    'C' => 'C',
    'G' => 'G',
    'B' => '[TGC]',
    'D' => '[ATG]',
    'H' => '[ATC]',
    'K' => '[TG]',
    'M' => '[AC]',
    'N' => '[ATGC]',
    'R' => '[AG]',
    'S' => '[GC]',
    'V' => '[AGC]',
    'W' => '[AT]',
    'Y' => '[TC]'
  }
  source = @sequence.each_char.map { |nucleotide|
    pattern_for.fetch(nucleotide.upcase, '')
  }.join
  Regexp.new(source, Regexp::IGNORECASE)
end

#type? ⇒ Boolean Also known as: type

#

type?

The type can be :dna, :rna or :protein, or nil as long as no type has been set (reset initializes it to nil).

#

Returns:

  • (Boolean)


189
190
191
# File 'lib/bioroebe/sequence/sequence.rb', line 189

# Query method for the type that is stored in the internal Hash. This
# is a Symbol such as :dna, :rna or :protein - or nil, the value that
# reset() assigns.
def type?
  return @internal_hash[:type]
end