Class: Bioroebe::SplitThisFastaFileIntoChromosomes

Inherits:
CommandlineApplication show all
Defined in:
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/menu.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/misc.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/reset.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/constants.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb

Overview

Bioroebe::SplitThisFastaFileIntoChromosomes

Constant Summary collapse

ALLOWED_CHARACTERS =
#

ALLOWED_CHARACTERS

This constant lists the characters that are considered valid nucleotides, namely A, T, C, G and U.

#
%w(
  A T C G U
)

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Constants inherited from Base

Base::NAMESPACE

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(commandline_arguments = nil, run_already = true) ⇒ SplitThisFastaFileIntoChromosomes

#

initialize

#


34
35
36
37
38
39
40
41
42
43
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 34

# ========================================================================= #
# === initialize
#
# Brings the instance into its default state via reset(), stores the
# given commandline arguments and then invokes run(), unless the second
# argument is false or nil.
# ========================================================================= #
def initialize(commandline_arguments = nil, run_already = true)
  reset
  set_commandline_arguments(commandline_arguments)
  return unless run_already
  run
end

Class Method Details

.[](i = '') ⇒ Object

#

Bioroebe::SplitThisFastaFileIntoChromosomes[]

#


281
282
283
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 281

# ========================================================================= #
# === Bioroebe::SplitThisFastaFileIntoChromosomes[]
#
# Shortcut that forwards its single argument to .new() and returns
# whatever .new() returns.
# ========================================================================= #
def self.[](i = '')
  self.new(i)
end

Instance Method Details

#active_chromosome? ⇒ Boolean

#

active_chromosome?

#

Returns:

  • (Boolean)


88
89
90
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 88

# ========================================================================= #
# === active_chromosome?
#
# Query method: returns the value currently stored in @active_chromosome.
# ========================================================================= #
def active_chromosome?
  return @active_chromosome
end

#append_newline ⇒ Object

#

append_newline

#


145
146
147
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 145

# ========================================================================= #
# === append_newline
#
# Appends the constant N to the file of the currently active chromosome.
# The :default_name symbol is resolved to @active_chromosome by
# append_this_line_into_that_file().
# ========================================================================= #
def append_newline
  use_this_target = :default_name
  append_this_line_into_that_file(N, use_this_target)
end

#append_this_line_into_that_file(line, name_of_the_file = @active_chromosome, additional_options = nil) ⇒ Object

#

append_this_line_into_that_file (append tag)

Append the input-line into the file at hand.

#


154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 154

# ========================================================================= #
# === append_this_line_into_that_file                          (append tag)
#
# Append the input-line into the file at hand.
#
# Arguments:
#
#   line               - the content to append; if an Array is given then
#                        only its first element is used.
#   name_of_the_file   - name of the target file; the symbol :default_name
#                        stands for @active_chromosome. A '.fa' suffix is
#                        added when it is missing.
#   additional_options - :reject_newlines strips surrounding whitespace
#                        (including newlines) before writing; nil writes
#                        the content as-is.
#
# The target path is @prefix_for_the_autogenerated_files directly followed
# by the file name. Returns whatever append_what_into() returns.
# ========================================================================= #
def append_this_line_into_that_file(
    line, # ← This is the content that will be put into the file at hand.
    name_of_the_file   = @active_chromosome,
    additional_options = nil # This can be :reject_newlines or nil.
  )
  name_of_the_file = @active_chromosome if name_of_the_file == :default_name
  line = line.first if line.is_a? Array
  # ======================================================================= #
  # Build a NEW String for the suffixed name rather than using <<. The
  # passed-in object is frequently @active_chromosome itself; mutating it
  # would silently turn "5" into "5.fa" for every other reader, and would
  # raise a FrozenError for frozen Strings.
  # ======================================================================= #
  unless name_of_the_file.end_with? '.fa'
    name_of_the_file = "#{name_of_the_file}.fa"
  end
  what = line.dup
  # ======================================================================= #
  # Keep track of the last line in use. This happens before the optional
  # strip!, so set_last_line() receives the content including any newline.
  # ======================================================================= #
  set_last_line(what)
  what.strip! if additional_options == :reject_newlines
  into = "#{@prefix_for_the_autogenerated_files}#{name_of_the_file}"
  append_what_into(what, into)
end

#append_this_line_into_the_default_chromosome(i) ⇒ Object

#

append_this_line_into_the_default_chromosome

This method exists mainly for readability, as we could otherwise use append_this_line_into_that_file() as-is.

Note that this method is exclusively called when we have lines that only have a FASTA nucleotide sequence right now, such as “ATGCACG”.

#


134
135
136
137
138
139
140
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 134

# ========================================================================= #
# === append_this_line_into_the_default_chromosome
#
# Forwards the given content to append_this_line_into_that_file(), using
# @active_chromosome as the target. When
# @store_the_fasta_body_as_a_single_line is set, :reject_newlines is passed
# along as third argument as well.
# ========================================================================= #
def append_this_line_into_the_default_chromosome(i)
  arguments = [i, @active_chromosome]
  arguments << :reject_newlines if @store_the_fasta_body_as_a_single_line
  append_this_line_into_that_file(*arguments)
end

#determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd)) ⇒ Object

#

determine_the_prefix_for_the_autogenerated_files

This method will try to automatically determine the prefix for the autogenerated files. This can be overruled by the user on the commandline, via:

--prefix=ncrna
#


65
66
67
68
69
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 65

# ========================================================================= #
# === determine_the_prefix_for_the_autogenerated_files
#
# Stores the prefix used for the autogenerated files. Without an explicit
# argument the basename of the directory reported by return_pwd() is
# used. The stored prefix is returned.
# ========================================================================= #
def determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd))
  use_this_prefix = i
  @prefix_for_the_autogenerated_files = use_this_prefix
end

#has_only_nucleotides?(i) ⇒ Boolean

#

has_only_nucleotides?

This method will return true if the input-line has only nucleotides; and false otherwise.

#

Returns:

  • (Boolean)


107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 107

# ========================================================================= #
# === has_only_nucleotides?
#
# Returns true if every character of the input String, after stripping
# leading and trailing whitespace, is listed in ALLOWED_CHARACTERS;
# false otherwise. The comparison is case-sensitive, and an input that is
# empty after stripping yields true.
# ========================================================================= #
def has_only_nucleotides?(i)
  i.strip.each_char.all? { |character|
    ALLOWED_CHARACTERS.include?(character)
  }
end

#last_line? ⇒ Boolean

#

last_line?

#

Returns:

  • (Boolean)


188
189
190
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 188

# ========================================================================= #
# === last_line?
#
# Query method: returns the value currently stored in @last_line.
# ========================================================================= #
def last_line?
  return @last_line
end

#last_line_is_empty? ⇒ Boolean

#

last_line_is_empty?

Here we query whether the last line was empty or whether it was not.

#

Returns:

  • (Boolean)


97
98
99
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 97

# ========================================================================= #
# === last_line_is_empty?
#
# Returns true if the last line that was recorded is empty, and false
# otherwise.
#
# The previous implementation tested for a missing trailing newline, so
# it answered true for any non-empty line without a newline, such as
# "ATGC". A nil value is treated as empty rather than raising.
# ========================================================================= #
def last_line_is_empty?
  last_line?.to_s.empty?
end

#main_file? ⇒ Boolean

#

main_file?

#

Returns:

  • (Boolean)


81
82
83
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 81

# ========================================================================= #
# === main_file?
#
# Query method: returns the value currently stored in @this_file.
# ========================================================================= #
def main_file?
  return @this_file
end

#menu(i = commandline_arguments?) ⇒ Object

#

menu (menu tag)

#


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/menu.rb', line 14

# ========================================================================= #
# === menu                                                       (menu tag)
#
# Evaluates the commandline arguments. An Array is processed entry by
# entry; any other input is matched against the known options below.
# ========================================================================= #
def menu(
    i = commandline_arguments?
  )
  # Arrays are handled recursively, one entry at a time.
  return i.each { |entry| menu(entry) } if i.is_a?(Array)
  case i
  # ======================================================================= #
  # === prefix=ncrna
  #
  # Usage example:
  #
  #   splitthisfasta Mus_musculus.GRCm38.ncrna.fa --prefix=ncrna
  #
  # Up to two leading hyphens are optional; matching is case-insensitive.
  # ======================================================================= #
  when /^-?-?prefix=(.+)$/i
    set_use_this_prefix(Regexp.last_match(1).to_s.dup)
  # ======================================================================= #
  # === help
  #
  # Any input containing the substring "help" shows the help text and
  # then terminates the program.
  # ======================================================================= #
  when /help/
    show_help
    exit
  end
end

#reset ⇒ Object

#

reset (reset tag)

#


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/reset.rb', line 14

# ========================================================================= #
# === reset                                                     (reset tag)
#
# Puts the instance into its default state: calls the parent reset(),
# assigns the default values of all instance variables used by this
# class and finally determines the prefix for the autogenerated files.
# ========================================================================= #
def reset
  super()
  infer_the_namespace
  # ======================================================================= #
  # === verbosity
  #
  # NOTE(review): set_be_quiet presumably turns verbose output off by
  # default - confirm against the parent class.
  # ======================================================================= #
  set_be_quiet
  # ======================================================================= #
  # === @last_line
  #
  # The content that was appended most recently; starts out as an empty,
  # unfrozen String.
  # ======================================================================= #
  @last_line = +''
  # ======================================================================= #
  # === @this_file
  #
  # The file we are working with; nil until one has been designated.
  # ======================================================================= #
  @this_file = nil
  # ======================================================================= #
  # === @use_this_genome
  #
  # Identifier of the genome in use. It is interpolated into the regex
  # that run() applies to the FASTA headers.
  # ======================================================================= #
  @use_this_genome = 'GRCm38'
  # ======================================================================= #
  # === @store_the_fasta_body_as_a_single_line
  #
  # If true then the FASTA body is stored without newlines, that is as
  # one single line. If false then the output may contain entries such
  # as "ATGCG\nGCGC", which are less convenient to process further.
  # ======================================================================= #
  @store_the_fasta_body_as_a_single_line = true
  # ======================================================================= #
  # === @n_unidentified_lines
  #
  # Counts how many unidentified lines were encountered.
  # ======================================================================= #
  @n_unidentified_lines = 0
  # ======================================================================= #
  # Determine the prefix for the autogenerated files last. The default is
  # derived from the current directory, so the user is expected to be in
  # the proper directory, such as ncrna/ or cdna/.
  # ======================================================================= #
  determine_the_prefix_for_the_autogenerated_files
end

#run ⇒ Object

#

run (run tag)

#


205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 205

# ========================================================================= #
# === run                                                         (run tag)
#
# Main entry point. Evaluates the commandline via menu(), then splits the
# main FASTA file into one file per chromosome:
#
#   - Entries whose header lacks the substring 'chromosome:' are ignored.
#   - Entries whose header contains 'chromosome:' but does not match the
#     chromosome regex (for example _PATCH entries) are counted in
#     @n_unidentified_lines, reported to the user and skipped.
#   - For all other entries the header is appended to the file named
#     after the chromosome, followed by the FASTA body.
#
# Returns early, after notifying the user, if no input file was given or
# if the given file does not exist.
# ========================================================================= #
def run
  menu
  # ======================================================================= #
  # See: https://rubular.com/r/AFwJeQdJ5fuut1
  # ======================================================================= #
  use_this_regex = /chromosome:#{@use_this_genome}:([0-9A-Z]+):/
  this_file = main_file?
  # ======================================================================= #
  # File.exist?(nil) raises a TypeError, so the case where no file was
  # designated at all has to be handled before the existence check.
  # ======================================================================= #
  if this_file.nil?
    opnn; erev 'No input file was given. Please pass the path to an '\
               'existing FASTA file.'
    return
  end
  unless File.exist? this_file
    opnn; no_file_exists_at(this_file)
    return
  end
  # ======================================================================= #
  # We first obtain the array of our dataset, already properly split up
  # for us.
  # ======================================================================= #
  array = ::Bioroebe.return_fasta_subsection_of_this_file(this_file, :keep_it_flat)
  # ======================================================================= #
  # We iterate over our data structure - first the FASTA header, then
  # the FASTA body.
  # ======================================================================= #
  array.each {|fasta_header_string, fasta_body_string|
    # ===================================================================== #
    # Only headers containing the substring 'chromosome:' are relevant.
    # ===================================================================== #
    next unless fasta_header_string.include? 'chromosome:'
    match = fasta_header_string[use_this_regex, 1].to_s.dup
    if match.empty?
      # =================================================================== #
      # We found a "chromosome:" substring but no proper chromosome name
      # such as a number or X or Y. For example, the FASTA header may
      # include a substring such as _PATCH, which does not denote any
      # chromosome.
      # =================================================================== #
      @n_unidentified_lines += 1
      cliner
      erev "The line at #{slateblue(fasta_header_string)}#{rev} was not identified."
      erev 'It will be shown regardless, so as to notify you about it.'
      if fasta_header_string.include? '_PATCH'
        erev 'It is most likely a patch-set to the (genomic) '\
             'dataset at hand.'
      end
      next # Neither the header nor the body of such an entry is stored.
    end
    # ===================================================================== #
    # Here we have found a match, so the entry can be stored.
    # ===================================================================== #
    if be_verbose?
      opnerev "The following line belongs to chromosome "\
                 "number `#{sfancy(match)}#{rev}`."
      e crimson(fasta_header_string)
    end
    # ===================================================================== #
    # Terminate the previously written content with a newline. This must
    # happen before the active chromosome is switched, as the newline
    # goes to the file of the chromosome that is active right now.
    # ===================================================================== #
    append_newline if @last_line && !last_line?.empty?
    set_active_chromosome(match)
    append_this_line_into_that_file(fasta_header_string, match)
    # ===================================================================== #
    # We must also store the FASTA body.
    # ===================================================================== #
    append_this_line_into_the_default_chromosome(fasta_body_string)
  }
end

#set_active_chromosome(i) ⇒ Object

#

set_active_chromosome

We will keep track of which chromosome is active, such as “2” or “5” or “X”. This allows us to put the FASTA nucleotide sequence into the proper filename.

#


52
53
54
55
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 52

# ========================================================================= #
# === set_active_chromosome
#
# Stores which chromosome is active, such as "2" or "5" or "X". The
# input is converted via .to_s, so the stored value is always a String.
# ========================================================================= #
def set_active_chromosome(i)
  @active_chromosome = i.to_s
end

#set_commandline_arguments(i = '') ⇒ Object

#

set_commandline_arguments

#


14
15
16
17
18
19
20
21
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/misc.rb', line 14

# ========================================================================= #
# === set_commandline_arguments
#
# Normalizes the input into a flat Array without nil entries, then
# separates entries that point at existing paths from everything else:
#
#   - the first existing path, if any, is stored in @this_file
#   - all remaining non-path entries are stored in @commandline_arguments
#
# @this_file is left untouched when no entry points at an existing path.
# ========================================================================= #
def set_commandline_arguments(i = '')
  entries = [i].flatten.compact
  existing, remaining = entries.partition { |entry| File.exist?(entry) }
  @this_file = existing.first unless existing.empty?
  @commandline_arguments = remaining
end

#set_last_line(i) ⇒ Object

#

set_last_line

#


181
182
183
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 181

# ========================================================================= #
# === set_last_line
#
# Remembers a copy of the given line in @last_line. A copy is stored so
# that later in-place changes to the input do not affect the stored value.
# ========================================================================= #
def set_last_line(i)
  copy_of_the_line = i.dup
  @last_line = copy_of_the_line
end

#set_use_this_prefix(i) ⇒ Object

#

set_use_this_prefix

#


74
75
76
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 74

# ========================================================================= #
# === set_use_this_prefix
#
# Stores the given value as the prefix for the autogenerated files. This
# is what menu() calls for the --prefix= commandline option.
# ========================================================================= #
def set_use_this_prefix(i)
  use_this_prefix = i
  @prefix_for_the_autogenerated_files = use_this_prefix
end

#show_help ⇒ Object

#

show_help (help tag)

#


195
196
197
198
199
200
# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 195

# ========================================================================= #
# === show_help                                                  (help tag)
#
# Outputs the documented commandline options via e(), with a call to e()
# without arguments after the heading and after the option list.
# ========================================================================= #
def show_help
  e('Documented options available:')
  e()
  e('  --prefix=ncrna # This is the prefix for the autogenerated .fa files')
  e()
end