Class: Bioroebe::SplitThisFastaFileIntoChromosomes

Inherits:

Object
Base
CommandlineApplication
Bioroebe::SplitThisFastaFileIntoChromosomes

Defined in:: lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/menu.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/misc.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/reset.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/constants.rb,
lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb

Overview

Bioroebe::SplitThisFastaFileIntoChromosomes

Constant Summary collapse

# ========================================================================= #
# === ALLOWED_CHARACTERS
#
# This constant keeps track as to which characters are valid nucleotide
# strings - aka A, T, C, G, U.
#
# The Array is frozen so that the shared list can not be modified by
# accident at runtime.
# ========================================================================= #
ALLOWED_CHARACTERS = %w(
  A T C G U
).freeze

Class Method Summary collapse

.[](i = '') ⇒ Object

# === Bioroebe::SplitThisFastaFileIntoChromosomes[] ========================================================================= #.

Instance Method Summary collapse

#active_chromosome? ⇒ Boolean

# === active_chromosome? ========================================================================= #.
#append_newline ⇒ Object

# === append_newline ========================================================================= #.
#append_this_line_into_that_file(line, name_of_the_file = @active_chromosome, additional_options = nil) ⇒ Object

# === append_this_line_into_that_file (append tag).
#append_this_line_into_the_default_chromosome(i) ⇒ Object

# === append_this_line_into_the_default_chromosome.
#determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd)) ⇒ Object

# === determine_the_prefix_for_the_autogenerated_files.
#has_only_nucleotides?(i) ⇒ Boolean

# === has_only_nucleotides?.
#initialize(commandline_arguments = nil, run_already = true) ⇒ SplitThisFastaFileIntoChromosomes constructor

# === initialize ========================================================================= #.
#last_line? ⇒ Boolean

# === last_line? ========================================================================= #.
#last_line_is_empty? ⇒ Boolean

# === last_line_is_empty?.
#main_file? ⇒ Boolean

# === main_file? ========================================================================= #.
#menu(i = commandline_arguments? ) ⇒ Object

# === menu (menu tag) ========================================================================= #.
#reset ⇒ Object

# === reset (reset tag) ========================================================================= #.
#run ⇒ Object

# === run (run tag) ========================================================================= #.
#set_active_chromosome(i) ⇒ Object

# === set_active_chromosome.
#set_commandline_arguments(i = '') ⇒ Object

# === set_commandline_arguments ========================================================================= #.
#set_last_line(i) ⇒ Object

# === set_last_line ========================================================================= #.
#set_use_this_prefix(i) ⇒ Object

# === set_use_this_prefix ========================================================================= #.
#show_help ⇒ Object

# === show_help (help tag) ========================================================================= #.

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(commandline_arguments = nil, run_already = true) ⇒ `SplitThisFastaFileIntoChromosomes`

#

initialize

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 34

# ========================================================================= #
# === initialize
#
# Sets up a new instance in three steps: the internal state is brought
# to its defaults via reset(), the given commandline arguments are
# registered via set_commandline_arguments(), and finally run() is
# invoked - unless the caller passed a false value as second argument.
#
#   commandline_arguments: handed to set_commandline_arguments() as-is.
#   run_already:           when false/nil, run() is not called here.
# ========================================================================= #
def initialize(commandline_arguments = nil, run_already = true)
  reset
  set_commandline_arguments(commandline_arguments)
  return unless run_already
  run
end

Class Method Details

.[](i = '') ⇒ `Object`

#

Bioroebe::SplitThisFastaFileIntoChromosomes[]

#



281
282
283

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 281

# ========================================================================= #
# === Bioroebe::SplitThisFastaFileIntoChromosomes[]
#
# Bracket-style shortcut for .new(): the argument is forwarded unchanged
# and whatever .new() returns is returned to the caller.
# ========================================================================= #
def self.[](i = '')
  self.new(i)
end

Instance Method Details

#active_chromosome? ⇒ `Boolean`

#

active_chromosome?

#

Returns:

(Boolean)



88
89
90

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 88

# ========================================================================= #
# === active_chromosome?
#
# Query-method for @active_chromosome. Note that the stored value itself
# is returned, not a strict true/false; it is nil when the variable has
# not been assigned yet.
# ========================================================================= #
def active_chromosome?
  @active_chromosome
end

#append_newline ⇒ `Object`

#

append_newline

#



145
146
147

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 145

# ========================================================================= #
# === append_newline
#
# Appends the constant N to the file of the currently active chromosome.
# N is defined outside of this method - presumably a newline character;
# verify against its definition.
# ========================================================================= #
def append_newline
  # :default_name is resolved to @active_chromosome by the callee.
  target = :default_name
  append_this_line_into_that_file(N, target)
end

#append_this_line_into_that_file(line, name_of_the_file = @active_chromosome, additional_options = nil) ⇒ `Object`

#

append_this_line_into_that_file (append tag)

Append the input-line into the file at hand.

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 154

# ========================================================================= #
# === append_this_line_into_that_file (append tag)
#
# Append the input-line into the file at hand.
#
#   line:               the content to append. If an Array is given then
#                       only its first element is used.
#   name_of_the_file:   the chromosome name or file name. The Symbol
#                       :default_name stands for @active_chromosome. A
#                       trailing '.fa' is added if it is missing.
#   additional_options: :reject_newlines strips leading and trailing
#                       whitespace from the content before it is written;
#                       nil leaves the content untouched.
#
# The unmodified content is remembered via set_last_line(). The target
# path is @prefix_for_the_autogenerated_files followed by the file name.
# Returns whatever append_what_into() returns.
# ========================================================================= #
def append_this_line_into_that_file(
    line, # ← This is the content that will be put into the file at hand.
    name_of_the_file   = @active_chromosome,
    additional_options = nil # This can be :reject_newlines or nil.
  )
  case name_of_the_file
  when :default_name
    name_of_the_file = @active_chromosome
  end
  if line.is_a? Array
    line = line.first
  end
  # ======================================================================= #
  # A new String is built here on purpose. Using << would modify the
  # String object of the caller in place - which typically is the very
  # same object as @active_chromosome - and would raise a FrozenError
  # for frozen Strings.
  # ======================================================================= #
  unless name_of_the_file.end_with? '.fa'
    name_of_the_file = "#{name_of_the_file}.fa"
  end
  what = line.dup
  set_last_line(what) # Keep track of the last line in use.
  if additional_options == :reject_newlines
    what.strip!
  end
  into = "#{@prefix_for_the_autogenerated_files}#{name_of_the_file}"
  append_what_into(what, into)
end

#append_this_line_into_the_default_chromosome(i) ⇒ `Object`

#

append_this_line_into_the_default_chromosome

This method exists mainly for readability; append_this_line_into_that_file() could otherwise be used as-is.

Note that this method is only called for lines that consist solely of a FASTA nucleotide sequence, such as “ATGCACG”.

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 134

# ========================================================================= #
# === append_this_line_into_the_default_chromosome
#
# Readability wrapper around append_this_line_into_that_file() that
# always targets @active_chromosome. When
# @store_the_fasta_body_as_a_single_line is set, the option
# :reject_newlines is passed along as well.
# ========================================================================= #
def append_this_line_into_the_default_chromosome(i)
  unless @store_the_fasta_body_as_a_single_line
    return append_this_line_into_that_file(i, @active_chromosome)
  end
  append_this_line_into_that_file(i, @active_chromosome, :reject_newlines)
end

#determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd)) ⇒ `Object`

#

determine_the_prefix_for_the_autogenerated_files

This method will try to automatically determine the prefix for the autogenerated files. This can be overruled by the user on the commandline, via:

--prefix=ncrna

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 65

# ========================================================================= #
# === determine_the_prefix_for_the_autogenerated_files
#
# Stores the prefix that is put in front of the names of the generated
# files. Without an argument the basename of the value returned by
# return_pwd() is used. The user can overrule this on the commandline,
# via:
#
#   --prefix=ncrna
#
# Returns the stored prefix.
# ========================================================================= #
def determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd))
  @prefix_for_the_autogenerated_files = i
end

#has_only_nucleotides?(i) ⇒ `Boolean`

#

has_only_nucleotides?

This method will return true if the input-line has only nucleotides; and false otherwise.

#

Returns:

(Boolean)

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 107

# ========================================================================= #
# === has_only_nucleotides?
#
# Returns true if every character of the given String - after leading
# and trailing whitespace has been removed - is listed in
# ALLOWED_CHARACTERS; false otherwise. The comparison is case-sensitive,
# and an empty (or whitespace-only) String yields true.
# ========================================================================= #
def has_only_nucleotides?(i)
  # ======================================================================= #
  # Each distinct character needs to be looked at only once - we assume
  # that a String has been passed.
  # ======================================================================= #
  distinct_characters = i.strip.chars.uniq
  distinct_characters.all? {|character|
    ALLOWED_CHARACTERS.include?(character)
  }
end

#last_line? ⇒ `Boolean`

#

last_line?

#

Returns:

(Boolean)



188
189
190

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 188

# ========================================================================= #
# === last_line?
#
# Query-method for @last_line. The stored value itself is returned, not
# a strict true/false.
# ========================================================================= #
def last_line?
  @last_line
end

#last_line_is_empty? ⇒ `Boolean`

#

last_line_is_empty?

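Here we query whether the last stored line lacks a trailing newline character. Note that this is not a test for an empty String: a line such as “ATGC” without a trailing newline also yields true.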

#

Returns:

(Boolean)



97
98
99

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 97

# ========================================================================= #
# === last_line_is_empty?
#
# Returns true when the last stored line does NOT end with a newline
# character, and false when it does.
#
# NOTE(review): the method name suggests a check for an empty String,
# but the code tests for a missing trailing "\n" - confirm the intent
# before relying on this method.
# ========================================================================= #
def last_line_is_empty?
  ends_with_newline = last_line?.end_with?("\n")
  !ends_with_newline
end

#main_file? ⇒ `Boolean`

#

main_file?

#

Returns:

(Boolean)



81
82
83

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 81

# ========================================================================= #
# === main_file?
#
# Query-method for @this_file, the file we are working with. The stored
# value itself is returned (nil if no file has been assigned), not a
# strict true/false.
# ========================================================================= #
def main_file?
  @this_file
end

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/menu.rb', line 14

# ========================================================================= #
# === menu (menu tag)
#
# Interprets the commandline arguments. An Array is processed entry by
# entry (the Array itself is then returned); everything else is matched
# against the options documented below. Unknown input is ignored.
# ========================================================================= #
def menu(
    i = commandline_arguments?
  )
  return i.each {|entry| menu(entry) } if i.is_a?(Array)
  case i
  # ======================================================================= #
  # === prefix=ncrna
  #
  # Up to two leading hyphens are optional and the match is
  # case-insensitive. Usage example:
  #
  #   splitthisfasta Mus_musculus.GRCm38.ncrna.fa --prefix=ncrna.
  #
  # ======================================================================= #
  when /^-?-?prefix=(.+)$/i
    set_use_this_prefix(Regexp.last_match(1).to_s.dup)
  # ======================================================================= #
  # === help
  #
  # Any argument containing "help" shows the help text and then exits.
  # ======================================================================= #
  when /help/
    show_help
    exit
  end
end

#reset ⇒ `Object`

#

reset (reset tag)

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/reset.rb', line 14

# ========================================================================= #
# === reset (reset tag)
#
# Brings the instance into its default state: the reset() of the parent
# class runs first, then the instance variables listed below are
# assigned, and finally the default prefix for the generated files is
# determined.
# ========================================================================= #
def reset
  super()
  infer_the_namespace
  # ======================================================================= #
  # === Verbosity
  #
  # set_be_quiet() is defined outside of this method - presumably it
  # turns verbose output off; verify against its definition.
  # ======================================================================= #
  set_be_quiet
  # ======================================================================= #
  # === @this_file
  #
  # The file we are working with. Unknown at this point, hence nil.
  # ======================================================================= #
  @this_file = nil
  # ======================================================================= #
  # === @n_unidentified_lines
  #
  # Counter for the FASTA headers that contained "chromosome:" but could
  # not be assigned to a chromosome.
  # ======================================================================= #
  @n_unidentified_lines = 0
  # ======================================================================= #
  # === @store_the_fasta_body_as_a_single_line
  #
  # If true then the FASTA body will be stored as a single line, that
  # is, without newlines. If false we may end up with entries such as
  # "ATGCG\nGCGC" in the generated files, which are less convenient to
  # process further.
  # ======================================================================= #
  @store_the_fasta_body_as_a_single_line = true
  # ======================================================================= #
  # === @last_line
  #
  # Remembers the line that was appended most recently. It starts out
  # as an empty, modifiable String.
  # ======================================================================= #
  @last_line = ''.dup
  # ======================================================================= #
  # === @use_this_genome
  #
  # The genome identifier. It becomes part of the regex that run() uses
  # to find the chromosome in a FASTA header.
  # ======================================================================= #
  @use_this_genome = 'GRCm38'
  # ======================================================================= #
  # The default prefix for the autogenerated files is derived from the
  # current directory, so the user is expected to be in the proper
  # directory, such as ncrna/ or cdna/.
  # ======================================================================= #
  determine_the_prefix_for_the_autogenerated_files
end

#run ⇒ `Object`

#

run (run tag)

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 205

# ========================================================================= #
# === run (run tag)
#
# The main entry point. First the commandline arguments are evaluated
# via menu(). Then the main file is split up: every FASTA entry whose
# header names a chromosome of @use_this_genome is appended - header
# and body - to the file of that chromosome.
#
# Headers without the substring "chromosome:" are skipped silently.
# Headers that contain "chromosome:" but do not match the regex are
# reported and counted in @n_unidentified_lines.
#
# Returns nil if no usable input file is available; otherwise the value
# of the iteration over the FASTA entries.
# ========================================================================= #
def run
  menu
  # ======================================================================= #
  # See: https://rubular.com/r/AFwJeQdJ5fuut1
  # ======================================================================= #
  use_this_regex = /chromosome:#{@use_this_genome}:([0-9A-Z]+):/
  _ = main_file?
  # ======================================================================= #
  # No existing file was passed on the commandline. File.exist?(nil)
  # would raise a TypeError, so we notify the user and bail out early.
  # ======================================================================= #
  if _.nil?
    opnn; erev 'No input file was given. Please provide the path to an '\
               'existing FASTA file.'
    return
  end
  unless File.exist? _
    opnn; no_file_exists_at(_)
    return
  end
  # ======================================================================= #
  # We first obtain the array of our dataset, already properly split up
  # for us.
  # ======================================================================= #
  array = ::Bioroebe.return_fasta_subsection_of_this_file(_, :keep_it_flat)
  # ======================================================================= #
  # We iterate over our data structure - first the FASTA header, then
  # the FASTA body, both of which are Strings at this point.
  # ======================================================================= #
  array.each {|fasta_header_string, fasta_body_string|
    # ===================================================================== #
    # Check for the line containing the substring 'chromosome:'.
    # ===================================================================== #
    if fasta_header_string.include? 'chromosome:'
      fasta_header_string =~ use_this_regex
      match = $1.to_s.dup # $1 is nil without a match, thus '' here.
      if match.empty?
        # ================================================================= #
        # This clause is used when we have found a line containing a
        # "chromosome:" substring but without a proper "assumed"
        # match, such as a number or X or Y. For example, the FASTA
        # header id may include a substring such as _PATCH, which does
        # not denote any chromosome.
        # ================================================================= #
        @n_unidentified_lines += 1
        cliner
        erev "The line at #{slateblue(fasta_header_string)}#{rev} was not identified."
        erev 'It will be shown regardless, so as to notify you about it.'
        if fasta_header_string.include? '_PATCH'
          erev 'It is most likely a patch-set to the (genomic) '\
               'dataset at hand.'
        end
        next # We will skip when reaching this point.
      else
        # ================================================================= #
        # Here we have found a match, so we can simply store this
        # into the file in that event.
        # ================================================================= #
        if be_verbose?
          opnerev "The following line belongs to chromosome "\
                     "number `#{sfancy(match)}#{rev}`."
          e crimson(fasta_header_string)
        end
        # ================================================================= #
        # Finish the previous entry with a newline, unless nothing has
        # been written so far.
        # ================================================================= #
        if @last_line
          unless last_line?.empty?
            append_newline
          end
        end
        set_active_chromosome(match)
        append_this_line_into_that_file(fasta_header_string, match)
      end
      # =================================================================== #
      # We must also store the FASTA body, which will be done by
      # the following line.
      # =================================================================== #
      append_this_line_into_the_default_chromosome(fasta_body_string)
    end
  }
end

#set_active_chromosome(i) ⇒ `Object`

#

set_active_chromosome

We will keep track of which chromosome is active, such as “2” or “5” or “X”. This allows us to put the FASTA nucleotide sequence into the proper filename.

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 52

# ========================================================================= #
# === set_active_chromosome
#
# We will keep track of which chromosome is active, such as "2" or "5"
# or "X". This allows us to put the FASTA nucleotide sequence into the
# proper filename. The input is converted via .to_s before it is stored;
# the stored String is returned.
# ========================================================================= #
def set_active_chromosome(i)
  @active_chromosome = i.to_s
end

#set_commandline_arguments(i = '') ⇒ `Object`

#

set_commandline_arguments

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/misc.rb', line 14

# ========================================================================= #
# === set_commandline_arguments
#
# Normalizes the input into a flat Array without nil entries. Entries
# that denote an existing path are removed from that Array; the first
# such entry becomes @this_file. If no entry exists on disk then
# @this_file is left untouched. The remaining entries are stored in
# @commandline_arguments and returned.
# ========================================================================= #
def set_commandline_arguments(i = '')
  entries = [i].flatten.compact
  existing_files, other_arguments = entries.partition {|entry|
    File.exist?(entry)
  }
  @this_file = existing_files.first unless existing_files.empty?
  @commandline_arguments = other_arguments
end

#set_last_line(i) ⇒ `Object`

#

set_last_line

#



181
182
183

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 181

# ========================================================================= #
# === set_last_line
#
# Remembers the given line in @last_line. A copy is stored, so that
# later changes to the passed object do not affect the remembered line.
# ========================================================================= #
def set_last_line(i)
  copy_of_the_line = i.dup
  @last_line = copy_of_the_line
end

#set_use_this_prefix(i) ⇒ `Object`

#

set_use_this_prefix

#



74
75
76

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 74

# ========================================================================= #
# === set_use_this_prefix
#
# Overrules the prefix that is put in front of the names of the
# autogenerated files. The given value is stored as-is and returned.
# ========================================================================= #
def set_use_this_prefix(i)
  @prefix_for_the_autogenerated_files = i
end

#show_help ⇒ `Object`

#

show_help (help tag)

#

# File 'lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/split_this_fasta_file_into_chromosomes.rb', line 195

# ========================================================================= #
# === show_help (help tag)
#
# Outputs the documented commandline options via e(). The calls to e()
# without an argument are kept exactly as they are.
# ========================================================================= #
def show_help
  e('Documented options available:')
  e
  e('  --prefix=ncrna # This is the prefix for the autogenerated .fa files')
  e
end

Class: Bioroebe::SplitThisFastaFileIntoChromosomes

Overview

Bioroebe::SplitThisFastaFileIntoChromosomes

Constant Summary collapse

#

ALLOWED_CHARACTERS

#

Constants inherited from CommandlineApplication

Constants included from ColoursForBase

Constants inherited from Base

Class Method Summary collapse

# === Bioroebe::SplitThisFastaFileIntoChromosomes[] ========================================================================= #.

Instance Method Summary collapse

# === active_chromosome? ========================================================================= #.

# === append_newline ========================================================================= #.

# === append_this_line_into_that_file (append tag).

# === append_this_line_into_the_default_chromosome.

# === determine_the_prefix_for_the_autogenerated_files.

# === has_only_nucleotides?.

# === initialize ========================================================================= #.

# === last_line? ========================================================================= #.

# === last_line_is_empty?.

# === main_file? ========================================================================= #.

# === menu (menu tag) ========================================================================= #.

# === reset (reset tag) ========================================================================= #.

# === run (run tag) ========================================================================= #.

# === set_active_chromosome.

# === set_commandline_arguments ========================================================================= #.

# === set_last_line ========================================================================= #.

# === set_use_this_prefix ========================================================================= #.

# === show_help (help tag) ========================================================================= #.

Methods inherited from CommandlineApplication

Methods included from BaseModule

Methods included from CommandlineArguments

Methods included from ColoursForBase

Methods inherited from Base

Methods included from InternalHashModule

Methods included from InferTheNamespaceModule

Constructor Details

#initialize(commandline_arguments = nil, run_already = true) ⇒ SplitThisFastaFileIntoChromosomes

#

initialize

#

Class Method Details

.[](i = '') ⇒ Object

#

Bioroebe::SplitThisFastaFileIntoChromosomes[]

#

Instance Method Details

#active_chromosome? ⇒ Boolean

#

active_chromosome?

#

#append_newline ⇒ Object

#

append_newline

#

#append_this_line_into_that_file(line, name_of_the_file = @active_chromosome, additional_options = nil) ⇒ Object

#

append_this_line_into_that_file (append tag)

#

#append_this_line_into_the_default_chromosome(i) ⇒ Object

#

append_this_line_into_the_default_chromosome

#

#determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd)) ⇒ Object

#

determine_the_prefix_for_the_autogenerated_files

#

#has_only_nucleotides?(i) ⇒ Boolean

#

has_only_nucleotides?

#

#last_line? ⇒ Boolean

#

last_line?

#

#last_line_is_empty? ⇒ Boolean

#

last_line_is_empty?

#initialize(commandline_arguments = nil, run_already = true) ⇒ `SplitThisFastaFileIntoChromosomes`

.[](i = '') ⇒ `Object`

#active_chromosome? ⇒ `Boolean`

#append_newline ⇒ `Object`

#append_this_line_into_that_file(line, name_of_the_file = @active_chromosome, additional_options = nil) ⇒ `Object`

#append_this_line_into_the_default_chromosome(i) ⇒ `Object`

#determine_the_prefix_for_the_autogenerated_files(i = File.basename(return_pwd)) ⇒ `Object`

#has_only_nucleotides?(i) ⇒ `Boolean`

#last_line? ⇒ `Boolean`

#last_line_is_empty? ⇒ `Boolean`

#main_file? ⇒ `Boolean`

#menu(i = commandline_arguments? ) ⇒ `Object`

#reset ⇒ `Object`

#run ⇒ `Object`

#set_active_chromosome(i) ⇒ `Object`

#set_commandline_arguments(i = '') ⇒ `Object`

#set_last_line(i) ⇒ `Object`

#set_use_this_prefix(i) ⇒ `Object`

#show_help ⇒ `Object`