.batch_process(from_this_directory = AA_DIR) ⇒ Object

# === ParseFasta.batch ========================================================================= #.

Instance Method Summary collapse

#all_accession_entries ⇒ Object

# === all_accession_entries.
#debug ⇒ Object

# === debug ========================================================================= #.
#filesize(i = @location) ⇒ Object

# === filesize ========================================================================= #.
#hash? ⇒ Boolean (also: #hash)

# === hash? ========================================================================= #.
#initialize(i, run_already = true) ⇒ ParseFasta constructor

# === initialize ========================================================================= #.
#is_it_dna_or_protein? ⇒ Boolean

# === is_it_dna_or_protein?.
#location? ⇒ Boolean (also: #location)

# === location? ========================================================================= #.
#modification_time ⇒ Object

# === modification_time.
#n_entries ⇒ Object

# === n_entries ========================================================================= #.
#report_filesize ⇒ Object

# === report_filesize ========================================================================= #.
#report_how_many_entries_exist ⇒ Object

# === report_how_many_entries_exist ========================================================================= #.
#report_last_modification_time ⇒ Object

# === report_last_modification_time ========================================================================= #.
#report_one_sequence ⇒ Object

# === report_one_sequence ========================================================================= #.
#report_whether_it_is_nt_or_proteine ⇒ Object

# === report_whether_it_is_nt_or_proteine.
#reset ⇒ Object

# === reset ========================================================================= #.
#run ⇒ Object

# === run (run tag) ========================================================================= #.
#run_everything ⇒ Object

# === run_everything ========================================================================= #.
#sequence(return_n_entries = 1) ⇒ Object

# === sequence.
#sequence_and_accession_number(i = 3) ⇒ Object

# === sequence_and_accession_number.
#set_location(i = nil) ⇒ Object

# === set_location.
#status? ⇒ Boolean (also: #report_status, #report)

# === status? (status tag).
#try_to_locate_info_file(i = @location) ⇒ Object

# === try_to_locate_info_file ========================================================================= #.
#try_to_read_in_the_dataset ⇒ Object

# === try_to_read_in_the_dataset ========================================================================= #.
#type? ⇒ Boolean (also: #type)

# === type? ========================================================================= #.

Methods included from Shared

be_quiet, #be_verbose?, be_verbose?, #cd, #edit_login_file, #eliminate_tabulator, #ensure_that_download_dir_exists, #ensure_that_temp_dir_exists, #mkdir, #readlines, #set_pgpassword, #show_password, #show_time_now, #split_at, #split_at_tabulator, #tokenize

Methods included from Constants

#info_dir?, #work_directory?

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(i, run_already = true) ⇒ `ParseFasta`

#

initialize

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 34

# Prepare a new parser instance: restore the default state, resolve the
# input location, try to load the file content and - unless disabled -
# start processing right away.
#
# @param i [Object] designator of the input file; handed to set_location
# @param run_already [Boolean] when truthy, run() is invoked immediately
def initialize(i, run_already = true)
  reset
  set_location(i)
  try_to_read_in_the_dataset
  return unless run_already

  run
end

Class Method Details

.batch_process(from_this_directory = AA_DIR) ⇒ `Object`

#

ParseFasta.batch

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 285

# Process every '*.fa' file found directly below the given directory:
# for each file a ParseFasta instance is created, its status is reported
# and the matching info file is looked up.
#
# @param from_this_directory [String] directory prefix; it is concatenated
#   verbatim with '*.fa', so it has to end with a path separator
def self.batch_process(from_this_directory = AA_DIR)
  e "We will now process all fasta files from directory #{sdir(from_this_directory)}."
  fasta_files = Dir[from_this_directory + '*.fa']
  e "There are #{sfancy(fasta_files.size.to_s)} entries in that directory.#{N}#{N}"
  fasta_files.each do |fasta_file|
    cliner
    e
    parser = ParseFasta.new(fasta_file)
    parser.report_status
    parser.try_to_locate_info_file
    e
  end
  e 'Finished!'
end

Instance Method Details

#all_accession_entries ⇒ `Object`

#

all_accession_entries

Return all accession entries here.

#



307
308
309

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 307

# Returns all keys of @hash as an Array. The keys are the description
# lines of the parsed FASTA entries (they are stored by run_everything).
def all_accession_entries
  @hash.keys
end

#debug ⇒ `Object`

#

debug

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 192

# Debugging aid: pretty-print the complete dataset, followed by the number
# of entries it holds. Returns that number (pp hands back its argument).
def debug
  pp(@hash)
  pp(@hash.keys.size)
end

#filesize(i = @location) ⇒ `Object`

#

filesize

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 167

# Returns the size of the given file in bytes, as a String.
#
# File.size? answers nil for an existing but empty file, which used to be
# reported as "" - indistinguishable from a missing file. An existing file
# now always yields its real size (so "0" when empty); only a missing path
# still yields the empty String.
#
# @param i [String] path to inspect; defaults to @location
# @return [String] byte count, or '' if nothing exists at that path
def filesize(
    i = @location
  )
  File.exist?(i) ? File.size(i).to_s : ''
end

#hash? ⇒ `Boolean` Also known as: hash

#

hash?

#

Returns:

(Boolean)



143
144
145

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 143

# Query method for the parsed dataset. Despite the trailing '?', this
# hands back the @hash object itself rather than true/false.
def hash? # getter method for the @hash dataset.
  @hash
end

#is_it_dna_or_protein? ⇒ `Boolean`

#

is_it_dna_or_protein?

This method sets the @type variable. It makes use of class IsDNA for this.

#

Returns:

(Boolean)

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 267

# Classify the dataset as 'DNA' or 'Protein' based on a sample of entries.
#
# Side effects: sets @only_accession_numbers, @only_sequences, @type and
# @total_characters. The return value is the assigned @total_characters,
# not a boolean.
def is_it_dna_or_protein?
  sampled_pairs = sequence_and_accession_number(N_ENTRIES)
  # Transposing turns [[accession, sequence], ...] into
  # [[accession, ...], [sequence, ...]].
  @only_accession_numbers, @only_sequences = sampled_pairs.transpose
  classifier = IsDNA.new(@only_sequences)
  @type = classifier.is_dna? ? 'DNA' : 'Protein'
  @total_characters = classifier.total_characters
end

#location? ⇒ `Boolean` Also known as: location

#

location?

#

Returns:

(Boolean)



341
342
343

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 341

# Query method for the input location. Despite the trailing '?', this
# hands back the value of @location (assigned by set_location) rather
# than true/false.
def location?
  @location
end

#modification_time ⇒ `Object`

#

modification_time

Returns the last-modification time of the file in German notation (HH:MM:SS, DD.MM.YYYY).

#



185
186
187

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 185

# Returns the last-modification time of the file at @location, formatted
# as 'HH:MM:SS, DD.MM.YYYY'.
def modification_time
  last_modified = File.mtime(@location)
  last_modified.strftime('%H:%M:%S, %d.%m.%Y')
end

#n_entries ⇒ `Object`

#

n_entries

#



314
315
316

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 314

# Returns how many accession entries are currently known, i.e. the length
# of the Array delivered by all_accession_entries.
def n_entries
  all_accession_entries.length
end

#report_filesize ⇒ `Object`

#

report_filesize

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 159

# Print one line stating the size (in bytes) of the file at @location.
def report_filesize
  e "The filesize of #{sfile(@location)} is #{sfancy(filesize)} Bytes."
end

#report_how_many_entries_exist ⇒ `Object`

#

report_how_many_entries_exist

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 150

# Print one line stating how many FASTA entries were read from the file
# at @location. The count is taken from the keys of @hash.
def report_how_many_entries_exist
  entry_count = @hash.keys.size
  e "We have #{sfancy(entry_count)} Fasta entries (subsections) in the file #{sfile(@location)}."
end

#report_last_modification_time ⇒ `Object`

#

report_last_modification_time

#



176
177
178

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 176

# Print one line stating when the input file was last modified, using the
# String delivered by modification_time.
def report_last_modification_time
  e "The file was last modified at #{simp(modification_time)}"
end

#report_one_sequence ⇒ `Object`

#

report_one_sequence

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 98

# Print the first sequence of the dataset, or a notice if there is none.
#
# When Bioroebe.do_truncate? is truthy and the sequence is longer than
# 46 characters, only the first 45 characters are shown, followed by a
# ' [TRUNCATED]' marker.
def report_one_sequence
  first_sequence = sequence.first
  return e('We could not find any sequence.') unless first_sequence

  max_characters = 45
  if Bioroebe.do_truncate? && first_sequence.size > max_characters + 1
    first_sequence = first_sequence[0, max_characters] + ' [TRUNCATED]'
  end
  e 'The (at least one) sequence is: ' + sfancy(first_sequence)
end

#report_whether_it_is_nt_or_proteine ⇒ `Object`

#

report_whether_it_is_nt_or_proteine

We find out whether we have NT (nucleotide) data or protein (polypeptide) data. To determine this, we make use of another class.

Also note that for this method to correctly work, we must call the method is_it_dna_or_protein? first.

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 241

# Report whether the dataset is assumed to be DNA or protein, and which
# accession numbers that conclusion is based on.
#
# Relies on @type, @total_characters, @only_accession_numbers and
# @only_sequences, which are assigned by is_it_dna_or_protein?. If that
# method has not run yet, the list of accession numbers is simply empty
# instead of raising NoMethodError on nil.
#
# For DNA datasets the GC content is reported as well, provided that a
# top-level CalculateGCContent class is available.
def report_whether_it_is_nt_or_proteine
  # Never claim more checked entries than the file contains. The count is
  # kept as a String in both branches, so that the colour helper always
  # receives the same type.
  n_entries = N_ENTRIES.to_s
  n_entries = @hash.keys.size.to_s if n_entries.to_i > @hash.keys.size
  # Build "(1) accession / (2) accession / ..." with 1-based numbering.
  joined_accession_numbers = Array(@only_accession_numbers).each_with_index.map { |accession, index|
    "(#{lightblue((index + 1).to_s)}) #{accession}"
  }.join(pink(' / '))
  e 'We assume that this dataset is '+sfancy(@type)+'. This conclusion '\
    'is based on analyzing/checking '
  e simp(n_entries)+' different entries (subsections) in the file '+
    sfile(File.basename(@location))+', for a total of '+sfancy(@total_characters)+' '+
    'characters (=tokens) (the '+simp(n_entries)+' Accession Numbers that were '+
    'checked are: '+joined_accession_numbers+').'
  e 'The allowed entries for DNA-sequences that we used for this assessment were: '+
    simp(IsDNA::ARRAY_VALID_SEQUENCES.join(', '))
  # The GC content is only meaningful for DNA input.
  if @type == 'DNA' && Object.const_defined?(:CalculateGCContent)
    e 'The percentage of the GC content is: '
    CalculateGCContent.new(@only_sequences).report
  end
end

#reset ⇒ `Object`

#

reset

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 47

# Restore the default state of the parser, after delegating to the parent
# implementation of reset().
def reset
  super()
  # ======================================================================= #
  # === @hash
  #
  # Maps the description line of an entry to its sequence String.
  # ======================================================================= #
  @hash = {}
  # ======================================================================= #
  # === @data, @pointer, @type
  #
  # Raw file content, the key of the entry currently being filled, and the
  # detected dataset type. All of them start out unset.
  # ======================================================================= #
  @data = @pointer = @type = nil
end

#run ⇒ `Object`

#

run (run tag)

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 373

# Entry point: delegate to run_everything and leave the program
# gracefully if the user interrupts it (Interrupt, typically Ctrl+C).
def run
  run_everything
rescue Interrupt
  e
  e 'User requested exit. Thus exiting now gracefully.'
  exit
end

#run_everything ⇒ `Object`

#

run_everything

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 355

# Parse the raw FASTA text in @data into @hash (description => sequence)
# and then determine whether the dataset is DNA or protein.
#
# A description line is a line that *starts* with '>'. Only that leading
# marker is stripped, so a '>' occurring later in the description is kept.
# Previously any line containing '>' anywhere was taken for a header and
# all '>' characters were deleted from it.
#
# Lines that appear before the first description line are ignored.
# Does nothing (and returns nil) when no data has been read in.
def run_everything
  return unless @data

  @data.split("\n").each do |raw_line|
    line = raw_line.chomp # also drops the "\r" of CRLF line endings
    if line.start_with?('>') # This is a description line.
      @pointer = line[1..-1]
      @hash[@pointer] = ''.dup
    else
      current_sequence = @hash[@pointer]
      current_sequence << line if current_sequence
    end
  end
  is_it_dna_or_protein?
end

#sequence(return_n_entries = 1) ⇒ `Object`

#

sequence

By default, this method will return one entry, the first entry. We can however pass another number, in which case that many randomly picked entries are returned instead.

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 204

# Returns an Array of sequences taken from @hash.
#
# With the default of 1 the Array holds the sequence of the first entry.
# For any other number, that many sequences are drawn at random; each
# draw is independent, so the same sequence may show up more than once.
#
# @param return_n_entries [Integer] how many sequences to return
# @return [Array] the selected sequences
def sequence(return_n_entries = 1)
  return [@hash[@hash.keys.first]] if return_n_entries == 1 # the default

  return_n_entries.times.map { @hash[@hash.keys.sample] }
end

#sequence_and_accession_number(i = 3) ⇒ `Object`

#

sequence_and_accession_number

This method is similar to the above method called sequence(), but it also returns the accession number that belongs to each returned sequence.

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 222

# Returns up to +i+ randomly chosen entries of @hash as
# [accession, sequence] pairs.
#
# The entries are drawn without replacement, so no entry is returned
# twice. Consequently, at most as many pairs as @hash has entries are
# returned (an empty Array for an empty dataset). Previously each draw was
# independent, which could deliver the same entry several times even
# though the report speaks of "different entries".
#
# @param i [Integer] maximum number of pairs to return
# @return [Array<Array>] the selected [accession, sequence] pairs
def sequence_and_accession_number(i = 3)
  # Array#sample rejects negative counts, hence the clamp to zero.
  @hash.keys.sample([i, 0].max).map { |accession| [accession, @hash[accession]] }
end

#set_location(i = nil) ⇒ `Object`

#

set_location

This method sets the base location of our input-file.

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 79

# Determine and store (in @location) the path of the input file.
#
# Resolution steps:
# (1) An Array is reduced to its first element.
# (2) Input consisting solely of digits is treated as a 1-based position
#     within Dir['*'] of the current directory. If there is no entry at
#     that position the input is kept unchanged (this used to raise a
#     TypeError further down, because nil reached File.exist?).
# (3) If nothing exists at the resulting path, the file is assumed to
#     reside in AA_DIR.
# (4) A name without any '/' is prefixed with the current directory.
#
# @param i [String, Array, nil] path, file name or position
# @return [String] the resolved location
def set_location(i = nil)
  i = i.first if i.is_a? Array # For now, use only the first element, if it is an Array.
  i = i.to_s
  # \A and \z anchor the whole String; ^ and $ would already match a
  # single all-digit line inside multi-line input.
  if i =~ /\A\d+\z/ # if the input consists of only numbers.
    i = Dir['*'][i.to_i - 1] || i
  end
  unless File.exist? i # try a rescue in this case.
    i = AA_DIR+File.basename(i)
  end
  # ===================================================================== #
  # Expand to the proper path next:
  # ===================================================================== #
  i = return_pwd+i unless i.include? '/'
  @location = i
end

#status? ⇒ `Boolean` Also known as: report_status, report

#

status? (status tag)

We report some data about the dataset.

This is useful for finding out about:

(1) how many entries exist in our fasta file
(2) when the file was last modified
(3) how big the file is

#

Returns:

(Boolean)

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 330

# Report a summary of the dataset by invoking, in this order, the reports
# for the number of entries, the last modification time, the filesize and
# the DNA/protein assessment.
#
# Despite the trailing '?', the return value is simply whatever the last
# call (report_whether_it_is_nt_or_proteine) returns.
def status?
  report_how_many_entries_exist
  report_last_modification_time
  report_filesize
  report_whether_it_is_nt_or_proteine
end

#try_to_locate_info_file(i = @location) ⇒ `Object`

#

try_to_locate_info_file

#

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 114

# Look for an info file in INFO_DIR whose name starts with the base name
# of the given fasta file (extension and any '_pep' part removed).
#
# If one is found, it is parsed via Info.parse; when the parsed data has a
# taxonomy id, the 'names' table is queried for it through the postgres
# commandline client.
#
# The extension is removed as a literal suffix. It used to be interpolated
# unescaped into a regex and substituted globally, so that e. g. '.fa'
# (any character followed by "fa") turned "alfalfa.fa" into "a".
#
# @param i [String] path of the fasta file; defaults to @location
def try_to_locate_info_file(
    i = @location
  )
  name = File.basename(i)
  # chop away the extname.
  name = File.basename(name, File.extname(name))
  name = name.gsub('_pep', '')
  pattern = INFO_DIR+name+'*'
  results = Dir[pattern]
  if results.empty?
    e 'Could not find any entry for '+sfancy(pattern)+'.'
  else
    pp results
    data = Info.parse(results.first)
    if data.taxonomy_id
      set_pgpassword
      # Next, find out the tax ID through a postgre query:
      # NOTE(review): taxonomy_id is placed verbatim into a shell command
      # line - confirm that Info.parse only ever yields numeric ids.
      command = ''.dup
      command << POSTGRE_LOGIN_COMMAND
      command << ' --command="'
      command << 'select * from names where tax_id='+data.taxonomy_id+' LIMIT 15;"'
      esystem command
    end
  end
end

#try_to_read_in_the_dataset ⇒ `Object`

#

try_to_read_in_the_dataset

#



70
71
72

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 70

# Read the content of the file at @location into @data.
#
# Nothing is read when the path does not exist, or when it refers to a
# directory. The latter can happen when set_location falls back to the
# bare AA_DIR; File.read would then raise Errno::EISDIR.
#
# @return [String, nil] the content that was read, or nil if nothing was
def try_to_read_in_the_dataset
  return unless File.exist?(@location) && !File.directory?(@location)

  @data = File.read(@location)
end

#type? ⇒ `Boolean` Also known as: type

#

type?

#

Returns:

(Boolean)



348
349
350

# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 348

# Query method for the detected dataset type. Despite the trailing '?',
# this hands back the value of @type: 'DNA' or 'Protein' once
# is_it_dna_or_protein? has run, nil after a reset.
def type?
  @type
end

Class: Bioroebe::Taxonomy::ParseFasta

Overview

#

Bioroebe::Taxonomy::ParseFasta

#

Constant Summary collapse

#

N_ENTRIES

#

Constants included from Constants

Constants inherited from CommandlineApplication

Constants included from ColoursForBase

Constants inherited from Base

Class Method Summary collapse

# === ParseFasta.batch ========================================================================= #.

Instance Method Summary collapse

# === all_accession_entries.

# === debug ========================================================================= #.

# === filesize ========================================================================= #.

# === hash? ========================================================================= #.

# === initialize ========================================================================= #.

# === is_it_dna_or_protein?.

# === location? ========================================================================= #.

# === modification_time.

# === n_entries ========================================================================= #.

# === report_filesize ========================================================================= #.

# === report_how_many_entries_exist ========================================================================= #.

# === report_last_modification_time ========================================================================= #.

# === report_one_sequence ========================================================================= #.

# === report_whether_it_is_nt_or_proteine.

# === reset ========================================================================= #.

# === run (run tag) ========================================================================= #.

# === run_everything ========================================================================= #.

# === sequence.

# === sequence_and_accession_number.

# === set_location.

# === status? (status tag).

# === try_to_locate_info_file ========================================================================= #.

# === try_to_read_in_the_dataset ========================================================================= #.

# === type? ========================================================================= #.

Methods included from Shared

Methods included from Constants

Methods inherited from CommandlineApplication

Methods included from BaseModule

Methods included from CommandlineArguments

Methods included from ColoursForBase

Methods inherited from Base

Methods included from InternalHashModule

Methods included from InferTheNamespaceModule

Constructor Details

#initialize(i, run_already = true) ⇒ ParseFasta

#

initialize

#

Class Method Details

.batch_process(from_this_directory = AA_DIR) ⇒ Object

#

ParseFasta.batch

#

Instance Method Details

#all_accession_entries ⇒ Object

#

all_accession_entries

#

#debug ⇒ Object

#

debug

#

#filesize(i = @location) ⇒ Object

#

filesize

#

#hash? ⇒ Boolean Also known as: hash

#

hash?

#

#is_it_dna_or_protein? ⇒ Boolean

#

is_it_dna_or_protein?

#

#initialize(i, run_already = true) ⇒ `ParseFasta`

.batch_process(from_this_directory = AA_DIR) ⇒ `Object`

#all_accession_entries ⇒ `Object`

#debug ⇒ `Object`

#filesize(i = @location) ⇒ `Object`

#hash? ⇒ `Boolean` Also known as: hash

#is_it_dna_or_protein? ⇒ `Boolean`

#location? ⇒ `Boolean` Also known as: location

#modification_time ⇒ `Object`

#n_entries ⇒ `Object`

#report_filesize ⇒ `Object`

#report_how_many_entries_exist ⇒ `Object`

#report_last_modification_time ⇒ `Object`

#report_one_sequence ⇒ `Object`

#report_whether_it_is_nt_or_proteine ⇒ `Object`

#reset ⇒ `Object`

#run ⇒ `Object`

#run_everything ⇒ `Object`

#sequence(return_n_entries = 1) ⇒ `Object`

#sequence_and_accession_number(i = 3) ⇒ `Object`

#set_location(i = nil) ⇒ `Object`

#status? ⇒ `Boolean` Also known as: report_status, report

#try_to_locate_info_file(i = @location) ⇒ `Object`

#try_to_read_in_the_dataset ⇒ `Object`

#type? ⇒ `Boolean` Also known as: type