Class: Bioroebe::Taxonomy::ParseFasta

Inherits:
CommandlineApplication show all
Includes:
Shared
Defined in:
lib/bioroebe/taxonomy/parse_fasta.rb

Overview

#

Bioroebe::Taxonomy::ParseFasta

#

Constant Summary collapse

N_ENTRIES =
#

N_ENTRIES

#
3

Constants included from Constants

Constants::AA_DIR, Constants::ARRAY_PROJECT_FILES, Constants::Archaea_Taxonomy_ID, Constants::BASE, Constants::BASE_URL, Constants::BE_VERBOSE, Constants::Bacteria_Taxonomy_ID, Constants::CITATIONS, Constants::CURATED_DIR, Constants::DATA_DIR, Constants::DELNODES, Constants::DIVISION, Constants::Eukaryota_Taxonomy_ID, Constants::FILE_USE_THIS_DATABASE, Constants::GEM_DIR, Constants::GENCODE, Constants::INCOMING_DIR, Constants::INFO_DIR, Constants::LAST_INTERACTIVE_COMMAND, Constants::LOCALOME_DIR, Constants::LOCAL_MIRROR, Constants::MERGED, Constants::MODULE_PATH, Constants::NAMES, Constants::NAMES_SQL, Constants::NCBI_BASE, Constants::NODES, Constants::NODES_SQL, Constants::NT_DIR, Constants::POSTGRESQL_QUERY_SIZE, Constants::POSTGRE_LOGIN_COMMAND, Constants::PROJECT_DOC_DIR, Constants::SEQUENCES_DIR, Constants::SHARED_HOME, Constants::TAXONOMY_BROWSER, Constants::TEMP_DIR, Constants::TEST_DIR, Constants::TMP_DIR, Constants::URL1

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Constants inherited from Base

Base::NAMESPACE

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Shared

be_quiet, #be_verbose?, be_verbose?, #cd, #edit_login_file, #eliminate_tabulator, #ensure_that_download_dir_exists, #ensure_that_temp_dir_exists, #mkdir, #readlines, #set_pgpassword, #show_password, #show_time_now, #split_at, #split_at_tabulator, #tokenize

Methods included from Constants

#info_dir?, #work_directory?

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(i, run_already = true) ⇒ ParseFasta

#

initialize

#


34
35
36
37
38
39
40
41
42
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 34

# ========================================================================= #
# === initialize
#
# Set up a parser for one input file.
#
# i           - designates the input file; passed on to set_location().
# run_already - when truthy (the default), run() is invoked immediately.
# ========================================================================= #
def initialize(i, run_already = true)
  reset                      # Put all instance variables into a defined state.
  set_location(i)            # Work out @location from the given input.
  try_to_read_in_the_dataset # Fill @data if the file exists.
  return unless run_already
  run
end

Class Method Details

.batch_process(from_this_directory = AA_DIR) ⇒ Object

#

ParseFasta.batch_process

#


285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 285

# ========================================================================= #
# === ParseFasta.batch_process
#
# Run ParseFasta over every '*.fa' file found directly inside the given
# directory. For each file the status is reported and a lookup of the
# matching info file is attempted.
#
# from_this_directory - directory to scan; a trailing '/' is optional.
# ========================================================================= #
def self.batch_process(from_this_directory = AA_DIR)
  e 'We will now process all fasta files from directory '+
    sdir(from_this_directory)+'.'
  # File.join() yields a valid glob whether or not the directory ends with
  # a slash; plain string concatenation produced "dir*.fa" for a directory
  # without one. Sorting keeps the processing order stable.
  entries = Dir[File.join(from_this_directory, '*.fa')].sort
  e 'There are '+sfancy(entries.size.to_s)+' entries in '\
    'that directory.'+N+N
  entries.each {|entry|
    cliner
    e
    parser = ParseFasta.new(entry)
    parser.report_status
    parser.try_to_locate_info_file
    e
  }
  e 'Finished!'
end

Instance Method Details

#all_accession_entriesObject

#

all_accession_entries

Return all accession entries here.

#


307
308
309
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 307

# ========================================================================= #
# === all_accession_entries
#
# Returns an Array holding every key of @hash, in insertion order. The keys
# are the description lines that were registered while parsing.
# ========================================================================= #
def all_accession_entries
  @hash.each_key.to_a
end

#debugObject

#

debug

#


192
193
194
195
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 192

# ========================================================================= #
# === debug
#
# Pretty-print the parsed dataset (@hash), followed by the number of
# entries it contains.
# ========================================================================= #
def debug
  pp(@hash)
  pp(@hash.size) # One key per entry, so this equals the number of keys.
end

#filesize(i = @location) ⇒ Object

#

filesize

#


167
168
169
170
171
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 167

# ========================================================================= #
# === filesize
#
# Returns the size of the file at i (defaults to @location), in bytes, as a
# String. File.size?() yields nil for a missing or zero-length file, in
# which case the result is an empty String.
# ========================================================================= #
def filesize(i = @location)
  byte_count = File.size?(i)
  byte_count.to_s
end

#hash?Boolean Also known as: hash

#

hash?

#

Returns:

  • (Boolean)


143
144
145
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 143

# ========================================================================= #
# === hash?
#
# Getter for the @hash dataset. Despite the trailing '?', this returns the
# Hash itself rather than true/false.
# ========================================================================= #
def hash?
  @hash
end

#is_it_dna_or_protein?Boolean

#

is_it_dna_or_protein?

This method sets the @type variable. It makes use of class IsDNA for this.

#

Returns:

  • (Boolean)


267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 267

# ========================================================================= #
# === is_it_dna_or_protein?
#
# Take up to N_ENTRIES entries via sequence_and_accession_number() and let
# class IsDNA decide whether they are DNA. Sets @only_accession_numbers,
# @only_sequences, @type ('DNA' or 'Protein') and @total_characters.
#
# Returns the value assigned to @total_characters.
# ========================================================================= #
def is_it_dna_or_protein?
  if @hash.empty?
    # Nothing was parsed, so there is nothing to classify. Sampling an
    # empty dataset would only produce nil entries; @type is left as it is.
    @only_accession_numbers = []
    @only_sequences = []
    return @total_characters = 0
  end
  sampled_pairs = sequence_and_accession_number(N_ENTRIES)
  # Each pair is [accession, sequence]; transpose splits them into columns.
  @only_accession_numbers, @only_sequences = sampled_pairs.transpose
  checker = IsDNA.new(@only_sequences)
  @type = checker.is_dna? ? 'DNA' : 'Protein'
  @total_characters = checker.total_characters
end

#location?Boolean Also known as: location

#

location?

#

Returns:

  • (Boolean)


341
342
343
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 341

# ========================================================================= #
# === location?
#
# Getter for @location, the path assigned by set_location(). The value
# itself is returned, not a Boolean.
# ========================================================================= #
def location?
  @location
end

#modification_timeObject

#

modification_time

Returns the last modification time of the file in German notation, i.e. formatted as HH:MM:SS, DD.MM.YYYY.

#


185
186
187
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 185

# ========================================================================= #
# === modification_time
#
# Returns the last modification time of the file at @location as a String
# in the format HH:MM:SS, DD.MM.YYYY.
# ========================================================================= #
def modification_time
  last_change = File.mtime(@location)
  last_change.strftime('%H:%M:%S, %d.%m.%Y')
end

#n_entriesObject

#

n_entries

#


314
315
316
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 314

# ========================================================================= #
# === n_entries
#
# Returns how many accession entries all_accession_entries() reports.
# ========================================================================= #
def n_entries
  all_accession_entries.length
end

#report_filesizeObject

#

report_filesize

#


159
160
161
162
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 159

# ========================================================================= #
# === report_filesize
#
# Print one line stating the size, in bytes, of the file at @location.
# ========================================================================= #
def report_filesize
  shown_path = sfile(@location)
  shown_size = sfancy(filesize)
  e 'The filesize of '+shown_path+' is '+shown_size+' Bytes.'
end

#report_how_many_entries_existObject

#

report_how_many_entries_exist

#


150
151
152
153
154
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 150

# ========================================================================= #
# === report_how_many_entries_exist
#
# Print how many entries (keys in @hash) were found in the file at
# @location.
# ========================================================================= #
def report_how_many_entries_exist
  entry_count = @hash.size # One key per entry.
  e 'We have '+sfancy(entry_count)+' Fasta entries (subsections) '\
    'in the file '+sfile(@location)+'.'
end

#report_last_modification_timeObject

#

report_last_modification_time

#


176
177
178
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 176

# ========================================================================= #
# === report_last_modification_time
#
# Print the timestamp returned by modification_time().
# ========================================================================= #
def report_last_modification_time
  shown_timestamp = simp(modification_time)
  e 'The file was last modified at '+shown_timestamp
end

#report_one_sequenceObject

#

report_one_sequence

#


98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 98

# ========================================================================= #
# === report_one_sequence
#
# Print the first element returned by sequence(). If Bioroebe.do_truncate?
# is truthy and the sequence is more than 46 characters long, only its
# first 45 characters are shown, followed by ' [TRUNCATED]'.
# ========================================================================= #
def report_one_sequence
  first_sequence = sequence.first
  return e('We could not find any sequence.') unless first_sequence
  max_shown = 45
  if Bioroebe.do_truncate? && first_sequence.size > max_shown+1
    first_sequence = first_sequence[0, max_shown]+' [TRUNCATED]'
  end
  e 'The (at least one) sequence is: '+sfancy(first_sequence)
end

#report_whether_it_is_nt_or_proteineObject

#

report_whether_it_is_nt_or_proteine

We find out whether we have NT (nucleotide) data or protein (polypeptide) data. To determine this, we make use of another class (IsDNA).

Also note that for this method to correctly work, we must call the method is_it_dna_or_protein? first.

#


241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 241

# ========================================================================= #
# === report_whether_it_is_nt_or_proteine
#
# Report whether the dataset was classified as DNA or protein, together
# with the accession numbers that were inspected. For DNA the GC content
# is reported as well, if class CalculateGCContent is available.
#
# This method reads @type, @total_characters, @only_accession_numbers and
# @only_sequences, all of which are set by is_it_dna_or_protein?().
# ========================================================================= #
def report_whether_it_is_nt_or_proteine
  # Never claim more checked entries than the file contains. The value is
  # always a String, so simp() receives the same type in every case.
  n_entries = [N_ENTRIES, @hash.size].min.to_s
  # Array() and to_s keep the report from crashing when no accession
  # numbers were collected (e. g. an empty dataset).
  joined_accession_numbers = Array(@only_accession_numbers).each_with_index.map { |element, index|
    '('+lightblue((index + 1).to_s)+') '+element.to_s
  }.join(pink(' / '))
  e 'We assume that this dataset is '+sfancy(@type)+'. This conclusion '\
    'is based on analyzing/checking '
  e simp(n_entries)+' different entries (subsections) in the file '+
    sfile(File.basename(@location))+', for a total of '+sfancy(@total_characters)+' '+
    'characters (=tokens) (the '+simp(n_entries)+' Accession Numbers that were '+
    'checked are: '+joined_accession_numbers+').'
  e 'The allowed entries for DNA-sequences that we used for this assessment were: '+
    simp(IsDNA::ARRAY_VALID_SEQUENCES.join(', '))
  return unless @type == 'DNA' # The GC content only makes sense for DNA.
  if Object.const_defined? :CalculateGCContent # Query the GC content here.
    e 'The percentage of the GC content is: '
    CalculateGCContent.new(@only_sequences).report
  end
end

#resetObject

#

reset

#


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 47

# ========================================================================= #
# === reset
#
# Bring the instance variables of this class into their initial state,
# after letting the superclass do its own reset.
# ========================================================================= #
def reset
  super()
  # ======================================================================= #
  # === @hash
  #
  # Maps a description line to the sequence collected for it.
  # ======================================================================= #
  @hash = {}
  # ======================================================================= #
  # === @data, @pointer, @type
  #
  # Raw file content, the description line currently being filled, and
  # the detected dataset type. All three start out as nil.
  # ======================================================================= #
  @data = @pointer = @type = nil
end

#runObject

#

run (run tag)

#


373
374
375
376
377
378
379
380
381
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 373

# ========================================================================= #
# === run (run tag)
#
# Delegates to run_everything(). If the user interrupts (Interrupt, e. g.
# ctrl-c), a short notice is printed and the program exits.
# ========================================================================= #
def run
  run_everything
rescue Interrupt
  e
  e 'User requested exit. Thus exiting now gracefully.'
  exit
end

#run_everythingObject

#

run_everything

#


355
356
357
358
359
360
361
362
363
364
365
366
367
368
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 355

# ========================================================================= #
# === run_everything
#
# Parse @data (if present) into @hash: every line starting with '>' opens
# a new entry keyed by the rest of that line, and all following lines are
# appended to that entry. Lines that appear before the first description
# line are ignored. Afterwards is_it_dna_or_protein?() is called.
#
# Returns nil when there is no @data, otherwise whatever
# is_it_dna_or_protein?() returns.
# ========================================================================= #
def run_everything
  return unless @data
  @data.split("\n").each {|entry|
    entry.chomp!
    if entry.start_with?('>') # This is a Description line.
      # Strip only the leading marker. Deleting every '>' would mangle
      # descriptions that contain the character themselves, e. g. "a->b".
      @pointer = entry[1..-1]
      @hash[@pointer] = ''.dup
    else
      @hash[@pointer] << entry unless @hash[@pointer].nil?
    end
  }
  is_it_dna_or_protein?
end

#sequence(return_n_entries = 1) ⇒ Object

#

sequence

By default, this method returns an Array holding one sequence, that of the first entry. If another number is passed, that many sequences are returned instead, each one picked at random.

#


204
205
206
207
208
209
210
211
212
213
214
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 204

# ========================================================================= #
# === sequence
#
# Returns an Array of sequences taken from @hash. With the default of 1
# this is the sequence of the first entry; for any other number, that many
# sequences are drawn at random (the same entry may be drawn repeatedly).
# ========================================================================= #
def sequence(return_n_entries = 1)
  return [@hash[@hash.keys.first]] if return_n_entries == 1 # the default
  return_n_entries.times.map { @hash[@hash.keys.sample] }
end

#sequence_and_accession_number(i = 3) ⇒ Object

#

sequence_and_accession_number

This method is similar to the above method called sequence(), but for each entry it returns the accession number (the key) together with the sequence, as an [accession number, sequence] pair.

#


222
223
224
225
226
227
228
229
230
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 222

# ========================================================================= #
# === sequence_and_accession_number
#
# Returns an Array of up to i [accession number, sequence] pairs, picked at
# random from @hash. No entry is returned twice, so fewer than i pairs come
# back when the dataset holds fewer than i entries; an empty dataset yields
# an empty Array.
# ========================================================================= #
def sequence_and_accession_number(i = 3)
  wanted = [i, 0].max # A negative count simply yields no entries.
  # Array#sample(n) draws distinct elements, which keeps the result in line
  # with the "different entries" wording used when reporting.
  @hash.keys.sample(wanted).map { |accession| [accession, @hash[accession]] }
end

#set_location(i = nil) ⇒ Object

#

set_location

This method sets the base location of our input-file.

#


79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 79

# ========================================================================= #
# === set_location
#
# This method sets the location of our input file (@location).
#
# i may be a path, an Array (only its first element is used) or a purely
# numeric String, which is treated as the 1-based position of an entry in
# the current directory. If the resulting path does not exist, the file is
# assumed to reside in AA_DIR. A result without any '/' is prefixed with
# return_pwd().
#
# Returns the value assigned to @location.
# ========================================================================= #
def set_location(i = nil)
  i = i.first if i.is_a? Array # For now, use only the first element, if it is an Array.
  i = i.to_s
  if i =~ /\A\d+\z/ # if the input consists of only numbers.
    position = i.to_i
    # Only accept positions that denote an existing entry. An out-of-range
    # position used to yield nil (which File.exist? rejects with a
    # TypeError), and 0 silently wrapped around to the last entry.
    picked = position > 0 ? Dir['*'].sort[position - 1] : nil
    i = picked if picked
  end
  unless File.exist? i # try a rescue in this case.
    i = AA_DIR+File.basename(i)
  end
  # ===================================================================== #
  # Expand to the proper path next:
  # ===================================================================== #
  i = return_pwd+i unless i.include? '/'
  @location = i
end

#status?Boolean Also known as: report_status, report

#

status? (status tag)

We report some data about the dataset.

This is useful for finding out about:

(1) how many entries exist in our fasta file
(2) when the file was last modified
(3) how big the file is
(4) whether the dataset is assumed to be DNA or protein
#

Returns:

  • (Boolean)


330
331
332
333
334
335
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 330

# ========================================================================= #
# === status? (status tag)
#
# Run all individual reports in a fixed order: entry count, modification
# time, file size, and the DNA-versus-protein assessment. The return value
# is that of the last report.
# ========================================================================= #
def status?
  report_steps = %i[
    report_how_many_entries_exist
    report_last_modification_time
    report_filesize
    report_whether_it_is_nt_or_proteine
  ]
  report_steps.map { |step| send(step) }.last
end

#try_to_locate_info_file(i = @location) ⇒ Object

#

try_to_locate_info_file

#


114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 114

# ========================================================================= #
# === try_to_locate_info_file
#
# Look in INFO_DIR for a file whose name starts with the basename of i
# (without its extension and without any '_pep' part). If one is found, it
# is parsed via Info.parse(); when that yields a numeric taxonomy id, the
# "names" table is queried for it through POSTGRE_LOGIN_COMMAND.
#
# i - path of the fasta file; defaults to @location.
# ========================================================================= #
def try_to_locate_info_file(
    i = @location
  )
  # Strip the directory and the extension. The extension must not be used
  # as a regular expression: '.fa' would match any character followed by
  # "fa" anywhere in the name and cut pieces out of it.
  i = File.basename(i, File.extname(i))
  i = i.gsub('_pep', '')
  pattern = INFO_DIR+i+'*'
  results = Dir[pattern]
  if results.empty?
    e 'Could not find any entry for '+sfancy(pattern)+'.'
  else
    pp results
    data = Info.parse(results.first)
    tax_id = data.taxonomy_id.to_s.strip
    if tax_id =~ /\A\d+\z/
      set_pgpassword
      # Next, find out the tax ID through a postgre query. The id goes
      # into a shell command, hence only plain digits are accepted above.
      command = ''.dup
      command << POSTGRE_LOGIN_COMMAND
      command << ' --command="'
      command << 'select * from names where tax_id='+tax_id+' LIMIT 15;"'
      esystem command
    elsif !tax_id.empty?
      e 'Ignoring the non-numeric taxonomy id '+sfancy(tax_id)+'.'
    end
  end
end

#try_to_read_in_the_datasetObject

#

try_to_read_in_the_dataset

#


70
71
72
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 70

# ========================================================================= #
# === try_to_read_in_the_dataset
#
# Read the whole file at @location into @data. Nothing happens (and nil is
# returned) if no such file exists.
# ========================================================================= #
def try_to_read_in_the_dataset
  return unless File.exist?(@location)
  @data = File.read(@location)
end

#type?Boolean Also known as: type

#

type?

#

Returns:

  • (Boolean)


348
349
350
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 348

# ========================================================================= #
# === type?
#
# Getter for @type. is_it_dna_or_protein?() sets it to 'DNA' or 'Protein',
# and reset() clears it to nil. The value itself is returned, not a
# Boolean.
# ========================================================================= #
def type?
  @type
end