Class: Bioroebe::Taxonomy::ParseFasta

Inherits:
CommandlineApplication show all
Includes:
Shared
Defined in:
lib/bioroebe/taxonomy/parse_fasta.rb

Overview

#

Bioroebe::Taxonomy::ParseFasta

#

Constant Summary collapse

N_ENTRIES =
#

N_ENTRIES

#
3

Constants included from Constants

Constants::AA_DIR, Constants::ARRAY_PROJECT_FILES, Constants::Archaea_Taxonomy_ID, Constants::BASE, Constants::BASE_URL, Constants::BE_VERBOSE, Constants::Bacteria_Taxonomy_ID, Constants::CITATIONS, Constants::CURATED_DIR, Constants::DATA_DIR, Constants::DELNODES, Constants::DIVISION, Constants::Eukaryota_Taxonomy_ID, Constants::FILE_USE_THIS_DATABASE, Constants::GEM_DIR, Constants::GENCODE, Constants::INCOMING_DIR, Constants::INFO_DIR, Constants::LAST_INTERACTIVE_COMMAND, Constants::LOCALOME_DIR, Constants::LOCAL_MIRROR, Constants::MERGED, Constants::MODULE_PATH, Constants::NAMES, Constants::NAMES_SQL, Constants::NCBI_BASE, Constants::NODES, Constants::NODES_SQL, Constants::NT_DIR, Constants::POSTGRESQL_QUERY_SIZE, Constants::POSTGRE_LOGIN_COMMAND, Constants::PROJECT_DOC_DIR, Constants::SEQUENCES_DIR, Constants::SHARED_HOME, Constants::TAXONOMY_BROWSER, Constants::TEMP_DIR, Constants::TEST_DIR, Constants::TMP_DIR, Constants::URL1

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Constants inherited from Base

Base::NAMESPACE

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Shared

be_quiet, #be_verbose?, be_verbose?, #cd, #edit_login_file, #eliminate_tabulator, #ensure_that_download_dir_exists, #ensure_that_temp_dir_exists, #mkdir, #readlines, #set_pgpassword, #show_password, #show_time_now, #split_at, #split_at_tabulator, #tokenize

Methods included from Constants

#info_dir?, #work_directory?

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opne, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #set_be_verbose, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #file_readlines, #infer_the_namespace, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #namespace?, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Constructor Details

#initialize(i, run_already = true) ⇒ ParseFasta

#

initialize

#

34
35
36
37
38
39
40
41
42
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 34

# ========================================================================= #
# === initialize
#
# Sets up a new parser. The steps happen in a fixed order: the internal
# state is reset, the input location is stored via set_location(), the
# file is read if possible, and finally run() is invoked unless the
# second argument is falsy.
#
# i           - the input location, handed to set_location() unchanged
# run_already - when truthy (the default), run() is called right away
# ========================================================================= #
def initialize(i, run_already = true)
  reset
  set_location(i)
  try_to_read_in_the_dataset
  return unless run_already
  run
end

Class Method Details

.batch_process(from_this_directory = AA_DIR) ⇒ Object

#

ParseFasta.batch_process

#

283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 283

# ========================================================================= #
# === ParseFasta.batch_process
#
# Processes every '*.fa' file found directly inside the given directory:
# for each file a new ParseFasta instance is created, its status is
# reported and the matching info file is looked up.
#
# from_this_directory - directory to scan; it may be given with or
#                       without a trailing '/'. Defaults to AA_DIR.
# ========================================================================= #
def self.batch_process(from_this_directory = AA_DIR)
  e 'We will now process all fasta files from directory '+
    sdir(from_this_directory)+'.'
  # Make sure exactly one '/' separates the directory from the glob part.
  # An empty directory string is left alone so that it still globs the
  # current working directory.
  glob_base = from_this_directory.to_s
  glob_base += '/' unless glob_base.empty? || glob_base.end_with?('/')
  entries = Dir[glob_base+'*.fa']
  e 'There are '+sfancy(entries.size.to_s)+' entries in '\
    'that directory.'+N+N
  entries.each {|entry|
    cliner
    e
    parser = ParseFasta.new(entry)
    parser.report_status
    parser.try_to_locate_info_file
    e
  }
  e 'Finished!'
end

Instance Method Details

#all_accession_entriesObject

#

all_accession_entries

Return all accession entries here.

#

305
306
307
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 305

# ========================================================================= #
# === all_accession_entries
#
# Returns an Array holding every key of @hash, in insertion order. The
# keys are the FASTA description lines collected by run_everything().
# ========================================================================= #
def all_accession_entries
  @hash.each_key.to_a
end

#debugObject

#

debug

#

190
191
192
193
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 190

# ========================================================================= #
# === debug
#
# Pretty-prints the whole @hash dataset, followed by the number of
# entries stored in it.
# ========================================================================= #
def debug
  pp(@hash)
  pp(@hash.size)
end

#filesize(i = @location) ⇒ Object

#

filesize

#

167
168
169
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 167

# ========================================================================= #
# === filesize
#
# Returns the size of the given file, in bytes, as a String.
#
# File.size? yields nil both for a missing file and for an empty one;
# the nil is mapped to 0 so that callers always get a number to display
# ("0") rather than an empty String.
#
# i - path of the file to inspect; defaults to @location
# ========================================================================= #
def filesize(i = @location)
  File.size?(i).to_i.to_s
end

#hash?Boolean Also known as: hash

#

hash?

#

Returns:

  • (Boolean)

143
144
145
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 143

# ========================================================================= #
# === hash?
#
# Getter for the @hash dataset; the stored object itself is returned,
# not a copy.
# ========================================================================= #
def hash?
  @hash
end

#is_it_dna_or_protein?Boolean

#

is_it_dna_or_protein?

This method sets the @type variable. It makes use of class IsDNA for this.

#

Returns:

  • (Boolean)

265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 265

# ========================================================================= #
# === is_it_dna_or_protein?
#
# Samples N_ENTRIES [accession number, sequence] pairs through
# sequence_and_accession_number() and lets class IsDNA decide what kind
# of data they are.
#
# Sets @only_accession_numbers, @only_sequences, @type ('DNA' or
# 'Protein') and @total_characters; the latter is also the return value.
# ========================================================================= #
def is_it_dna_or_protein?
  sampled_pairs = sequence_and_accession_number(N_ENTRIES)
  # Transposing splits the pairs into one row of accession numbers and
  # one row of sequences.
  @only_accession_numbers, @only_sequences = sampled_pairs.transpose
  checker = IsDNA.new(@only_sequences)
  @type = checker.is_dna? ? 'DNA' : 'Protein'
  @total_characters = checker.total_characters
end

#location?Boolean Also known as: location

#

location?

#

Returns:

  • (Boolean)

339
340
341
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 339

# ========================================================================= #
# === location?
#
# Getter for @location, the path assigned by set_location().
# ========================================================================= #
def location?
  @location
end

#modification_timeObject

#

modification_time

Returns the modification time in the German notation (HH:MM:SS, DD.MM.YYYY).

#

183
184
185
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 183

# ========================================================================= #
# === modification_time
#
# Returns the last-modification time of the file at @location as a
# String of the form 'HH:MM:SS, DD.MM.YYYY'.
# ========================================================================= #
def modification_time
  last_modified = File.mtime(@location)
  last_modified.strftime('%H:%M:%S, %d.%m.%Y')
end

#n_entriesObject

#

n_entries

#

312
313
314
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 312

# ========================================================================= #
# === n_entries
#
# Returns how many accession entries all_accession_entries() yields.
# ========================================================================= #
def n_entries
  all_accession_entries.length
end

#report_filesizeObject

#

report_filesize

#

159
160
161
162
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 159

# ========================================================================= #
# === report_filesize
#
# Prints one line stating the size, in bytes, of the file at @location.
# The path is passed through sfile() and the size through sfancy()
# before being output via e().
# ========================================================================= #
def report_filesize
  shown_path = sfile(@location)
  shown_size = sfancy(filesize)
  e 'The filesize of '+shown_path+' is '+shown_size+' Bytes.'
end

#report_how_many_entries_existObject

#

report_how_many_entries_exist

#

150
151
152
153
154
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 150

# ========================================================================= #
# === report_how_many_entries_exist
#
# Prints how many FASTA entries (keys of @hash) were registered for the
# file at @location.
# ========================================================================= #
def report_how_many_entries_exist
  total = @hash.size
  message = 'We have '+sfancy(total)+' Fasta entries (subsections) '\
            'in the file '+sfile(@location)+'.'
  e message
end

#report_last_modification_timeObject

#

report_last_modification_time

#

174
175
176
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 174

# ========================================================================= #
# === report_last_modification_time
#
# Prints the value of modification_time(), passed through simp(), as a
# single line.
# ========================================================================= #
def report_last_modification_time
  shown_time = simp(modification_time)
  e 'The file was last modified at '+shown_time
end

#report_one_sequenceObject

#

report_one_sequence

#

98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 98

# ========================================================================= #
# === report_one_sequence
#
# Prints the first sequence returned by sequence(). When
# Bioroebe.do_truncate? is truthy and the sequence is longer than 46
# characters, only its first 45 characters are shown, followed by a
# ' [TRUNCATED]' marker. A notice is printed instead when there is no
# sequence at all.
# ========================================================================= #
def report_one_sequence
  first_sequence = sequence.first
  return e('We could not find any sequence.') unless first_sequence
  max_length = 45
  if Bioroebe.do_truncate? && first_sequence.size > max_length + 1
    first_sequence = first_sequence[0, max_length]+' [TRUNCATED]'
  end
  e 'The (at least one) sequence is: '+sfancy(first_sequence)
end

#report_whether_it_is_nt_or_proteineObject

#

report_whether_it_is_nt_or_proteine

We find out whether we have NT (nucleotide) data or protein (polypeptide) data. To determine this, we make use of another class.

Also note that for this method to correctly work, we must call the method is_it_dna_or_protein? first.

#

239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 239

# ========================================================================= #
# === report_whether_it_is_nt_or_proteine
#
# Reports whether the dataset was classified as DNA or Protein, how many
# entries were inspected for that decision and which accession numbers
# those were. For DNA the GC content is reported as well, provided the
# class CalculateGCContent is available.
#
# The values shown here (@type, @only_accession_numbers, @only_sequences,
# @total_characters) are set by is_it_dna_or_protein?, which therefore
# has to run first. If it did not, the list of accession numbers is
# simply left empty.
# ========================================================================= #
def report_whether_it_is_nt_or_proteine
  # Never claim more inspected entries than the file holds. The count is
  # kept as a String throughout, because it is concatenated into the
  # messages below.
  n_entries = [N_ENTRIES.to_i, @hash.size].min.to_s
  joined_accession_numbers = Array(@only_accession_numbers).each_with_index.map { |element, index|
    '('+lightblue((index + 1).to_s)+') '+element.to_s
  }.join(pink(' / '))
  e 'We assume that this dataset is '+sfancy(@type)+'. This conclusion '\
    'is based on analyzing/checking '
  e simp(n_entries)+' different entries (subsections) in the file '+
    sfile(File.basename(@location))+', for a total of '+sfancy(@total_characters)+' '+
    'characters (=tokens) (the '+simp(n_entries)+' Accession Numbers that were '+
    'checked are: '+joined_accession_numbers+').'
  e 'The allowed entries for DNA-sequences that we used for this assessment were: '+
    simp(IsDNA::ARRAY_VALID_SEQUENCES.join(', '))
  # The GC content is only meaningful for DNA, and only reportable when
  # the optional helper class has been loaded.
  if @type == 'DNA' && Object.const_defined?(:CalculateGCContent)
    e 'The percentage of the GC content is: '
    CalculateGCContent.new(@only_sequences).report
  end
end

#resetObject

#

reset

#

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 47

# ========================================================================= #
# === reset
#
# Restores the initial state of the instance. The parent implementation
# runs first; afterwards the instance variables owned by this class are
# re-initialized.
# ========================================================================= #
def reset
  super()
  # ======================================================================= #
  # === @hash
  #
  # Filled by run_everything(): description line => sequence.
  # ======================================================================= #
  @hash = {}
  # ======================================================================= #
  # === @data, @pointer, @type
  #
  # The raw file content, the description line that is currently being
  # filled, and the detected type ('DNA' or 'Protein'). All of them
  # start out as nil.
  # ======================================================================= #
  @data = @pointer = @type = nil
end

#runObject

#

run (run tag)

#

371
372
373
374
375
376
377
378
379
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 371

# ========================================================================= #
# === run (run tag)
#
# Entry point: delegates to run_everything(). Should the user interrupt
# the program while it is working, a short notice is printed and the
# process exits.
# ========================================================================= #
def run
  run_everything
rescue Interrupt
  e
  e 'User requested exit. Thus exiting now gracefully.'
  exit
end

#run_everythingObject

#

run_everything

#

353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 353

# ========================================================================= #
# === run_everything
#
# Splits the raw FASTA content in @data into entries and stores them in
# @hash, using the description line (without its leading '>') as key
# and the concatenated sequence lines as value. Afterwards
# is_it_dna_or_protein? is invoked, and its result is returned.
#
# Nothing happens (and nil is returned) when no data was read in.
#
# Only a line that STARTS with '>' counts as a description line, and
# only that leading marker is removed, so any '>' inside the description
# text is kept.
# ========================================================================= #
def run_everything
  return unless @data
  @data.each_line { |line|
    entry = line.chomp
    if entry.start_with?('>') # This is a Description line.
      @pointer = entry[1..-1]
      @hash[@pointer] = ''.dup
    else
      # Lines that appear before the first description line have no
      # entry to belong to, and are skipped.
      current_sequence = @hash[@pointer]
      current_sequence << entry if current_sequence
    end
  }
  is_it_dna_or_protein?
end

#sequence(return_n_entries = 1) ⇒ Object

#

sequence

By default, this method will return one entry, the first entry. We can however pass another number, in which case we will return more than one entry.

#

202
203
204
205
206
207
208
209
210
211
212
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 202

# ========================================================================= #
# === sequence
#
# Returns an Array of sequences taken from @hash.
#
# With the default argument of 1 the Array holds the sequence of the
# first entry. For any other number, that many sequences are drawn at
# random, one draw per requested entry, so the same sequence may appear
# more than once.
#
# return_n_entries - how many sequences to return
# ========================================================================= #
def sequence(return_n_entries = 1)
  return [@hash[@hash.keys.first]] if return_n_entries == 1 # the default
  return_n_entries.times.map { @hash[@hash.keys.sample] }
end

#sequence_and_accession_number(i = 3) ⇒ Object

#

sequence_and_accession_number

This method is similar to the above method called sequence(), but it also returns the accession number of the fasta file.

#

220
221
222
223
224
225
226
227
228
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 220

# ========================================================================= #
# === sequence_and_accession_number
#
# Returns an Array of [accession number, sequence] pairs for up to i
# randomly chosen entries of @hash.
#
# The entries are drawn without replacement, so every returned pair
# refers to a different entry. Consequently fewer than i pairs are
# returned when @hash holds fewer than i entries, and an empty Array
# when it holds none.
#
# i - the maximum number of pairs to return (defaults to 3)
# ========================================================================= #
def sequence_and_accession_number(i = 3)
  # Array#sample rejects negative counts, hence the clamp to zero.
  how_many = [i, 0].max
  @hash.keys.sample(how_many).map { |this_key|
    [this_key, @hash[this_key]]
  }
end

#set_location(i = nil) ⇒ Object

#

set_location

This method sets the base location of our input-file.

#

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 79

# ========================================================================= #
# === set_location
#
# Determines the path of the input file and stores it in @location,
# which is also the return value.
#
# Resolution steps, in order:
#
#   (1) For an Array, only its first element is used.
#   (2) An input consisting solely of digits is treated as a 1-based
#       position within Dir['*'] of the current directory. If there is
#       no entry at that position, the input is kept as it is.
#   (3) If no file exists at the resulting path, the file of the same
#       name inside AA_DIR is assumed.
#   (4) A name without any '/' is prefixed with return_pwd().
#
# i - the input location (String, Array or anything responding to to_s)
# ========================================================================= #
def set_location(i = nil)
  i = i.first if i.is_a? Array # For now, use only the first element, if it is an Array.
  i = i.to_s
  if i =~ /\A\d+\z/ # if the input consists of only numbers.
    # Dir#[] yields nil for an out-of-range position; carrying that nil
    # on would make the File.exist? check below raise a TypeError.
    entry_at_position = Dir['*'][i.to_i - 1]
    i = entry_at_position if entry_at_position
  end
  unless File.exist? i # try a rescue in this case.
    i = AA_DIR+File.basename(i)
  end
  # ===================================================================== #
  # Expand to the proper path next:
  # ===================================================================== #
  i = return_pwd+i unless i.include? '/'
  @location = i
end

#status?Boolean Also known as: report_status, report

#

status? (status tag)

We report some data about the dataset.

This is useful for finding out about:

(1) how many entries exist in our fasta file
(2) when the file was last modified
(3) how big the file is
#

Returns:

  • (Boolean)

328
329
330
331
332
333
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 328

# ========================================================================= #
# === status? (status tag)
#
# Prints a summary of the dataset by running the individual reports in
# this order:
#
#   (1) the number of entries in the fasta file
#   (2) the time the file was last modified
#   (3) the size of the file
#   (4) whether the data is nucleotide or protein data
#
# The return value is whatever the last report returns.
# ========================================================================= #
def status?
  report_how_many_entries_exist
  report_last_modification_time
  report_filesize
  report_whether_it_is_nt_or_proteine
end

#try_to_locate_info_file(i = @location) ⇒ Object

#

try_to_locate_info_file

#

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 114

# ========================================================================= #
# === try_to_locate_info_file
#
# Looks inside INFO_DIR for an info file that belongs to the given
# fasta file. The lookup name is the file name without its directory,
# without its extension and without any '_pep' part.
#
# If a match is found it is parsed via Info.parse; when the parsed data
# carries a taxonomy id, the 'names' table is queried for that id via
# the PostgreSQL command line client. Otherwise a notice is printed.
#
# i - path of the fasta file; defaults to @location
# ========================================================================= #
def try_to_locate_info_file(
    i = @location
  )
  # Strip the extension literally. (Building a regex from it would make
  # the '.' match any character and would also remove matches in the
  # middle of the name, e.g. 'alfa.fa' => 'a'.)
  i = File.basename(i, File.extname(i))
  i = i.gsub('_pep', '')
  glob_pattern = INFO_DIR+i+'*'
  results = Dir[glob_pattern]
  if results.empty?
    e 'Could not find any entry for '+sfancy(glob_pattern)+'.'
  else
    pp results
    data = Info.parse(results.first)
    if data.taxonomy_id
      set_pgpassword
      # Next, find out the tax ID through a postgre query:
      # NOTE(review): the id is pasted into a shell command as it is;
      # this presumes that info files only ever hold plain numeric ids -
      # confirm, or validate the id before use.
      command = ''.dup
      command << POSTGRE_LOGIN_COMMAND
      command << ' --command="'
      # Interpolation also copes with an id that is not a String.
      command << "select * from names where tax_id=#{data.taxonomy_id} LIMIT 15;\""
      esystem command
    end
  end
end

#try_to_read_in_the_datasetObject

#

try_to_read_in_the_dataset

#

70
71
72
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 70

# ========================================================================= #
# === try_to_read_in_the_dataset
#
# Reads the content of the file at @location into @data. @data is left
# untouched when @location does not point to a regular file. A directory
# is skipped as well, since reading it would raise an error.
# ========================================================================= #
def try_to_read_in_the_dataset
  @data = File.read(@location) if File.file?(@location)
end

#type?Boolean Also known as: type

#

type?

#

Returns:

  • (Boolean)

346
347
348
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 346

# ========================================================================= #
# === type?
#
# Getter for @type. It is nil after reset() and is set to 'DNA' or
# 'Protein' by is_it_dna_or_protein?.
# ========================================================================= #
def type?
  @type
end