Class: Bioroebe::Taxonomy::ParseFasta
Overview
#
Bioroebe::Taxonomy::ParseFasta
#
Constant Summary
collapse
- N_ENTRIES =
3
Constants included
from Constants
Constants::AA_DIR, Constants::ARRAY_PROJECT_FILES, Constants::Archaea_Taxonomy_ID, Constants::BASE, Constants::BASE_URL, Constants::BE_VERBOSE, Constants::Bacteria_Taxonomy_ID, Constants::CITATIONS, Constants::CURATED_DIR, Constants::DATA_DIR, Constants::DELNODES, Constants::DIVISION, Constants::Eukaryota_Taxonomy_ID, Constants::FILE_USE_THIS_DATABASE, Constants::GEM_DIR, Constants::GENCODE, Constants::INCOMING_DIR, Constants::INFO_DIR, Constants::LAST_INTERACTIVE_COMMAND, Constants::LOCALOME_DIR, Constants::LOCAL_MIRROR, Constants::MERGED, Constants::MODULE_PATH, Constants::NAMES, Constants::NAMES_SQL, Constants::NCBI_BASE, Constants::NODES, Constants::NODES_SQL, Constants::NT_DIR, Constants::POSTGRESQL_QUERY_SIZE, Constants::POSTGRE_LOGIN_COMMAND, Constants::PROJECT_DOC_DIR, Constants::SEQUENCES_DIR, Constants::SHARED_HOME, Constants::TAXONOMY_BROWSER, Constants::TEMP_DIR, Constants::TEST_DIR, Constants::TMP_DIR, Constants::URL1
CommandlineApplication::OLD_VERBOSE_VALUE
ColoursForBase::ARRAY_HTML_COLOURS_IN_USE
Constants inherited
from Base
Base::NAMESPACE
Class Method Summary
collapse
Instance Method Summary
collapse
Methods included from Shared
be_quiet, #be_verbose?, be_verbose?, #cd, #edit_login_file, #eliminate_tabulator, #ensure_that_download_dir_exists, #ensure_that_temp_dir_exists, #mkdir, #readlines, #set_pgpassword, #show_password, #show_time_now, #split_at, #split_at_tabulator, #tokenize
Methods included from Constants
#info_dir?, #work_directory?
#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into
Methods included from BaseModule
#absolute_path, #default_file_read, #file_readlines
#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments
#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?
Methods inherited from Base
#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into
#internal_hash?, #reset_the_internal_hash
#infer_the_namespace, #namespace?
Constructor Details
#initialize(i, run_already = true) ⇒ ParseFasta
34
35
36
37
38
39
40
41
42
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 34
# Set up a new parser for the fasta file designated by +i+.
#
# i           - designates the input file; it is handed to set_location().
# run_already - when truthy (the default), run() is invoked right away.
def initialize(i, run_already = true)
  reset
  set_location(i)
  try_to_read_in_the_dataset
  return unless run_already
  run
end
|
Class Method Details
.batch_process(from_this_directory = AA_DIR) ⇒ Object
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 285
# Process every '*.fa' file found in +from_this_directory+. For each file a
# ParseFasta instance is created, its status is reported and the matching
# info file is looked up.
#
# from_this_directory - the directory to scan (defaults to AA_DIR). A
#                       trailing '/' is optional: it is appended when
#                       missing, so that the glob cannot match sibling
#                       paths that merely share the directory's name prefix.
def self.batch_process(from_this_directory = AA_DIR)
  e 'We will now process all fasta files from directory '+
    sdir(from_this_directory)+'.'
  directory = from_this_directory.to_s
  # An empty string keeps meaning "relative to the current directory".
  directory += '/' unless directory.empty? || directory.end_with?('/')
  entries = Dir[directory+'*.fa']
  e 'There are '+sfancy(entries.size.to_s)+' entries in '\
    'that directory.'+N+N
  entries.each {|entry|
    cliner
    e
    parser = ParseFasta.new(entry)
    parser.report_status
    parser.try_to_locate_info_file
    e
  }
  e 'Finished!'
end
|
Instance Method Details
#all_accession_entries ⇒ Object
#
all_accession_entries
Return all accession entries here.
#
307
308
309
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 307
# Return all accession entries, i.e. every key of @hash, as an Array.
def all_accession_entries
  @hash.each_key.to_a
end
|
#debug ⇒ Object
192
193
194
195
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 192
# Pretty-print @hash, followed by the number of entries it holds.
def debug
  pp(@hash)
  pp(@hash.size)
end
|
#filesize(i = @location) ⇒ Object
167
168
169
170
171
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 167
# Return the size, in bytes, of the file at +i+ as a String.
#
# i - path of the file to inspect (defaults to @location).
#
# An existing but empty file yields '0'. File.size? alone returns nil for
# such a file, which would turn into an empty String. A missing file still
# yields ''.
def filesize(
    i = @location
  )
  return '0' if File.zero?(i)
  File.size?(i).to_s
end
|
#hash? ⇒ Boolean
Also known as:
hash
143
144
145
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 143
# Return @hash, the Hash that run_everything() fills with one entry per
# fasta header line.
# NOTE(review): the documentation lists `hash` as an alias of this method,
# which would shadow Object#hash - confirm that this is intended.
def hash?
  @hash
end
|
#is_it_dna_or_protein? ⇒ Boolean
#
is_it_dna_or_protein?
This method sets the @type variable. It makes use of class IsDNA for this.
#
267
268
269
270
271
272
273
274
275
276
277
278
279
280
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 267
# Determine whether the dataset holds DNA or protein sequences.
#
# N_ENTRIES [accession, sequence] pairs are fetched through
# sequence_and_accession_number() and the sequences are handed to IsDNA.
# Sets @only_accession_numbers, @only_sequences, @type ('DNA' or
# 'Protein') and @total_characters. The value assigned to
# @total_characters is returned.
def is_it_dna_or_protein?
  these_entries = sequence_and_accession_number(N_ENTRIES)
  # Split the [accession, sequence] pairs into two parallel arrays.
  @only_accession_numbers, @only_sequences = these_entries.transpose
  checker = IsDNA.new(@only_sequences)
  @type = checker.is_dna? ? 'DNA' : 'Protein'
  @total_characters = checker.total_characters
end
|
#location? ⇒ Boolean
Also known as:
location
341
342
343
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 341
# Reader for @location, the path assigned by set_location().
def location?
  defined?(@location) ? @location : nil
end
|
#modification_time ⇒ Object
#
modification_time
Returns the modification time in the German format (HH:MM:SS, DD.MM.YYYY).
#
185
186
187
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 185
# Return the last-modification time of the file at @location as a String
# of the form 'HH:MM:SS, DD.MM.YYYY'.
def modification_time
  last_modified = File.mtime(@location)
  last_modified.strftime('%H:%M:%S, %d.%m.%Y')
end
|
#n_entries ⇒ Object
314
315
316
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 314
# Return how many accession entries exist, as reported by
# all_accession_entries().
def n_entries
  all_accession_entries.length
end
|
#report_filesize ⇒ Object
159
160
161
162
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 159
# Print a one-line message stating the size, in bytes, of the file at
# @location.
def report_filesize
  coloured_path = sfile(@location)
  coloured_size = sfancy(filesize)
  e 'The filesize of '+coloured_path+' is '+coloured_size+' Bytes.'
end
|
#report_how_many_entries_exist ⇒ Object
#
report_how_many_entries_exist
#
150
151
152
153
154
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 150
# Print how many fasta entries (keys of @hash) were found in the file at
# @location.
def report_how_many_entries_exist
  coloured_count = sfancy(@hash.size)
  coloured_file  = sfile(@location)
  e 'We have '+coloured_count+' Fasta entries (subsections) in the file '+
    coloured_file+'.'
end
|
#report_last_modification_time ⇒ Object
#
report_last_modification_time
#
176
177
178
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 176
# Print the time at which the file was last modified, as returned by
# modification_time().
def report_last_modification_time
  coloured_time = simp(modification_time)
  e 'The file was last modified at '+coloured_time
end
|
#report_one_sequence ⇒ Object
98
99
100
101
102
103
104
105
106
107
108
109
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 98
# Print the first sequence of the dataset, or a notice when none exists.
#
# When Bioroebe.do_truncate? is truthy and the sequence is longer than
# show_n_characters + 1 characters, only the first show_n_characters
# characters are shown, followed by ' [TRUNCATED]'.
def report_one_sequence
  first_sequence = sequence.first
  show_n_characters = 45
  if first_sequence
    if Bioroebe.do_truncate? && (first_sequence.size > show_n_characters + 1)
      first_sequence = first_sequence[0, show_n_characters]+' [TRUNCATED]'
    end
    e 'The (at least one) sequence is: '+sfancy(first_sequence)
  else
    e 'We could not find any sequence.'
  end
end
|
#report_whether_it_is_nt_or_proteine ⇒ Object
#
report_whether_it_is_nt_or_proteine
We find out whether we have NT data or protein (polypeptide) data. To determine this, we have to make use of another class.
Also note that for this method to correctly work, we must call the method is_it_dna_or_protein? first.
#
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 241
# Report whether the dataset was classified as DNA or protein, and on
# which accession numbers that conclusion is based.
#
# is_it_dna_or_protein? must have been called first: this method reads
# @type, @only_accession_numbers, @only_sequences and @total_characters.
#
# For DNA datasets the GC content is reported as well, provided that a
# top-level CalculateGCContent constant is defined.
def report_whether_it_is_nt_or_proteine
  # Never claim more checked entries than the file actually holds. The
  # count is always a String, so the colour helpers receive one type.
  n_entries = [N_ENTRIES, @hash.size].min.to_s
  joined_accession_numbers = @only_accession_numbers.each_with_index.map { |element, index|
    '('+lightblue((index + 1).to_s)+') '+element
  }.join(pink(' / '))
  e 'We assume that this dataset is '+sfancy(@type)+'. This conclusion '\
    'is based on analyzing/checking '
  e simp(n_entries)+' different entries (subsections) in the file '+
    sfile(File.basename(@location))+', for a total of '+sfancy(@total_characters)+' '+
    'characters (=tokens) (the '+simp(n_entries)+' Accession Numbers that were '+
    'checked are: '+joined_accession_numbers+').'
  e 'The allowed entries for DNA-sequences that we used for this assessment were: '+
    simp(IsDNA::ARRAY_VALID_SEQUENCES.join(', '))
  return unless @type == 'DNA'
  return unless Object.const_defined?(:CalculateGCContent)
  e 'The percentage of the GC content is: '
  CalculateGCContent.new(@only_sequences).report
end
|
#reset ⇒ Object
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 47
# Reset the internal state: after delegating to the parent's reset, @hash
# becomes an empty Hash while @data, @pointer and @type become nil.
def reset
  super()
  @hash = {}
  @data = @pointer = @type = nil
end
|
#run ⇒ Object
373
374
375
376
377
378
379
380
381
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 373
# Entry point: delegates to run_everything(). When the user interrupts the
# run (Interrupt), a short notice is printed and the program exits.
def run
  run_everything
rescue Interrupt
  e
  e 'User requested exit. Thus exiting now gracefully.'
  exit
end
|
#run_everything ⇒ Object
355
356
357
358
359
360
361
362
363
364
365
366
367
368
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 355
# Parse @data into @hash and then classify the dataset.
#
# Every line that contains '>' starts a new entry: the line, with all '>'
# characters removed, becomes the key, and the lines that follow are
# appended to that entry's value. Lines that appear before the first
# header are skipped. Does nothing (and returns nil) when @data is unset.
def run_everything
  return unless @data
  @data.split("\n").each {|raw_line|
    line = raw_line.chomp # Also drops a "\r" left over from "\r\n" line ends.
    if line.include? '>'
      @pointer = line.delete('>')
      @hash[@pointer] = ''.dup
    else
      current = @hash[@pointer]
      current << line unless current.nil?
    end
  }
  is_it_dna_or_protein?
end
|
#sequence(return_n_entries = 1) ⇒ Object
#
sequence
By default, this method returns one entry, the first entry. We can however pass another number, in which case more than one entry is returned.
#
204
205
206
207
208
209
210
211
212
213
214
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 204
# Return an Array of sequences (values of @hash).
#
# return_n_entries - how many sequences to return (default 1).
#
# With the default of 1 the first sequence is returned. For any other
# number, that many sequences are drawn at random; each draw is
# independent, so the same sequence may appear more than once.
def sequence(return_n_entries = 1)
  return [@hash.values.first] if return_n_entries == 1
  sequences = @hash.values # Computed once rather than on every draw.
  return_n_entries.times.map { sequences.sample }
end
|
#sequence_and_accession_number(i = 3) ⇒ Object
#
sequence_and_accession_number
This method is similar to the above method called sequence(), but it also returns the accession number of the fasta file.
#
222
223
224
225
226
227
228
229
230
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 222
# Like sequence(), but each result also carries the accession number.
#
# i - how many [accession, sequence] pairs to return (default 3).
#
# Each pair is drawn at random from @hash.
# NOTE(review): draws are independent, so the same entry can be returned
# more than once - confirm that this is intended.
def sequence_and_accession_number(i = 3)
  i.times.map {
    accession = @hash.keys.sample
    [accession, @hash[accession]]
  }
end
|
#set_location(i = nil) ⇒ Object
#
set_location
This method sets the base location of our input-file.
#
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 79
# Set @location, the path of the input file.
#
# i - a path, an Array whose first element is used, or a purely numeric
#     String that is treated as a 1-based position into the entries of
#     the current directory.
#
# A path that does not exist is looked up by its basename below AA_DIR.
# A path without any '/' is prefixed with return_pwd.
def set_location(i = nil)
  i = i.first if i.is_a? Array
  i = i.to_s
  if i =~ /^\d+$/
    nth_entry = Dir['*'][i.to_i - 1]
    # An out-of-range position yields nil; keep the input as it is then,
    # rather than passing nil on to File.exist?.
    i = nth_entry if nth_entry
  end
  unless File.exist? i
    i = AA_DIR+File.basename(i)
  end
  i = return_pwd+i unless i.include? '/'
  @location = i
end
|
#status? ⇒ Boolean
Also known as:
report_status, report
#
status? (status tag)
We report some data about the dataset.
This is useful for finding out about:
(1) how many entries exist in our fasta file
(2) when the file was last modified
(3) how big the file is
#
330
331
332
333
334
335
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 330
# Report the entry count, the last modification time, the file size and
# finally whether the dataset is nucleotide or protein data. The result
# of that last report is returned.
def status?
  %i[
    report_how_many_entries_exist
    report_last_modification_time
    report_filesize
  ].each { |step| send(step) }
  report_whether_it_is_nt_or_proteine
end
|
#try_to_locate_info_file(i = @location) ⇒ Object
#
try_to_locate_info_file
#
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 114
# Look for an info file below INFO_DIR that belongs to the fasta file +i+
# and, when it names a taxonomy id, query the names table for that id.
#
# i - path of the fasta file (defaults to @location). Its basename, minus
#     the extension and any '_pep' part, is used as the lookup prefix.
def try_to_locate_info_file(
    i = @location
  )
  # Strip the extension literally and only at the end; interpolating it
  # into a regex would let '.' match any character anywhere in the name.
  i = File.basename(i, File.extname(i))
  i = i.gsub('_pep', '')
  pattern = INFO_DIR+i+'*'
  results = Dir[pattern]
  if results.empty?
    e 'Could not find any entry for '+sfancy(pattern)+'.'
  else
    pp results
    data = Info.parse(results.first)
    if data.taxonomy_id
      taxonomy_id = data.taxonomy_id.to_s.strip
      # The id is spliced into a shell command and an SQL statement, so
      # anything other than plain digits is refused.
      unless taxonomy_id =~ /\A\d+\z/
        e 'Ignoring the non-numeric taxonomy id '+sfancy(taxonomy_id)+'.'
        return
      end
      set_pgpassword
      command = ''.dup
      command << POSTGRE_LOGIN_COMMAND
      command << ' --command="'
      command << 'select * from names where tax_id='+taxonomy_id+' LIMIT 15;"'
      esystem command
    end
  end
end
|
#try_to_read_in_the_dataset ⇒ Object
#
try_to_read_in_the_dataset
#
70
71
72
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 70
# Read the file at @location into @data. Nothing happens when the path
# does not exist. A directory is skipped as well: File.exist? is true for
# directories, but File.read would raise on them.
def try_to_read_in_the_dataset
  return unless File.exist?(@location)
  return if File.directory?(@location)
  @data = File.read(@location)
end
|
#type? ⇒ Boolean
Also known as:
type
348
349
350
|
# File 'lib/bioroebe/taxonomy/parse_fasta.rb', line 348
# Reader for @type, which is_it_dna_or_protein? sets to 'DNA' or 'Protein'
# and which reset() sets to nil.
def type?
  instance_variable_get(:@type)
end
|