Module: Bioroebe::Taxonomy
- Includes:
- Constants
- Included in:
- Taxonomy, Interactive
- Defined in:
- lib/bioroebe/taxonomy/edit.rb,
lib/bioroebe/taxonomy/menu.rb,
lib/bioroebe/taxonomy/node.rb,
lib/bioroebe/taxonomy/chart.rb,
lib/bioroebe/project/project.rb,
lib/bioroebe/taxonomy/shared.rb,
lib/bioroebe/taxonomy/colours.rb,
lib/bioroebe/taxonomy/taxonomy.rb,
lib/bioroebe/taxonomy/constants.rb,
lib/bioroebe/taxonomy/help/help.rb,
lib/bioroebe/taxonomy/info/info.rb,
lib/bioroebe/taxonomy/info/is_dna.rb,
lib/bioroebe/taxonomy/interactive.rb,
lib/bioroebe/taxonomy/parse_fasta.rb,
lib/bioroebe/taxonomy/class_methods.rb,
lib/bioroebe/taxonomy/help/helpline.rb,
lib/bioroebe/toplevel_methods/taxonomy.rb,
lib/bioroebe/taxonomy/info/check_available.rb
Overview
Bioroebe::Taxonomy
Defined Under Namespace
Modules: Constants, Shared Classes: Chart, CheckAvailable, Helpline, Info, Interactive, IsDNA, Node, ParseFasta
Constant Summary collapse
- PROJECT_YAML_DIR =
#
Bioroebe::Taxonomy::PROJECT_YAML_DIR
#
::Bioroebe.project_base_directory?
Constants included from Constants
Constants::AA_DIR, Constants::ARRAY_PROJECT_FILES, Constants::Archaea_Taxonomy_ID, Constants::BASE, Constants::BASE_URL, Constants::BE_VERBOSE, Constants::Bacteria_Taxonomy_ID, Constants::CITATIONS, Constants::CURATED_DIR, Constants::DATA_DIR, Constants::DELNODES, Constants::DIVISION, Constants::Eukaryota_Taxonomy_ID, Constants::FILE_USE_THIS_DATABASE, Constants::GEM_DIR, Constants::GENCODE, Constants::INCOMING_DIR, Constants::INFO_DIR, Constants::LAST_INTERACTIVE_COMMAND, Constants::LOCALOME_DIR, Constants::LOCAL_MIRROR, Constants::MERGED, Constants::MODULE_PATH, Constants::NAMES, Constants::NAMES_SQL, Constants::NCBI_BASE, Constants::NODES, Constants::NODES_SQL, Constants::NT_DIR, Constants::POSTGRESQL_QUERY_SIZE, Constants::POSTGRE_LOGIN_COMMAND, Constants::PROJECT_DOC_DIR, Constants::SEQUENCES_DIR, Constants::SHARED_HOME, Constants::TAXONOMY_BROWSER, Constants::TEMP_DIR, Constants::TEST_DIR, Constants::TMP_DIR, Constants::URL1
Class Method Summary collapse
-
.be_verbose? ⇒ Boolean
# === Taxonomy.be_verbose? ========================================================================= #.
-
.cd(i = '..') ⇒ Object
# === Bioroebe::Taxonomy.cd ========================================================================= #.
-
.cleanup ⇒ Object
# === Taxonomy.cleanup.
-
.download_directory? ⇒ Boolean
# === Bioroebe::Taxonomy.download_directory? ========================================================================= #.
-
.e(i = '') ⇒ Object
# === Bioroebe::Taxonomy.e ========================================================================= #.
-
.enable_colours ⇒ Object
# === Bioroebe::Taxonomy.enable_colours ========================================================================= #.
-
.ensure_that_temp_dir_exists ⇒ Object
# === Taxonomy.ensure_that_temp_dir_exists ========================================================================= #.
-
.interactive(i = nil) ⇒ Object
# === Taxonomy.interactive.
-
.load(_ = 'taxonomy/citations.dmp') ⇒ Object
# === Taxonomy.load.
-
.project_base_dir? ⇒ Boolean
# === Bioroebe::Taxonomy.project_base_dir?.
-
.project_yaml_dir? ⇒ Boolean
# === Bioroebe::Taxonomy.project_yaml_dir? ========================================================================= #.
-
.report_n_species ⇒ Object
# === Taxonomy.report_n_species.
-
.save_into_which_file? ⇒ Boolean
# === Bioroebe::Taxonomy.save_into_which_file? ========================================================================= #.
-
.save_when_we_last_updated_the_database(be_verbose = ::Bioroebe::BE_VERBOSE) ⇒ Object
# === Taxonomy.save_when_we_last_updated_the_database.
-
.show_current_time_and_date ⇒ Object
# === Taxonomy.show_current_time_and_date ========================================================================= #.
-
.show_remote_urls_to_the_NCBI_taxonomy_webpage(optional_id = nil) ⇒ Object
# === Bioroebe.show_remote_urls_to_the_NCBI_taxonomy_webpage.
-
.status ⇒ Object
# === Bioroebe::Taxonomy.status.
-
.status? ⇒ Boolean
# === Bioroebe::Taxonomy.status? (status tag).
-
.taxonomy_download_directory? ⇒ Boolean
# === Bioroebe::Taxonomy.taxonomy_download_directory?.
-
.update(this_dir = ::Bioroebe.taxonomy_download_directory?, be_verbose = true) ⇒ Object
# === Bioroebe::Taxonomy.update (download tag).
-
.use_colours? ⇒ Boolean
# === Bioroebe::Taxonomy.use_colours? ========================================================================= #.
Instance Method Summary collapse
-
#array_size?(array_input = citations?) ⇒ Boolean
# === array_size?.
-
#citations? ⇒ Boolean
# === citations?.
-
#clean(i) ⇒ Object
(also: #remove_delimiters)
# === clean.
-
#delnodes? ⇒ Boolean
# === delnodes?.
-
#division? ⇒ Boolean
# === division?.
-
#edit(i = '') ⇒ Object
# === edit (edit tag, ed tag).
-
#edit_gemspec ⇒ Object
# === edit_gemspec ========================================================================= #.
-
#edit_instructions ⇒ Object
# === edit_instructions ========================================================================= #.
-
#edit_password ⇒ Object
# === edit_password.
-
#edit_shared_code_file ⇒ Object
# === edit_shared_code_file ========================================================================= #.
-
#edit_taxonomy ⇒ Object
# === edit_taxonomy ========================================================================= #.
-
#find_highest_entries_in_sql ⇒ Object
# === find_highest_entries_in_sql.
-
#gencode? ⇒ Boolean
# === gencode?.
-
#generate_html_links_for(i) ⇒ Object
# === generate_html_links_for.
-
#get_id_of(id = 9606, be_verbose = true) ⇒ Object
# === get_id_of.
-
#get_parent_id_of(i) ⇒ Object
# === get_parent_id_of.
-
#get_scientific_name_of(taxid) ⇒ Object
(also: #get_scientific_name)
# === get_scientific_name_of.
-
#merged? ⇒ Boolean
# === merged?.
-
#names?(i = NAMES) ⇒ Boolean
# === names?.
-
#nodes?(i = NODES) ⇒ Boolean
# === nodes?.
-
#pad(i) ⇒ Object
# === pad.
-
#pad_properly(i = "5\t|\tThe domestic cat: perspective on the nature and diversity of cats.\t|\t0\t|\t8603894\t|\t \t|\t\t|\t9685 \t|\n") ⇒ Object
# === pad_properly.
-
#pad_sql(i) ⇒ Object
# === pad_sql.
-
#project_base_dir? ⇒ Boolean
(also: #base_dir?)
# === project_base_dir? ========================================================================= #.
-
#remove_delimiter(i) ⇒ Object
# === remove_delimiter.
-
#return_current_hours_minutes_seconds ⇒ Object
# === return_current_hours_minutes_seconds ========================================================================= #.
-
#return_full_lineage_of(i = nil) ⇒ Object
# === return_full_lineage_of.
-
#search_id(i = '7460') ⇒ Object
(also: #search, #id?)
# === search_id.
-
#show_current_time_and_date ⇒ Object
# === show_current_time_and_date ========================================================================= #.
-
#show_help ⇒ Object
# === show_help (help tag).
-
#split(i = citations? ) ⇒ Object
# === split (split tag).
-
#test(use_this_url = URL1) ⇒ Object
# === test ========================================================================= #.
-
#to_utf(i) ⇒ Object
(also: #sanitize)
# === to_utf ========================================================================= #.
-
#verify_proper_sql_structures ⇒ Object
# === verify_proper_sql_structures (debug tag).
Methods included from Shared
be_quiet, be_verbose?, cd, edit_login_file, eliminate_tabulator, ensure_that_download_dir_exists, ensure_that_temp_dir_exists, mkdir, readlines, set_pgpassword, show_password, show_time_now, split_at, split_at_tabulator, tokenize
Methods included from Constants
Class Method Details
.be_verbose? ⇒ Boolean
#
Taxonomy.be_verbose?
#
136 137 138 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 136
# === Taxonomy.be_verbose?
#
# Query the verbosity setting of the Taxonomy module. The answer is the
# value of the constant Taxonomy::Constants::BE_VERBOSE.
def self.be_verbose?
  return Taxonomy::Constants::BE_VERBOSE
end
.cd(i = '..') ⇒ Object
#
Bioroebe::Taxonomy.cd
#
74 75 76 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 74
# === Bioroebe::Taxonomy.cd
#
# Forward a change-directory request to ::Bioroebe.cd. When no target is
# given, '..' is passed along.
def self.cd(i = '..')
  return ::Bioroebe.cd(i)
end
.cleanup ⇒ Object
#
Taxonomy.cleanup
Get rid of some .sql files through this method here.
#
100 101 102 103 104 105 106 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 100
# === Taxonomy.cleanup
#
# Remove the files 'names.sql' and 'nodes.sql' (paths as given, i.e.
# relative to the current working directory) if they exist.
def self.cleanup
  e 'Trying to remove some files now.'
  names_sql = 'names.sql'
  remove_file(names_sql) if File.exist?(names_sql)
  nodes_sql = 'nodes.sql'
  remove_file(nodes_sql) if File.exist?(nodes_sql)
end
.download_directory? ⇒ Boolean
#
Bioroebe::Taxonomy.download_directory?
#
64 65 66 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 64
# === Bioroebe::Taxonomy.download_directory?
#
# Return the value of the constant DOWNLOAD_DIR.
def self.download_directory?
  return DOWNLOAD_DIR
end
.e(i = '') ⇒ Object
#
Bioroebe::Taxonomy.e
#
67 68 69 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 67
# === Bioroebe::Taxonomy.e
#
# Hand the given value (an empty String by default) over to ::Bioroebe.e.
def self.e(i = '')
  return ::Bioroebe.e(i)
end
.enable_colours ⇒ Object
#
Bioroebe::Taxonomy.enable_colours
#
21 22 23 24 |
# File 'lib/bioroebe/taxonomy/colours.rb', line 21
# === Bioroebe::Taxonomy.enable_colours
#
# Announce the change and then delegate to ::Bioroebe.enable_colours.
def self.enable_colours
  e('Enabling colours.')
  return ::Bioroebe.enable_colours
end
.ensure_that_temp_dir_exists ⇒ Object
#
Taxonomy.ensure_that_temp_dir_exists
#
81 82 83 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 81
# === Taxonomy.ensure_that_temp_dir_exists
#
# Delegate to ::Bioroebe.ensure_that_the_base_directories_exist.
def self.ensure_that_temp_dir_exists
  return ::Bioroebe.ensure_that_the_base_directories_exist
end
.interactive(i = nil) ⇒ Object
#
Taxonomy.interactive
Invoke this method if you wish to directly invoke the interactive component of the Taxonomy module.
#
1961 1962 1963 |
# File 'lib/bioroebe/taxonomy/interactive.rb', line 1961
# === Taxonomy.interactive
#
# Entry point for the interactive component: instantiate
# ::Bioroebe::Taxonomy::Interactive with the given argument and return
# that instance.
def self.interactive(i = nil)
  return ::Bioroebe::Taxonomy::Interactive.new(i)
end
.load(_ = 'taxonomy/citations.dmp') ⇒ Object
#
Taxonomy.load
Load a specific .dmp file via this method.
The first argument should be the target location of the file that we wish to load (we assume this to be a local file for now).
#
116 117 118 119 120 121 122 123 124 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 116
# === Taxonomy.load
#
# Read a local .dmp file line by line and pass every line through
# sanitize(), because File.readlines() may read in an invalid encoding.
#
# Returns the Array of sanitized lines, or nil (after reporting the
# problem) when no file exists at the given location.
def self.load(_ = 'taxonomy/citations.dmp')
  unless File.exist?(_)
    e 'File at location `'+_+'` does not exist.'
    return nil
  end
  File.readlines(_).map { |raw_line| sanitize(raw_line) }
end
.project_base_dir? ⇒ Boolean
#
Bioroebe::Taxonomy.project_base_dir?
The Taxonomy project has been fully integrated into the Bioroebe namespace in the year ~2015.
This method will return a String such as:
"/Programs/Ruby/2.6.4/lib/ruby/site_ruby/2.6.0/bioroebe/taxonomy/"
#
249 250 251 |
# File 'lib/bioroebe/project/project.rb', line 249
# === Bioroebe::Taxonomy.project_base_dir?
#
# Return the Bioroebe project base directory with 'taxonomy/' appended.
def self.project_base_dir?
  bioroebe_base = ::Bioroebe.project_base_directory?
  return "#{bioroebe_base}taxonomy/"
end
.project_yaml_dir? ⇒ Boolean
#
Bioroebe::Taxonomy.project_yaml_dir?
#
256 257 258 |
# File 'lib/bioroebe/project/project.rb', line 256
# === Bioroebe::Taxonomy.project_yaml_dir?
#
# Return the value of the constant Taxonomy::PROJECT_YAML_DIR.
def self.project_yaml_dir?
  return Taxonomy::PROJECT_YAML_DIR
end
.report_n_species ⇒ Object
#
Taxonomy.report_n_species
This method will report how many species are registered in the NCBI database.
#
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 146
# === Taxonomy.report_n_species
#
# Fetch the NCBI taxonomy statistics page, collect the table row that
# follows the "All taxa" link and report the number captured from it.
#
# The page is fetched through URI.open: Kernel#open stopped accepting
# URLs in Ruby 3.0. The block form makes sure the remote handle is
# closed again. If the regex does not match, an empty count is reported.
def self.report_n_species
  require 'open-uri' # Loaded lazily - only this method needs remote access.
  remote_url = 'http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=statistics&unclassified=hide&uncultured=hide&unspecified=hide&period=&from=&to='
  e 'We will now obtain how many species are registered. This '\
    'may take a while.'
  e
  e 'In 26.09.2011, the Taxonomy database included exactly'
  e '234_991 species with a formal name.'
  e
  # ======================================================================= #
  # See: http://rubular.com/r/awsCk2nF4D
  # ======================================================================= #
  regex_to_use = /gov\/taxonomy\/\?term.+">(\d+)<\/A><\/TD>\s+<\/TR>$/
  dataset = URI.open(remote_url) { |remote| remote.read }.split(N)
  _ = +'' # Unfrozen buffer for the lines of the "All taxa" row.
  is_open = false
  dataset.each {|line|
    _ << line if is_open
    if line.include? '<TR><TD><A HREF="/Taxonomy/Browser/wwwtax.cgi?id=1">All taxa</A></TD>'
      is_open = true
    elsif line.include? '</TR>'
      is_open = false
    end
  }
  # String#[] with a capture index yields nil when nothing matched.
  n_species = _[regex_to_use, 1].to_s.dup
  e 'We found `'+sfancy(n_species)+::Bioroebe.rev+'` registered species.'
end
.save_into_which_file? ⇒ Boolean
#
Bioroebe::Taxonomy.save_into_which_file?
#
26 27 28 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 26
# === Bioroebe::Taxonomy.save_into_which_file?
#
# Return the value of the constant
# TAXONOMY_NCBI_DATABASE_LAST_UPDATE_LOG_FILE.
def self.save_into_which_file?
  return TAXONOMY_NCBI_DATABASE_LAST_UPDATE_LOG_FILE
end
.save_when_we_last_updated_the_database(be_verbose = ::Bioroebe::BE_VERBOSE) ⇒ Object
#
Taxonomy.save_when_we_last_updated_the_database
Save into a file when we last updated the database. The target file is the one returned by save_into_which_file? (historically referred to as the constant SAVE_FILE, which tells us where to keep the save file, and can be found at: lib/taxonomy/shared/shared.rb).
#
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 37
# === Taxonomy.save_when_we_last_updated_the_database
#
# Write a "Last Update ..." line (current date and time) into the file
# returned by save_into_which_file?.
#
# Nothing is written unless the constant SHALL_WE_LOG_LAST_UPDATE is
# true and the directory of the target file is writable. The check uses
# File.writable?, which answers false for a missing directory instead of
# raising Errno::ENOENT as File.stat would.
#
# be_verbose - when truthy, announce the target file first.
def self.save_when_we_last_updated_the_database(
    be_verbose = ::Bioroebe::BE_VERBOSE
  )
  if be_verbose
    e "Trying to log last update into file "\
      "#{::Bioroebe.sfile(save_into_which_file?)} next."
  end
  return unless SHALL_WE_LOG_LAST_UPDATE
  into = save_into_which_file?
  # Verify that we can actually write into the directory.
  return unless File.writable?(File.dirname(into))
  ::Bioroebe.write_what_into(
    'Last Update of the Taxonomy NCBI Database: '+
    ::Bioroebe.return_current_day_month_year+', '+
    ::Bioroebe.return_current_hours_minutes_seconds+N,
    into
  )
end
.show_current_time_and_date ⇒ Object
#
Taxonomy.show_current_time_and_date
#
492 493 494 495 496 497 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 492
# === Taxonomy.show_current_time_and_date
#
# Print the current time and date, but only when be_verbose? is truthy.
def self.show_current_time_and_date
  return unless be_verbose?
  clock = sfancy(return_current_hours_minutes_seconds)
  today = simp(return_current_day_month_year)
  e 'The current time is '+clock+', and today is the '+today+'.'
end
.show_remote_urls_to_the_NCBI_taxonomy_webpage(optional_id = nil) ⇒ Object
#
Bioroebe.show_remote_urls_to_the_NCBI_taxonomy_webpage
This method will show the remote URLs to different tax-IDs.
If no argument is given (thus no ID) then the NCBI taxonomy parts are displayed.
#
93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 93
# === Taxonomy.show_remote_urls_to_the_NCBI_taxonomy_webpage
#
# With an id: print the NCBI taxonomy-browser URL for that id.
# Without an id: print the general NCBI taxonomy URLs.
def self.show_remote_urls_to_the_NCBI_taxonomy_webpage(optional_id = nil)
  if optional_id
    return erev('http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='+optional_id.to_s)
  end
  erev 'The remote URLs towards the NCBI taxonomy part are:'
  e
  %w(
    http://www.ncbi.nlm.nih.gov/taxonomy
    http://www.ncbi.nlm.nih.gov/taxonomy?term=1
    http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi
  ).each {|remote_url|
    erev ' NCBI: '+sfancy(remote_url)
  }
  e
end
.status ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 32
# === Bioroebe::Taxonomy.status
#
# Report, for each of the known taxonomy data files, whether it exists
# below base_dir? and how large it is. Larger files are listed first.
#
# The sort key is the size of the file at the same full path that is
# reported afterwards. File.size? answers nil for a missing (or empty)
# file; that is mapped to 0 so that sort_by never has to compare nil
# with an Integer.
def self.status
  e "The status for the Taxonomy-related datafiles is as follows:#{N}"
  array = %w(
    names.dmp
    merged.dmp
    delnodes.dmp
    citations.dmp
    nodes.dmp
    taxdump.tar
  )
  sorted_array = array.sort_by {|entry| File.size?(base_dir?+entry).to_i }
  sorted_array.reverse.each {|entry|
    file = base_dir?+entry
    if File.exist? file
      filesize = File.size?(file)
      _ = ('The file `'+file+'` exists.').ljust(65, ' ')
      _ << (' (Filesize: '+filesize.to_s+')').ljust(30, ' ')
      e _
      if entry.include? 'dump.tar'
        e 'Consider extracting this .tar-file if it was not yet '+
          'extracted - it contains the various .dmp files.'
      end
    else
      e 'We could not find any file at '+sfile(file)
    end
  }
end
.status? ⇒ Boolean
#
Bioroebe::Taxonomy.status? (status tag)
Feedback the status to the user.
#
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 73
# === Bioroebe::Taxonomy.status? (status tag)
#
# Feedback the status to the user: print the localomics URL followed by
# the locations of the local mirror, the shared .sql folder and the
# individual data directories.
def self.status?
  e
  e 'The localomics URL should be at:'
  e ' '+sfancy('http://localomics.imp.univie.ac.at:8666/')
  e 'The shared data of the local mirror can be found at: '
  e ' '+sdir(LOCAL_MIRROR)
  e 'The shared folder (where we generate our .sql files '+
    'to) can be found at: '
  e ' '+sdir(AUTOGENERATED_SQL_FILES_DIR)
  # The remaining directories all share the same two-line report format.
  [LOCALOME_DIR, AA_DIR, NT_DIR, INFO_DIR, INCOMING_DIR].each {|directory|
    e 'The '+File.basename(directory)+'/ directory can be found at: '
    e ' '+sdir(directory)
  }
  e
end
.taxonomy_download_directory? ⇒ Boolean
#
Bioroebe::Taxonomy.taxonomy_download_directory?
This method will return the download directory for use in the Taxonomy subcomponent.
#
169 170 171 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 169
# === Bioroebe::Taxonomy.taxonomy_download_directory?
#
# Return the download directory of the Taxonomy subcomponent, which is
# the value of ::Bioroebe.log_dir? with 'taxonomy/' appended.
def self.taxonomy_download_directory?
  log_directory = ::Bioroebe.log_dir?
  return "#{log_directory}taxonomy/"
end
.update(this_dir = ::Bioroebe.taxonomy_download_directory?, be_verbose = true) ⇒ Object
#
Bioroebe::Taxonomy.update (download tag)
This class method will obtain the file taxdump.tar.gz.
By default we will download into the directory returned by ::Bioroebe.taxonomy_download_directory?. (Historically this was TEMP_DIR, which at the moment of documenting this method defaulted to /tmp/robert/ on the I.M.P. cluster, or rather the value that is stored in the constant DOWNLOAD_DIR.)
#
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 117
# === Bioroebe::Taxonomy.update (download tag)
#
# Download the taxonomy archive (URL_TO_TAXONOMY_ARCHIVE) into this_dir,
# extract it there and record when this happened.
#
# this_dir   - target directory; it is created if it does not exist.
# be_verbose - true/false; the Symbol :be_silent is treated as false.
#
# If the directory still does not exist after the attempt to create it
# and at_home? is true, the hardcoded fallback '/home/Temp/' is used.
# The fallback is applied regardless of verbosity, and the message names
# the directory that was actually missing.
def self.update(
    this_dir   = ::Bioroebe.taxonomy_download_directory?,
    be_verbose = true
  )
  be_verbose = false if be_verbose == :be_silent
  ::Bioroebe.show_time_now # Display the current start-time.
  ::Bioroebe.ensure_that_the_base_directories_exist
  ::Bioroebe.mkdir(this_dir) unless File.directory? this_dir
  unless Dir.exist? this_dir # Safeguard here.
    if at_home?
      missing_dir = this_dir
      this_dir = '/home/Temp/' # Hardcoded.
      if be_verbose
        e "The directory at #{sdir(missing_dir)} does not "\
          "exist, thus trying to"
        e 'use '+sdir(this_dir)+' instead.'
      end
    else # Then we create that directory.
      ::Bioroebe.mkdir(this_dir)
    end
  end
  if be_verbose
    e "Now trying to change to the base directory at "\
      "#{::Bioroebe.sdir(this_dir)}."
  end
  cd this_dir
  e N+'We will next attempt to download the file `'+
    ::Bioroebe.sfancy(URL_TO_TAXONOMY_ARCHIVE)+'`'
  e 'into the local directory `'+
    ::Bioroebe.sdir(this_dir)+'` via wget.'
  if be_verbose
    e '(This file will usually have a file size of about '\
      '~36 MB. [Last verification of this size: Dec 2016])'
  end
  local_file = File.basename(URL_TO_TAXONOMY_ARCHIVE)
  # Get rid of a stale archive before downloading a fresh one.
  ::Bioroebe.remove_file(local_file) if File.exist? local_file
  ::Bioroebe.wget_download(URL_TO_TAXONOMY_ARCHIVE)
  ::Bioroebe.extract(local_file)
  save_when_we_last_updated_the_database
end
.use_colours? ⇒ Boolean
#
Bioroebe::Taxonomy.use_colours?
#
14 15 16 |
# File 'lib/bioroebe/taxonomy/colours.rb', line 14
# === Bioroebe::Taxonomy.use_colours?
#
# Answer with whatever ::Bioroebe.use_colours? reports.
def self.use_colours?
  return ::Bioroebe.use_colours?
end
Instance Method Details
#array_size?(array_input = citations?) ⇒ Boolean
#
array_size?
Throwaway method that gives us back the size of the text entry (the field at index 5) of every record in a citation base.
#
266 267 268 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 266
# === array_size?
#
# For every entry of the input (the citations dataset by default), split
# the entry and return the size of the field at index 5.
#
# Returns an Array with one size per entry.
def array_size?(array_input = citations?)
  array_input.map { |citation| split(citation)[5].size }
end
#citations? ⇒ Boolean
#
citations?
Obtain the citations.dmp dataset through this method.
This dataset includes the following keys:
cit_id, cit_key, pubmed_id,
medline_id, url,
text, taxid_list
#
257 258 259 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 257
# === citations?
#
# Obtain the citations dataset by loading the file named by the constant
# CITATIONS through Taxonomy.load.
def citations?
  return Taxonomy.load(CITATIONS)
end
#clean(i) ⇒ Object Also known as: remove_delimiters
#
clean
Clean the input string from delimiters. Input can be Array or String.
#
124 125 126 127 128 129 130 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 124
# === clean
#
# Replace every match of MAIN_DELIMITER (interpolated into a regex) by a
# single space and trim the result. An Array is cleaned element by
# element, recursively.
def clean(i)
  case i
  when Array
    i.map { |element| clean(element) }
  else
    i.gsub(/#{MAIN_DELIMITER}/, ' ').chomp.strip # .chomp was added at 04.02.2014
  end
end
#delnodes? ⇒ Boolean
#
delnodes?
Obtain the delnodes.dmp dataset. This one has deleted nodes - nodes that existed but were deleted.
#
226 227 228 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 226
# === delnodes?
#
# Obtain the deleted-nodes dataset by loading the file named by the
# constant DELNODES through Taxonomy.load.
def delnodes?
  return Taxonomy.load(DELNODES)
end
#division? ⇒ Boolean
#
division?
Obtain the division.dmp dataset.
The Divisions file has these fields:
division id -- taxonomy database division id
division cde -- GenBank division code (three characters)
division name -- e.g. BCT, PLN, VRT, MAM, PRI...
Comments.
#
200 201 202 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 200
# === division?
#
# Obtain the division dataset by loading the file named by the constant
# DIVISION through Taxonomy.load.
def division?
  return Taxonomy.load(DIVISION)
end
#edit(i = '') ⇒ Object
#
edit (edit tag, ed tag)
This method allows us to quickly open the internal files.
We can typically use vim for this.
#
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 27
# === edit (edit tag, ed tag)
#
# Dispatch to one of the edit_* helpers, based on the given keyword. A
# leading or embedded 'edit_' is stripped first, so 'edit_gemspec' and
# 'gemspec' are equivalent. An unknown keyword does nothing and yields
# nil.
#
# The keyword is normalized on a copy: the caller's String is never
# modified, and frozen Strings are accepted.
def edit(i = '')
  i = i.to_s.gsub(/edit_/, '')
  case i
  # ======================================================================= #
  # === taxonomy
  # ======================================================================= #
  when /taxonomy/ # Edit the taxonomy "binary".
    edit_taxonomy
  # ======================================================================= #
  # === shared
  # ======================================================================= #
  when 'shared', 'shared_code'
    edit_shared_code_file
  # ======================================================================= #
  # === gem
  # ======================================================================= #
  when 'gem', 'gemspec'
    edit_gemspec
  # ======================================================================= #
  # === password
  # ======================================================================= #
  when /^password$/, 'passwd'
    edit_password
  # ======================================================================= #
  # === login
  # ======================================================================= #
  when 'login', 'main'
    edit_login_file
  # ======================================================================= #
  # === instructions
  # ======================================================================= #
  when '', 'instructions' # '' is default.
    edit_instructions
  end
end
#edit_gemspec ⇒ Object
#
edit_gemspec
#
71 72 73 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 71
# === edit_gemspec
#
# Run the editor reported by Bioroebe.editor? on the file returned by
# return_gemspec_file, via esystem.
def edit_gemspec
  editor_and_space = Bioroebe.editor?+' '
  esystem(editor_and_space+return_gemspec_file)
end
#edit_instructions ⇒ Object
#
edit_instructions
#
16 17 18 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 16
# === edit_instructions
#
# Run the editor reported by Bioroebe.editor? on whatever
# return_instructions yields, via esystem.
def edit_instructions
  editor = Bioroebe.editor?
  target = return_instructions
  esystem("#{editor} #{target}")
end
#edit_password ⇒ Object
#
edit_password
We can use this method to edit the login-file.
#
94 95 96 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 94
# === edit_password
#
# Open the login-file through edit_login_file, passing ' +27' along.
# NOTE(review): ' +27' looks like an editor "jump to line 27" option -
# confirm against edit_login_file.
def edit_password
  extra_argument = ' +27'
  edit_login_file(extra_argument)
end
#edit_shared_code_file ⇒ Object
#
edit_shared_code_file
#
85 86 87 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 85
# === edit_shared_code_file
#
# Run the editor reported by Bioroebe.editor? on the file returned by
# return_shared_code, via esystem.
def edit_shared_code_file
  editor_and_space = Bioroebe.editor?+' '
  esystem(editor_and_space+return_shared_code)
end
#edit_taxonomy ⇒ Object
#
edit_taxonomy
#
78 79 80 |
# File 'lib/bioroebe/taxonomy/edit.rb', line 78
# === edit_taxonomy
#
# Run the editor reported by Bioroebe.editor? on the file returned by
# return_taxonomy_file, via esystem.
def edit_taxonomy
  editor_and_space = Bioroebe.editor?+' '
  esystem(editor_and_space+return_taxonomy_file)
end
#find_highest_entries_in_sql ⇒ Object
#
find_highest_entries_in_sql
This method will find the highest entries in the sql database. This may be useful if we wish to optimize the database (i.e. find the best varchar attribute in question).
#
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 382
# === find_highest_entries_in_sql
#
# Scan names.dmp and nodes.dmp (both below TAXONOMY_DIR) and report, for
# the leading columns of each file, the length of the longest value
# found. This may help when choosing varchar sizes for the database.
def find_highest_entries_in_sql
  # Longest value per column, for the first n_columns columns of a file.
  widest_columns = lambda {|path, n_columns|
    widths = Array.new(n_columns, 0)
    readlines(path).each {|row|
      fields = split_at_tabulator(row)
      n_columns.times {|index|
        width = fields[index].to_s.size
        widths[index] = width if width > widths[index]
      }
    }
    widths
  }
  # All widths are shown right-aligned in a field of five characters.
  padded = lambda {|width| sfancy('%05s' % width.to_s) }

  e 'We will find the various highest entries in the sql files.'
  this_file = TAXONOMY_DIR+'names.dmp'
  e 'Starting with '+sfile(this_file)+' first:'
  e 'Names has 4 entries - we are interested in all of them.'
  e this_file
  e 'Please be patient, this may take a while ...'
  max_taxid, max_name_txt, max_unique_name, max_name_class =
    widest_columns.call(this_file, 4)
  e ' max_taxid is '+padded.call(max_taxid)+' characters long (should be an int anyway).'
  e ' max_name_txt is '+padded.call(max_name_txt)+' characters long.'
  e ' max_unique_name is '+padded.call(max_unique_name)+' characters long.'
  e ' max_name_class is '+padded.call(max_name_class)+' characters long.'
  # Next, we will work on nodes.dmp:
  this_file = TAXONOMY_DIR+'nodes.dmp'
  e 'Now working on '+sfile(this_file)+' first:'
  e 'Nodes has 3 relevant entries - taxid, parent_taxid and rank (but in total it has 13 entries)'
  e this_file
  e 'Please be patient, this may take a while ...'
  max_taxid, max_parent_taxid, max_rank = widest_columns.call(this_file, 3)
  e ' max_taxid is '+padded.call(max_taxid)+' characters long.'
  e ' max_parent_taxid is '+padded.call(max_parent_taxid)+' characters long.'
  e ' max_rank is '+padded.call(max_rank)+' characters long.'
end
#gencode? ⇒ Boolean
#
gencode?
Obtain gencode.dmp dataset, “genetic codes” file.
genetic code id -- GenBank genetic code id
abbreviation -- genetic code name abbreviation
name -- genetic code name
cde -- translation table for this genetic code
starts -- start codons for this genetic code
#
216 217 218 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 216
# === gencode?
#
# Obtain the "genetic codes" dataset by loading the file named by the
# constant GENCODE through Taxonomy.load.
def gencode?
  return Taxonomy.load(GENCODE)
end
#generate_html_links_for(i) ⇒ Object
#
generate_html_links_for
Input to this method should be an array of taxonomic IDs.
#
110 111 112 113 114 115 116 117 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 110
# === generate_html_links_for
#
# Print the NCBI taxonomy-browser URL for a taxonomic ID. An Array of
# IDs is handled element by element (recursively).
def generate_html_links_for(i)
  base_url = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='
  return i.each { |taxid| generate_html_links_for(taxid) } if i.is_a?(Array)
  e base_url+i.to_s
end
#get_id_of(id = 9606, be_verbose = true) ⇒ Object
#
get_id_of
Use this method to query the database for a specific ID. For this to work, the database must have the ids.
#
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 359
# === get_id_of
#
# Query the names table for the given taxid, restricted to rows whose
# name_class is 'scientific name'.
#
# The query is run twice: first as SELECT * with the caller's be_verbose
# setting, then as SELECT taxid with (:silent, :tuples). The stripped
# result of the second run is returned.
#
# The stray trailing '"' that used to end both SQL strings was removed,
# so that these queries have the same shape as the ones built by
# get_parent_id_of and get_scientific_name_of.
#
# NOTE(review): the id is concatenated into the SQL text as-is. Callers
# must pass a numeric taxid.
def get_id_of(
    id         = 9606,
    be_verbose = true
  )
  id = id.to_s
  conditions = ' from names WHERE taxid='+id+
               ' AND name_class=\'scientific name\' LIMIT 3;'
  run_sql_query('SELECT *'+conditions, be_verbose)
  run_sql_query('SELECT taxid'+conditions, :silent, :tuples).strip
end
#get_parent_id_of(i) ⇒ Object
#
get_parent_id_of
This method is similar to the method above, but we will fetch the parent id instead.
#
324 325 326 327 328 329 330 331 332 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 324
# === get_parent_id_of
#
# Look up the parent_taxid of the given taxid in the nodes table and
# return it as a stripped String.
#
# When the input is empty, the problem is reported and an empty String
# is returned right away. No query is run in that case, since it would
# only be malformed SQL ("... WHERE taxid= limit 3;").
def get_parent_id_of(i)
  if i.to_s.empty?
    e 'No valid input was given to us (in method '+__method__.to_s+').'
    return ''
  end
  cmd = 'SELECT parent_taxid FROM nodes WHERE taxid='+i.to_s+' limit 3;'
  run_sql_query(cmd) # More verbose here.
  run_sql_query(cmd, true, :tuples).strip
end
#get_scientific_name_of(taxid) ⇒ Object Also known as: get_scientific_name
#
get_scientific_name_of
Get the scientific name here. The input should be a taxid.
Usage example:
get_scientific_name_of 333
#
344 345 346 347 348 349 350 351 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 344
# === get_scientific_name_of
#
# Look up name_txt in the names table for the given taxid, restricted to
# rows whose name_class is 'scientific name'.
#
# The query is run twice; the stripped result of the second run (which
# is given ' --tuples-only') is returned.
#
# Usage example:
#
#   get_scientific_name_of 333
def get_scientific_name_of(taxid)
  query = 'SELECT name_txt FROM names where taxid='+taxid.to_s+
          ' AND name_class=\'scientific name\' LIMIT 3;'
  run_sql_query(query)
  run_sql_query(query, true, ' --tuples-only').strip
end
#merged? ⇒ Boolean
#
merged?
Obtain information from merged.dmp.
Merged nodes file fields has these ids:
old_taxid -- id of nodes which has been merged
new_taxid -- id of nodes which is result of merging
#
241 242 243 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 241
# === merged?
#
# Obtain the merged-nodes dataset by loading the file named by the
# constant MERGED through Taxonomy.load.
def merged?
  return Taxonomy.load(MERGED)
end
#names?(i = NAMES) ⇒ Boolean
#
names?
Obtain the names.dmp dataset, which are “Taxonomy names”.
Four IDs are known for this set:
taxid
name_txt
unique name
name class
#
183 184 185 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 183
# === names?
#
# Obtain the "Taxonomy names" dataset by loading the given file (the
# constant NAMES by default) through Taxonomy.load.
def names?(i = NAMES)
  return Taxonomy.load(i)
end
#nodes?(i = NODES) ⇒ Boolean
#
nodes?
Use this method in order to obtain the nodes.dmp dataset.
Nodes are “taxonomic nodes”.
The description for each node includes the following fields:
taxid - node id in GenBank taxonomy database
parent taxid - parent node id in GenBank taxonomy database
rank - rank of this node (superkingdom, kingdom, ...)
embl code - locus-name prefix; not unique
division id - see division.dmp file
inherited div flag (1 or 0) - 1 if node inherits division from parent
genetic code id - see gencode.dmp file
inherited GC flag (1 or 0) - 1 if node inherits genetic code from parent
mitochondrial genetic code id - see gencode.dmp file
inherited MGC flag (1 or 0) - 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) - 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag(1 or 0) - 1 if this subtree has no sequence data yet
comments - free-text comments and citations
#
156 157 158 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 156
# === nodes?
#
# Obtain the "taxonomic nodes" dataset by loading the given file (the
# constant NODES by default) through Taxonomy.load.
def nodes?(i = NODES)
  return Taxonomy.load(i)
end
#pad(i) ⇒ Object
#
pad
The input is something like:
1457406 | Bionia Mart. ex Benth., 1837 | | authority |
but it could also be an array.
#
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 88
# === pad
#
# Prepare a line such as
#
#   1457406 | Bionia Mart. ex Benth., 1837 | | authority |
#
# for use as a list of quoted SQL values: the trailing newline is
# dropped, the line is tokenized if it contains TOKEN, all "'"
# characters are removed and the tokens are joined via "','".
#
# An Array is only joined via "','". Any other input is returned as-is.
#
# The apostrophes are removed from each token individually. Calling
# delete("'") on the Array of tokens would only drop whole elements that
# equal "'", and it returns the removed element rather than the Array.
#
# Assumes tokenize() returns an Array - TODO confirm.
def pad(i)
  if i.is_a? String
    i = i.chomp # Newlines aren't really useful - let's eliminate them.
    if i.include?(TOKEN)
      i = tokenize(i).map {|token| token.to_s.delete("'") }
    else
      i = i.delete("'")
    end
  end
  i = i.join("','") if i.is_a? Array
  return i
end
#pad_properly(i = "5\t|\tThe domestic cat: perspective on the nature and diversity of cats.\t|\t0\t|\t8603894\t|\t \t|\t\t|\t9685 \t|\n") ⇒ Object
#
pad_properly
Input is a String (a single delimited line, as in the default argument). We will split it and pad it for a proper SQL query.
#
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/bioroebe/taxonomy/node.rb', line 102
# === pad_properly
#
# Split one delimited line into its fields, sanitize and strip every
# field, escape "'" characters by doubling them (the standard SQL way)
# and join all fields via "', '".
#
# All fields are returned: the block yields its value instead of using
# 'return', which would leave the whole method after the first field.
#
# A field that cannot be processed contributes an empty String (best
# effort). Only StandardError is rescued, so that signals and exit
# requests are not swallowed.
def pad_properly(i = "5\t|\tThe domestic cat: perspective on the nature and diversity of cats.\t|\t0\t|\t8603894\t|\t \t|\t\t|\t9685 \t|\n")
  fields = split(sanitize(i.chomp)).map {|field|
    begin
      sanitize(field).strip.gsub("'", "''")
    rescue StandardError
      ''
    end
  }
  return fields.join("', '")
end
#pad_sql(i) ⇒ Object
#
pad_sql
Escape ' characters here.
#
92 93 94 95 |
# File 'lib/bioroebe/taxonomy/node.rb', line 92
# === pad_sql
#
# Escape "'" characters for use inside a single-quoted SQL string
# literal, by doubling them (the standard SQL way).
#
# The former replacement "\'" was a no-op: inside a double-quoted Ruby
# String, "\'" is just "'".
def pad_sql(i)
  return i.gsub("'", "''")
end
#project_base_dir? ⇒ Boolean Also known as: base_dir?
#
project_base_dir?
#
129 130 131 |
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 129
# === project_base_dir?
#
# Instance-level shortcut that answers with Taxonomy.base_dir?.
def project_base_dir?
  return Taxonomy.base_dir?
end
#remove_delimiter(i) ⇒ Object
#
remove_delimiter
Get rid of the '|' token.
#
504 505 506 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 504
# === remove_delimiter
#
# Return a copy of the input with the characters given by TOKEN removed
# (via String#delete).
def remove_delimiter(i)
  i.delete(TOKEN)
end
#return_current_hours_minutes_seconds ⇒ Object
#
return_current_hours_minutes_seconds
#
485 486 487 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 485
#
# return_current_hours_minutes_seconds
#
# Forwards to ::Bioroebe.return_current_hours_minutes_seconds and
# returns its result.
#
def return_current_hours_minutes_seconds
  return ::Bioroebe.return_current_hours_minutes_seconds
end
#return_full_lineage_of(i = nil) ⇒ Object
#
return_full_lineage_of
This method will return an array (or nil) with the full lineage of the given input (which should be a Taxonomic id).
The logic for this method is as follows:
-
Given an arbitrary tax ID as input, we will keep on searching the postgresql database for parent_id entries, until we eventually reach input number 1, which is the mother of all IDs. So when we have 1 finally, we can stop, and return the result (the array in question).
The full lineage is thus given as part of the returned array. The format is to not only return the ID but to also return the scientific name. In other words, our returned array will have this format:
[parent_id, scientific_name]
#
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 291
#
# return_full_lineage_of
#
# Walks from the given taxonomic ID upwards by repeatedly asking
# get_parent_id_of() for the parent, until ID 1 is reached.
#
# Returns an Array of [id, scientific_name] pairs, starting with the
# input ID. Returns nil (after printing a hint) when no input is given.
#
# The walk stops early when
#   - the input already is ID 1 (no duplicate root entry is appended),
#   - no parent ID can be found for the current entry, or
#   - more than 50 parents were collected without reaching ID 1.
#
def return_full_lineage_of(i = nil)
  unless i
    e 'Missing input. Please provide an ID (a number, like 6).'
    return nil
  end
  lineage = [[i, get_scientific_name_of(i)]]
  # Already at the top: asking for the parent of 1 would only add
  # the root a second time.
  return lineage if i.to_i == 1
  rescue_counter = 0
  loop do
    parent_id = get_parent_id_of(lineage.last.first)
    # Unknown ID: without this check nil.to_i (0) never equals 1 and
    # the loop would append junk rows until the counter fires.
    break if parent_id.nil?
    lineage << [parent_id, get_scientific_name_of(parent_id)]
    rescue_counter += 1
    break if parent_id.to_i == 1
    if rescue_counter > 50
      e 'It seems as if something is not working properly here. We reached'
      e 'a count of 50 now, without finding a parent id (which should be 1).'
      e 'It is quite unlikely that a lineage will have more than 50 subentries'
      e 'so we will now break out of the loop.'
      break
    end
  end
  lineage
end
#search_id(i = '7460') ⇒ Object Also known as: search, id?
#
search_id
Search the Taxonomic ID here.
#
165 166 167 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 165
#
# search_id
#
# Builds the lookup URL for the given taxonomic ID (BASE_URL followed
# by "id=" and the ID) and prints it via e().
#
def search_id(i = '7460')
  url = "#{BASE_URL}id=#{i}"
  e url
end
#show_current_time_and_date ⇒ Object
#
show_current_time_and_date
#
511 512 513 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 511
#
# show_current_time_and_date
#
# Forwards to Taxonomy.show_current_time_and_date and returns its
# result.
#
def show_current_time_and_date
  return Taxonomy.show_current_time_and_date
end
#show_help ⇒ Object
#
show_help (help tag)
Shows the options available in the interactive taxonomy 'shell'.
#
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# show_help prints the command overview of the interactive taxonomy shell.
# Everything is emitted inside a single cliner { } block: first a heading
# that includes sdir(return_pwd), then one Helpline[...] entry per command.
# The :nocolours entry is only shown while use_colours? is true, and the
# :last_update? entry only when SHALL_WE_LOG_LAST_UPDATE is set. The final
# bare `e` keeps a trailing newline.
# NOTE(review): the :taxid help text has a line break inside its string
# literal in this listing - confirm that this is intended.
# File 'lib/bioroebe/taxonomy/help/help.rb', line 19 def show_help cliner { e N+'The following options are available (pwd: '+sdir(return_pwd)+'):'+ N+N Helpline[:sql,'# Use this to generate the SQL that can be '+ 'used to populate a SQL database with INSERT statements.'] e ' '*(Helpline::LJUST+3)+'# You can pass values such as 1 '+ 'or 2 (node or name) to this method.' Helpline[:info, '# Show some info about where we store data.'] Helpline[:instructions?, '# Show instructions.'] Helpline[:fasta, '# Use this to populate the fasta table.'] Helpline[:names, '# Use this to populate the names table.'] Helpline[:nodes, '# Use this to populate the nodes table.'] # ===================================================================== # # Only display the following line when colours are enabled still. # ===================================================================== # if use_colours? Helpline[:nocolours,'# Use this to disable the '+ 'colours. (Use "yescolours" to enable them again)'] end Helpline[:taxid,'# Find out the name of the organism through '\ 'the input ID from the NCBI dataset. 
For example: "taxid 33"'] Helpline[:table_names?,'# Use this to show the SQL command '\ 'that was used to generate the SQL Tables.'] Helpline[:verify,'# Use this to verify that the '\ '.sql files (nodes and names) are valid.'] Helpline[:verbose,'# be verbose, in other words provide '\ 'extra information to us whenever feasible'] Helpline[:ll, '# Show the content of the current working '\ 'directory.'] Helpline[:id,'# Query the postgre database to get the ID of '\ 'a given species.'] Helpline[:download,'# Download the remote NCBI database '\ '(at '+simp(URL_TO_TAXONOMY_ARCHIVE)+')'] if SHALL_WE_LOG_LAST_UPDATE Helpline[:last_update?,'# When did we last update the database'] end Helpline[:update_database, '# download the remote NCBI database, '+ 'extract it, generate nodes.sql and names.sql,'] _ = ' ' * Helpline::LJUST e _+Helpline::PADDING+' # and then populate the '\ 'postgresql-database with this information' e # This here to keep a trailing newline. } end |
#split(i = citations? ) ⇒ Object
#
split (split tag)
Split up the input on the default delimiter.
#
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 57
#
# split (split tag)
#
# Split up the input on the default delimiter.
#
# - An Array is mapped element by element, each entry being split on
#   MAIN_DELIMITER; the result is an Array of Arrays.
# - Any other input is sanitized first. If it contains TOKEN but no tab
#   character it is split on TOKEN, otherwise on MAIN_DELIMITER.
#
# When splitting fails, the error and the offending input are printed,
# the input is stored in $error and nil is returned.
#
def split(i = citations?)
  return i.map { |entry| entry.split(MAIN_DELIMITER) } if i.is_a? Array
  begin
    i = sanitize(i)
    if i.include?(TOKEN) && !i.include?("\t")
      i.split(TOKEN)
    else
      i.split(MAIN_DELIMITER)
    end
  rescue StandardError => error # Not Exception: leave signals and exit alone.
    pp error
    # Interpolate instead of using +: the input that got us here is often
    # not a String, and String#+ would then raise inside the handler.
    e "The string that failed was #{i}"
    $error = i
    nil
  end
end
#test(use_this_url = URL1) ⇒ Object
#
test
#
46 47 48 49 50 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 46
#
# test
#
# Prints a notice naming the given URL, then returns the lines of the
# file at DIVISION as an Array (File.readlines). The URL itself is only
# used for the printed notice.
#
def test(use_this_url = URL1)
  notice = 'Opening URL at "' + sfancy(use_this_url) + '"'
  e notice
  File.readlines(DIVISION)
end
#to_utf(i) ⇒ Object Also known as: sanitize
#
to_utf
#
39 40 41 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 39
#
# to_utf
#
# Forwards the input to ::Bioroebe.to_utf and returns its result.
#
def to_utf(i)
  return ::Bioroebe.to_utf(i)
end
#verify_proper_sql_structures ⇒ Object
#
verify_proper_sql_structures (debug tag)
We will try to verify that the SQL commands are accurate.
Can also be called by issuing this:
Taxonomy.verify_proper_sql_structures
#
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 |
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 461
#
# verify_proper_sql_structures (debug tag)
#
# For each of NODES_SQL and NAMES_SQL: if the file exists, print its
# size in KB and its first n_chars_to_show characters so the SQL can be
# inspected by eye; otherwise report that the file is missing.
#
# Can also be called by issuing this:
#
#   Taxonomy.verify_proper_sql_structures
#
def verify_proper_sql_structures
  n_chars_to_show = 975
  files = [NODES_SQL, NAMES_SQL]
  e 'We will now attempt to verify that the SQL structure is proper.'
  e 'We have these '+sfancy(files.size.to_s)+' .sql files.'
  pp files
  files.each { |entry|
    if File.exist? entry
      # File.size reports bytes; convert so that the "KB" label is true.
      size = (File.size(entry) / 1024.0).round(1).to_s
      # Read only the leading characters instead of loading the complete
      # dump, and take exactly n_chars_to_show of them.
      chunk = File.open(entry) { |file|
        file.each_char.first(n_chars_to_show).join
      }
      e 'File '+sfile(entry)+' (Filesize: '+sfancy(size)+
        ' KB) has this content '+
        '(showing up to '+simp(n_chars_to_show)+' chars):'
      cliner { e chunk }; e
    else
      e 'We can not verify the sql structure because the file'
      e 'at `'+sfile(entry)+'` does not exist.'
    end
  }
end