Module: Bioroebe::Taxonomy

Extended by:
Taxonomy, Shared, Colours, Colours::E
Includes:
Constants
Included in:
Taxonomy, Interactive
Defined in:
lib/bioroebe/taxonomy/edit.rb,
lib/bioroebe/taxonomy/menu.rb,
lib/bioroebe/taxonomy/node.rb,
lib/bioroebe/taxonomy/chart.rb,
lib/bioroebe/project/project.rb,
lib/bioroebe/taxonomy/shared.rb,
lib/bioroebe/taxonomy/colours.rb,
lib/bioroebe/taxonomy/taxonomy.rb,
lib/bioroebe/taxonomy/constants.rb,
lib/bioroebe/taxonomy/help/help.rb,
lib/bioroebe/taxonomy/info/info.rb,
lib/bioroebe/taxonomy/info/is_dna.rb,
lib/bioroebe/taxonomy/interactive.rb,
lib/bioroebe/taxonomy/parse_fasta.rb,
lib/bioroebe/taxonomy/class_methods.rb,
lib/bioroebe/taxonomy/help/helpline.rb,
lib/bioroebe/toplevel_methods/taxonomy.rb,
lib/bioroebe/taxonomy/info/check_available.rb

Overview

Bioroebe::Taxonomy

Defined Under Namespace

Modules: Constants, Shared Classes: Chart, CheckAvailable, Helpline, Info, Interactive, IsDNA, Node, ParseFasta

Constant Summary collapse

PROJECT_YAML_DIR =
#

Bioroebe::Taxonomy::PROJECT_YAML_DIR

#
::Bioroebe.project_base_directory?

Constants included from Constants

Constants::AA_DIR, Constants::ARRAY_PROJECT_FILES, Constants::Archaea_Taxonomy_ID, Constants::BASE, Constants::BASE_URL, Constants::BE_VERBOSE, Constants::Bacteria_Taxonomy_ID, Constants::CITATIONS, Constants::CURATED_DIR, Constants::DATA_DIR, Constants::DELNODES, Constants::DIVISION, Constants::Eukaryota_Taxonomy_ID, Constants::FILE_USE_THIS_DATABASE, Constants::GEM_DIR, Constants::GENCODE, Constants::INCOMING_DIR, Constants::INFO_DIR, Constants::LAST_INTERACTIVE_COMMAND, Constants::LOCALOME_DIR, Constants::LOCAL_MIRROR, Constants::MERGED, Constants::MODULE_PATH, Constants::NAMES, Constants::NAMES_SQL, Constants::NCBI_BASE, Constants::NODES, Constants::NODES_SQL, Constants::NT_DIR, Constants::POSTGRESQL_QUERY_SIZE, Constants::POSTGRE_LOGIN_COMMAND, Constants::PROJECT_DOC_DIR, Constants::SEQUENCES_DIR, Constants::SHARED_HOME, Constants::TAXONOMY_BROWSER, Constants::TEMP_DIR, Constants::TEST_DIR, Constants::TMP_DIR, Constants::URL1

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Shared

be_quiet, be_verbose?, cd, edit_login_file, eliminate_tabulator, ensure_that_download_dir_exists, ensure_that_temp_dir_exists, mkdir, readlines, set_pgpassword, show_password, show_time_now, split_at, split_at_tabulator, tokenize

Methods included from Constants

#info_dir?, #work_directory?

Class Method Details

.be_verbose?Boolean

#

Taxonomy.be_verbose?

#

Returns:

  • (Boolean)

136
137
138
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 136

# Report the module-wide verbosity setting.
#
# Returns the value of the constant Taxonomy::Constants::BE_VERBOSE as-is.
def self.be_verbose?
  Taxonomy::Constants::BE_VERBOSE
end

.cd(i = '..') ⇒ Object

#

Bioroebe::Taxonomy.cd

#

74
75
76
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 74

# Thin delegator: forwards the argument to ::Bioroebe.cd.
#
# i - value handed to ::Bioroebe.cd unchanged; defaults to '..'.
#
# Returns whatever ::Bioroebe.cd returns.
def self.cd(i = '..')
  ::Bioroebe.cd(i)
end

.cleanupObject

#

Taxonomy.cleanup

Get rid of some .sql files through this method here.

#

100
101
102
103
104
105
106
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 100

# Remove the generated names.sql and nodes.sql files, if they exist.
#
# Both file names are relative, so they are resolved against the current
# working directory. A notice is emitted through e() before any file is
# touched; the removal itself is delegated to remove_file().
def self.cleanup
  e 'Trying to remove some files now.'
  remove_file('names.sql') if File.exist?('names.sql')
  remove_file('nodes.sql') if File.exist?('nodes.sql')
end

.download_directory?Boolean

#

Bioroebe::Taxonomy.download_directory?

#

Returns:

  • (Boolean)

64
65
66
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 64

# Return the value of the constant DOWNLOAD_DIR.
#
# NOTE(review): despite the trailing '?', this hands back the constant
# itself (presumably a directory path), not a true/false value.
def self.download_directory?
  DOWNLOAD_DIR
end

.e(i = '') ⇒ Object

#

Bioroebe::Taxonomy.e

#

67
68
69
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 67

# Thin delegator: forwards the argument to ::Bioroebe.e.
#
# i - value handed to ::Bioroebe.e unchanged; defaults to an empty String.
#     (::Bioroebe.e is presumably the project's output helper - confirm.)
def self.e(i = '')
  ::Bioroebe.e(i)
end

.enable_coloursObject

#

Bioroebe::Taxonomy.enable_colours

#

21
22
23
24
# File 'lib/bioroebe/taxonomy/colours.rb', line 21

# Announce that colours are being enabled and then delegate the actual
# work to ::Bioroebe.enable_colours.
#
# Returns whatever ::Bioroebe.enable_colours returns.
def self.enable_colours
  e 'Enabling colours.'
  ::Bioroebe.enable_colours
end

.ensure_that_temp_dir_existsObject

#

Taxonomy.ensure_that_temp_dir_exists

#

81
82
83
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 81

# Delegate to ::Bioroebe.ensure_that_the_base_directories_exist.
#
# NOTE(review): the method name mentions only the temp dir, but the call
# it forwards to is named after all base directories.
def self.ensure_that_temp_dir_exists
  ::Bioroebe.ensure_that_the_base_directories_exist
end

.interactive(i = nil) ⇒ Object

#

Taxonomy.interactive

Invoke this method if you wish to directly invoke the interactive component of the Taxonomy module.

#

1961
1962
1963
# File 'lib/bioroebe/taxonomy/interactive.rb', line 1961

# Instantiate ::Bioroebe::Taxonomy::Interactive with the given argument.
#
# i - passed straight to Interactive.new; defaults to nil.
#
# Returns the new Interactive instance.
def self.interactive(i = nil)
  ::Bioroebe::Taxonomy::Interactive.new(i)
end

.load(_ = 'taxonomy/citations.dmp') ⇒ Object

#

Taxonomy.load

Load a specific .dmp file via this method.

The first argument should be the target location of the file that we wish to load (we assume this to be a local file for now).

#

116
117
118
119
120
121
122
123
124
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 116

# Read a local .dmp file and return its lines.
#
# _ - path of the file to read; defaults to 'taxonomy/citations.dmp'.
#
# Returns an Array with one sanitized entry per line of the file, or nil
# (after reporting the problem through e()) when the file does not exist.
def self.load(_ = 'taxonomy/citations.dmp')
  unless File.exist?(_)
    e 'File at location `'+_+'` does not exist.'
    return nil
  end
  # Every line goes through sanitize() because File.readlines() may hand
  # back content in an invalid encoding.
  File.readlines(_).map { |raw_line| sanitize(raw_line) }
end

.project_base_dir?Boolean

#

Bioroebe::Taxonomy.project_base_dir?

The Taxonomy project has been fully integrated into the Bioroebe namespace in the year ~2015.

This method will return a String such as:

"/Programs/Ruby/2.6.4/lib/ruby/site_ruby/2.6.0/bioroebe/taxonomy/"
#

Returns:

  • (Boolean)

249
250
251
# File 'lib/bioroebe/project/project.rb', line 249

# Build the base directory of the taxonomy sub-project.
#
# Returns a String: the value of ::Bioroebe.project_base_directory? with
# 'taxonomy/' appended (so a path, not a true/false value).
def self.project_base_dir?
  "#{::Bioroebe.project_base_directory?}taxonomy/"
end

.project_yaml_dir?Boolean

#

Bioroebe::Taxonomy.project_yaml_dir?

#

Returns:

  • (Boolean)

256
257
258
# File 'lib/bioroebe/project/project.rb', line 256

# Return the value of the constant Taxonomy::PROJECT_YAML_DIR.
#
# NOTE(review): despite the trailing '?', this returns the constant itself
# (presumably a directory path) rather than a true/false value.
def self.project_yaml_dir?
  Taxonomy::PROJECT_YAML_DIR
end

.report_n_speciesObject

#

Taxonomy.report_n_species

This method will report how many species are registered in the NCBI database.

#

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 146

# Fetch the NCBI taxonomy statistics page and report how many species are
# registered there.
#
# The page is downloaded, the table row that follows the "All taxa" link is
# collected, and the number is extracted from it with a regex. The result is
# only printed through e(); the return value is whatever e() returns.
#
# Network access is required. The page is opened with URI.open in block
# form, so it works on Ruby versions where Kernel#open no longer accepts
# URLs and the remote handle is always closed again.
def self.report_n_species
  require 'open-uri' # Loaded lazily: only this method needs remote access.
  remote_url = 'http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=statistics&unclassified=hide&uncultured=hide&unspecified=hide&period=&from=&to='
  e 'We will now obtain how many species are registered. This '\
    'may take a while.'
  e
  e 'In 26.09.2011, the Taxonomy database included exactly'
  e '234_991 species with a formal name.'
  e
  # ======================================================================= #
  # See: http://rubular.com/r/awsCk2nF4D
  # ======================================================================= #
  regex_to_use = /gov\/taxonomy\/\?term.+">(\d+)<\/A><\/TD>\s+<\/TR>$/
  dataset = URI.open(remote_url) { |remote| remote.read }.split(N)
  collected = String.new
  is_open = false
  dataset.each {|line|
    # Lines are appended while we are inside the "All taxa" row; the row
    # header itself is skipped because the flag is only raised afterwards.
    collected << line if is_open
    if line.include? '<TR><TD><A HREF="/Taxonomy/Browser/wwwtax.cgi?id=1">All taxa</A></TD>'
      is_open = true
    elsif line.include? '</TR>'
      is_open = false
    end
  }
  # Empty String when the page layout changed and the regex did not match.
  n_species = collected[regex_to_use, 1].to_s
  e 'We found `'+sfancy(n_species)+::Bioroebe.rev+'` registered species.'
end

.save_into_which_file?Boolean

#

Bioroebe::Taxonomy.save_into_which_file?

#

Returns:

  • (Boolean)

26
27
28
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 26

# Return the value of the constant
# TAXONOMY_NCBI_DATABASE_LAST_UPDATE_LOG_FILE, i.e. the file that
# save_when_we_last_updated_the_database() writes into.
def self.save_into_which_file?
  TAXONOMY_NCBI_DATABASE_LAST_UPDATE_LOG_FILE
end

.save_when_we_last_updated_the_database(be_verbose = ::Bioroebe::BE_VERBOSE) ⇒ Object

#

Taxonomy.save_when_we_last_updated_the_database

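Save into a file when we last updated the database. The target file is the one returned by save_into_which_file? (the constant TAXONOMY_NCBI_DATABASE_LAST_UPDATE_LOG_FILE; older versions of this text called it SAVE_FILE, kept in lib/taxonomy/shared/shared.rb). Nothing is written unless the constant SHALL_WE_LOG_LAST_UPDATE is true.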

#

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 37

# Record the current date and time as the moment of the last database
# update.
#
# be_verbose - when truthy, announce the target file before writing.
#              Defaults to ::Bioroebe::BE_VERBOSE.
#
# The entry is written into the file returned by save_into_which_file?,
# but only when the constant SHALL_WE_LOG_LAST_UPDATE is truthy and the
# directory of that file is writable. A missing directory is treated like
# an unwritable one (nothing is written) instead of raising Errno::ENOENT.
#
# Returns the result of ::Bioroebe.write_what_into, or nil when nothing
# was written.
def self.save_when_we_last_updated_the_database(
    be_verbose = ::Bioroebe::BE_VERBOSE
  )
  if be_verbose
    e "Trying to log last update into file "\
      "#{::Bioroebe.sfile(save_into_which_file?)} next."
  end
  # ======================================================================= #
  # Only store into a file if the constant SHALL_WE_LOG_LAST_UPDATE
  # is set to true.
  # ======================================================================= #
  return unless SHALL_WE_LOG_LAST_UPDATE
  into = save_into_which_file?
  # ======================================================================= #
  # Verify that we can actually write into the directory. File.writable?
  # returns false for a directory that does not exist.
  # ======================================================================= #
  return unless File.writable?(File.dirname(into))
  ::Bioroebe.write_what_into(
    'Last Update of the Taxonomy NCBI Database: '+
    ::Bioroebe.return_current_day_month_year+', '+
    ::Bioroebe.return_current_hours_minutes_seconds+N,
    into
  )
end

.show_current_time_and_dateObject

#

Taxonomy.show_current_time_and_date

#

492
493
494
495
496
497
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 492

# Print the current time and the current date in one sentence.
#
# Nothing is printed unless be_verbose? is truthy. The time part is wrapped
# by sfancy(), the date part by simp().
def self.show_current_time_and_date
  return unless be_verbose?
  time_part = sfancy(return_current_hours_minutes_seconds)
  date_part = simp(return_current_day_month_year)
  e 'The current time is '+time_part+', and today is the '+date_part+'.'
end

.show_remote_urls_to_the_NCBI_taxonomy_webpage(optional_id = nil) ⇒ Object

#

Bioroebe::Taxonomy.show_remote_urls_to_the_NCBI_taxonomy_webpage

This method will show the remote URLs to different tax-IDs.

If no argument is given (thus no ID) then the NCBI taxonomy parts are displayed.

#

93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 93

# Print remote URLs of the NCBI taxonomy webpage.
#
# optional_id - when given, only the Taxonomy Browser URL for that ID is
#               printed; when nil (the default) the general entry points
#               of the NCBI taxonomy pages are listed instead.
def self.show_remote_urls_to_the_NCBI_taxonomy_webpage(optional_id = nil)
  if optional_id
    return erev('http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='+optional_id.to_s)
  end
  erev 'The remote URLs towards the NCBI taxonomy part are:'
  e
  %w(
    http://www.ncbi.nlm.nih.gov/taxonomy
    http://www.ncbi.nlm.nih.gov/taxonomy?term=1
    http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi
  ).each { |remote_url| erev '  NCBI: '+sfancy(remote_url) }
  e
end

.statusObject

#

Bioroebe::Taxonomy.status

Invoke this method like that:

Taxonomy.status
#

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 32

# Report which of the taxonomy data files exist below base_dir? and how
# large they are.
#
# The files are listed from largest to smallest. Sizes are taken from the
# full path (base_dir? + file name); a missing or empty file counts as
# size 0 for sorting, so the sort can no longer fail on a mix of nil and
# Integer sizes, nor look at unrelated files in the current directory.
def self.status
  e "The status for the Taxonomy-related datafiles is as follows:#{N}"
  base = base_dir?
  array = %w(
    names.dmp
    merged.dmp
    delnodes.dmp
    citations.dmp
    nodes.dmp
    taxdump.tar
  )
  # File.size? returns nil for missing/empty files, hence the .to_i.
  sorted_array = array.sort_by {|entry| File.size?(base+entry).to_i }
  sorted_array.reverse.each {|entry|
    file = base+entry
    if File.exist? file
      line = ('The file `'+file+'` exists.').ljust(65, ' ')
      line << (' (Filesize: '+File.size(file).to_s+')').ljust(30, ' ')
      e line
      if entry.include? 'dump.tar'
        e 'Consider extracting this .tar-file if it was not yet '+
        'extracted - it contains the various .dmp files.'
      end
    else
      e 'We could not find any file at '+sfile(file)
    end
  }
end

.status?Boolean

#

Bioroebe::Taxonomy.status? (status tag)

Feedback the status to the user.

#

Returns:

  • (Boolean)

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 73

# Print an overview of where the taxonomy component keeps its data: the
# localomics URL, the local mirror, the directory for generated .sql files
# and the individual data directories.
#
# Output only; each location is printed on its own indented line below a
# short description.
def self.status?
  e
  e 'The localomics URL should be at:'
  e '  '+sfancy('http://localomics.imp.univie.ac.at:8666/')
  e 'The shared data of the local mirror can be found at: '
  e '  '+sdir(LOCAL_MIRROR)
  e 'The shared folder (where we generate our .sql files '+
    'to) can be found at: '
  e '  '+sdir(AUTOGENERATED_SQL_FILES_DIR)
  # The remaining directories all share the same two-line layout.
  [
    LOCALOME_DIR, AA_DIR, NT_DIR, INFO_DIR, INCOMING_DIR
  ].each { |directory|
    e 'The '+File.basename(directory)+'/ directory can be found at: '
    e '  '+sdir(directory)
  }
  e
end

.taxonomy_download_directory?Boolean

#

Bioroebe::Taxonomy.taxonomy_download_directory?

This method will return the download directory for use in the Taxonomy subcomponent.

#

Returns:

  • (Boolean)

169
170
171
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 169

# Build the download directory used by the Taxonomy subcomponent.
#
# Returns a String: the value of ::Bioroebe.log_dir? with 'taxonomy/'
# appended (so a path, not a true/false value).
def self.taxonomy_download_directory?
  "#{::Bioroebe.log_dir?}taxonomy/"
end

.update(this_dir = ::Bioroebe.taxonomy_download_directory?, be_verbose = true) ⇒ Object

#

Bioroebe::Taxonomy.update (download tag)

This class method will obtain the file taxdump.tar.gz.

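By default we will download into the directory returned by ::Bioroebe.taxonomy_download_directory? (the default value of the first argument). Earlier versions used TEMP_DIR, or rather the value stored in the constant DOWNLOAD_DIR, which at the time of documenting this method defaulted to /tmp/robert/ on the I.M.P. cluster.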

#

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/bioroebe/toplevel_methods/taxonomy.rb', line 117

# Download the NCBI taxonomy archive (URL_TO_TAXONOMY_ARCHIVE) into a local
# directory, extract it and log the time of the update.
#
# this_dir   - directory to download into. Defaults to
#              ::Bioroebe.taxonomy_download_directory?.
# be_verbose - true for extra messages; false or :be_silent to suppress
#              them.
#
# The directory is created when missing. If it still does not exist
# afterwards, '/home/Temp/' is used as fallback when at_home? is truthy;
# otherwise the creation is attempted once more. The fallback is applied
# regardless of verbosity; verbosity only controls whether it is announced,
# and the announcement names the directory that was actually missing.
#
# A previously downloaded archive of the same name in the target directory
# is removed before the download starts.
def self.update(
    this_dir   = ::Bioroebe.taxonomy_download_directory?,
    be_verbose = true
  )
  be_verbose = false if be_verbose == :be_silent
  ::Bioroebe.show_time_now # Display the current start-time.
  ::Bioroebe.ensure_that_the_base_directories_exist
  ::Bioroebe.mkdir(this_dir) unless File.directory? this_dir
  unless Dir.exist? this_dir # Safeguard: the mkdir above did not succeed.
    if at_home?
      fallback_dir = '/home/Temp/' # Hardcoded.
      if be_verbose
        e "The directory at #{sdir(this_dir)} does not "\
          "exist, thus trying to"
        e 'use '+sdir(fallback_dir)+' instead.'
      end
      this_dir = fallback_dir
    else # Then we try to create that directory once more.
      ::Bioroebe.mkdir(this_dir)
    end
  end
  if be_verbose
    e "Now trying to change to the base directory at "\
      "#{::Bioroebe.sdir(this_dir)}."
  end
  cd this_dir
  e N+'We will next attempt to download the file `'+
    ::Bioroebe.sfancy(URL_TO_TAXONOMY_ARCHIVE)+'`'
  e 'into the local directory `'+
     ::Bioroebe.sdir(this_dir)+'` via wget.'
  if be_verbose
    e '(This file will usually have a file size of about '\
      '~36 MB. [Last verification of this size: Dec 2016])'
  end
  local_file = File.basename(URL_TO_TAXONOMY_ARCHIVE)
  # Get rid of a stale archive first so that we end up with a fresh copy.
  ::Bioroebe.remove_file(local_file) if File.exist? local_file
  ::Bioroebe.wget_download(URL_TO_TAXONOMY_ARCHIVE) # Use class Download to download something.
  ::Bioroebe.extract(local_file)
  save_when_we_last_updated_the_database
end

.use_colours?Boolean

#

Bioroebe::Taxonomy.use_colours?

#

Returns:

  • (Boolean)

14
15
16
# File 'lib/bioroebe/taxonomy/colours.rb', line 14

# Thin delegator: returns whatever ::Bioroebe.use_colours? reports.
def self.use_colours?
  ::Bioroebe.use_colours?
end

Instance Method Details

#array_size?(array_input = citations?) ) ⇒ Boolean

#

array_size?

Throwaway method that gives us back the size of the text entry (the sixth field) of every entry of a citation base, as an Array of numbers.

#

Returns:

  • (Boolean)

266
267
268
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 266

# Measure the sixth field of every entry.
#
# array_input - collection of raw entries; defaults to citations?.
#
# Each entry is split via split() and the size of the element at index 5
# is taken. Returns an Array with one size per entry.
def array_size?(array_input = citations?)
  array_input.map { |citation|
    fields = split(citation)
    fields[5].size
  }
end

#citations?Boolean

#

citations?

Obtain the citations.dmp dataset through this method.

This dataset includes the following keys:

cit_id, cit_key, pubmed_id,
medline_id, url,
text, taxid_list
#

Returns:

  • (Boolean)

257
258
259
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 257

# Load the file named by the constant CITATIONS through Taxonomy.load.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def citations?
  Taxonomy.load(CITATIONS)
end

#clean(i) ⇒ Object Also known as: remove_delimiters

#

clean

Clean the input string from delimiters. Input can be Array or String.

#

124
125
126
127
128
129
130
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 124

# Replace every occurrence of MAIN_DELIMITER by a single space and trim
# the result.
#
# i - a String, or an Array whose elements are cleaned recursively.
#
# The delimiter is handed to gsub directly rather than being interpolated
# into a regex, so a String delimiter is matched literally. Interpolated,
# a "|" inside the delimiter would act as regex alternation and the
# delimiter would never be matched as a whole.
#
# Returns the cleaned String (trailing newline and surrounding whitespace
# removed), or an Array of cleaned entries for Array input.
def clean(i)
  return i.map {|entry| clean(entry) } if i.is_a? Array
  i.gsub(MAIN_DELIMITER, ' ').chomp.strip # Added .chomp at 04.02.2014
end

#delnodes?Boolean

#

delnodes?

Obtain the delnodes.dmp dataset. This one has deleted nodes - nodes that existed but were deleted.

#

Returns:

  • (Boolean)

226
227
228
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 226

# Load the file named by the constant DELNODES through Taxonomy.load.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def delnodes?
  Taxonomy.load(DELNODES)
end

#division?Boolean

#

division?

Obtain the division.dmp dataset.

The Divisions file has these fields:

division id -- taxonomy database division id
division cde -- GenBank division code (three characters)
division name -- e.g. BCT, PLN, VRT, MAM, PRI...

Comments.

#

Returns:

  • (Boolean)

200
201
202
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 200

# Load the file named by the constant DIVISION through Taxonomy.load.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def division?
  Taxonomy.load(DIVISION)
end

#edit(i = '') ⇒ Object

#

edit (edit tag, ed tag)

This method allows us to quickly open the internal files.

We can typically use vim for this.

#

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/bioroebe/taxonomy/edit.rb', line 27

# Dispatch to one of the edit_* helpers, depending on the given keyword.
#
# i - keyword naming what to edit; anything responding to to_s. A leading
#     or embedded 'edit_' is ignored, so 'edit_gem' and 'gem' behave the
#     same. Defaults to '' which opens the instructions.
#
# The keyword is normalized on a copy, so the caller's String is never
# modified and frozen Strings are accepted.
#
# Unknown keywords are silently ignored.
def edit(i = '')
  i = i.to_s.gsub(/edit_/,'')
  case i
  # ======================================================================= #
  # === taxonomy
  # ======================================================================= #
  when /taxonomy/ # Edit the taxonomy "binary".
    edit_taxonomy
  # ======================================================================= #
  # === shared
  # ======================================================================= #
  when 'shared',
       'shared_code'
    edit_shared_code_file
  # ======================================================================= #
  # === gem
  # ======================================================================= #
  when 'gem',
       'gemspec'
    edit_gemspec
  # ======================================================================= #
  # === password
  # ======================================================================= #
  when /^password$/,
      'passwd'
    edit_password
  # ======================================================================= #
  # === login
  # ======================================================================= #
  when 'login',
       'main'
    # NOTE(review): this branch has no body, so 'login' and 'main' are
    # accepted but do nothing. Presumably a call that opens the login
    # file belongs here - confirm the intended target before adding it.
  # ======================================================================= #
  # === instructions
  # ======================================================================= #
  when '','instructions' # '' is default.
    edit_instructions
  end
end

#edit_gemspecObject

#

edit_gemspec

#

71
72
73
# File 'lib/bioroebe/taxonomy/edit.rb', line 71

# Run the editor reported by Bioroebe.editor? on the file returned by
# return_gemspec_file. The command line is built by joining both with a
# space and is executed through esystem().
def edit_gemspec
  esystem Bioroebe.editor?+' '+return_gemspec_file
end

#edit_instructionsObject

#

edit_instructions

#

16
17
18
# File 'lib/bioroebe/taxonomy/edit.rb', line 16

# Run the editor reported by Bioroebe.editor? on whatever
# return_instructions yields. The command line is built by joining both
# with a space and is executed through esystem().
def edit_instructions
  esystem "#{Bioroebe.editor?} #{return_instructions}"
end

#edit_passwordObject

#

edit_password

We can use this method to edit the login-file.

#

94
95
96
# File 'lib/bioroebe/taxonomy/edit.rb', line 94

# As written, this method only returns the String ' +27' and has no other
# effect.
#
# NOTE(review): the body looks truncated. ' +27' resembles the tail of an
# editor command line (a "jump to line 27" argument), so presumably an
# esystem call opening the login file was meant here - confirm against
# the original source file.
def edit_password
   ' +27'
end

#edit_shared_code_fileObject

#

edit_shared_code_file

#

85
86
87
# File 'lib/bioroebe/taxonomy/edit.rb', line 85

# Run the editor reported by Bioroebe.editor? on the file returned by
# return_shared_code. The command line is built by joining both with a
# space and is executed through esystem().
def edit_shared_code_file
  esystem Bioroebe.editor?+' '+return_shared_code
end

#edit_taxonomyObject

#

edit_taxonomy

#

78
79
80
# File 'lib/bioroebe/taxonomy/edit.rb', line 78

# Run the editor reported by Bioroebe.editor? on the file returned by
# return_taxonomy_file. The command line is built by joining both with a
# space and is executed through esystem().
def edit_taxonomy
  esystem Bioroebe.editor?+' '+return_taxonomy_file
end

#find_highest_entries_in_sqlObject

#

find_highest_entries_in_sql

This method will find the longest entries (by number of characters) per column in the names.dmp and nodes.dmp files that feed the sql database. This may be useful if we wish to optimize the database (i.e. find the best varchar size for the column in question).

#

382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 382

# Report the length of the longest value per column in names.dmp and
# nodes.dmp (both looked up below TAXONOMY_DIR).
#
# For names.dmp the first four columns are measured, for nodes.dmp the
# first three (taxid, parent_taxid and rank). Lines are read through
# readlines() and split via split_at_tabulator(). The numbers are only
# printed; they can serve as a guide for sizing varchar columns.
def find_highest_entries_in_sql
  # Returns the maximum String length found in each of the first
  # n_columns columns of the given file.
  longest_per_column = lambda { |file, n_columns|
    widths = Array.new(n_columns, 0)
    readlines(file).each {|entry|
      fields = split_at_tabulator(entry)
      widths.each_index {|index|
        width = fields[index].to_s.size
        widths[index] = width if width > widths[index]
      }
    }
    widths
  }

  e 'We will find the various highest entries in the sql files.'
  this_file = TAXONOMY_DIR+'names.dmp'
  e 'Starting with '+sfile(this_file)+' first:'
  e 'Names has 4 entries - we are interested in all of them.'
  e this_file
  e 'Please be patient, this may take a while ...'
  taxid_width, name_txt_width, unique_name_width, name_class_width =
    longest_per_column.call(this_file, 4)

  e '  max_taxid       is '+sfancy('%05s' % taxid_width)+' characters long (should be an int anyway).'
  e '  max_name_txt    is '+sfancy('%05s' % name_txt_width.to_s)+' characters long.'
  e '  max_unique_name is '+sfancy('%05s' % unique_name_width.to_s)+' characters long.'
  e '  max_name_class  is '+sfancy('%05s' % name_class_width.to_s)+' characters long.'

  # Next, we will work on nodes.dmp:
  this_file = TAXONOMY_DIR+'nodes.dmp'
  e 'Now working on '+sfile(this_file)+' first:'
  e 'Nodes has 3 relevant entries - taxid, parent_taxid and rank (but in total it has 13 entries)'
  e this_file
  e 'Please be patient, this may take a while ...'
  taxid_width, parent_taxid_width, rank_width =
    longest_per_column.call(this_file, 3)

  e '  max_taxid        is '+
    sfancy('%05s' % taxid_width.to_s)+' characters long.'
  e '  max_parent_taxid is '+
    sfancy('%05s' % parent_taxid_width.to_s)+' characters long.'
  e '  max_rank         is '+
    sfancy('%05s' % rank_width.to_s)+' characters long.'
end

#gencode?Boolean

#

gencode?

Obtain gencode.dmp dataset, “genetic codes” file.

genetic code id -- GenBank genetic code id
abbreviation    -- genetic code name abbreviation
name            -- genetic code name
cde             -- translation table for this genetic code
starts          -- start codons for this genetic code
#

Returns:

  • (Boolean)

216
217
218
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 216

# Load the file named by the constant GENCODE through Taxonomy.load.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def gencode?
  Taxonomy.load(GENCODE)
end

#generate_html_links_for(i) ⇒ Object

#

generate_html_links_for

Input to this method should be a taxonomic ID or an array of taxonomic IDs.

#

110
111
112
113
114
115
116
117
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 110

# Print the NCBI Taxonomy Browser URL for one or several taxonomic IDs.
#
# i - a single ID (anything responding to to_s) or an Array of IDs; nested
#     Arrays are walked recursively.
#
# Every URL is emitted through e(). For Array input the Array itself is
# returned.
def generate_html_links_for(i)
  return i.each { |taxid| generate_html_links_for(taxid) } if i.is_a? Array
  e 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='+i.to_s
end

#get_id_of(id = 9606, be_verbose = true) ⇒ Object

#

get_id_of

Use this method to query the database for a specific ID. For this to work, the database must have the ids.

#

359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 359

# Query the names table for the scientific-name entry of a taxonomic ID.
#
# id         - the taxonomic ID to look up; converted via to_s. Defaults
#              to 9606.
# be_verbose - handed to the first run_sql_query call, which selects the
#              full row(s).
#
# Two queries are issued: the first one selects all columns and its result
# is discarded (it only serves as feedback, depending on be_verbose); the
# second one selects only the taxid in silent, tuples-only mode.
#
# The SQL no longer ends in a stray double quote after the semicolon; the
# statements now have the same shape as the other query builders of this
# module.
#
# NOTE(review): the ID is interpolated into the SQL text as-is; callers
# must only pass trusted, numeric IDs.
#
# Returns the stripped result of the second query.
def get_id_of(
    id = 9606,
    be_verbose = true
  )
  id = id.to_s
  condition = "WHERE taxid=#{id} AND name_class='scientific name' LIMIT 3;"
  run_sql_query("SELECT * from names #{condition}", be_verbose)
  run_sql_query("SELECT taxid from names #{condition}", :silent, :tuples).strip
end

#get_parent_id_of(i) ⇒ Object

#

get_parent_id_of

This method is similar to the method above, but we will fetch the parent id instead.

#

324
325
326
327
328
329
330
331
332
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 324

# Fetch the parent taxonomic ID of the given ID from the nodes table.
#
# i - the taxonomic ID; converted via to_s.
#
# When the input is empty (nil or ''), a notice is printed and nil is
# returned without touching the database - a query without an ID would be
# malformed SQL.
#
# Otherwise the query runs twice: once in the default mode of
# run_sql_query (result discarded) and once in tuples-only mode, whose
# stripped result is returned.
def get_parent_id_of(i)
  if i.to_s.empty?
    e 'No valid input was given to us (in method '+__method__.to_s+').'
    return nil
  end
  cmd = 'SELECT parent_taxid FROM nodes WHERE taxid='+i.to_s+' limit 3;'
  run_sql_query(cmd) # More verbose here.
  run_sql_query(cmd, true, :tuples).strip
end

#get_scientific_name_of(taxid) ⇒ Object Also known as: get_scientific_name

#

get_scientific_name_of

Get the scientific name here. The input should be a taxid.

Usage example:

get_scientific_name_of 333
#

344
345
346
347
348
349
350
351
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 344

# Look up the scientific name that belongs to a taxonomic ID.
#
# taxid - the taxonomic ID; converted via to_s.
#
# The same statement is sent twice through run_sql_query: first in its
# default mode (result discarded), then with the ' --tuples-only' option.
#
# Returns the stripped result of the second call.
def get_scientific_name_of(taxid)
  query = "SELECT name_txt FROM names where taxid=#{taxid} "\
          "AND name_class='scientific name' LIMIT 3;"
  run_sql_query(query)
  run_sql_query(query, true, ' --tuples-only').strip
end

#merged?Boolean

#

merged?

Obtain information from merged.dmp.

Merged nodes file fields has these ids:

old_taxid -- id of nodes which has been merged
new_taxid -- id of nodes which is result of merging
#

Returns:

  • (Boolean)

241
242
243
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 241

# Load the file named by the constant MERGED through Taxonomy.load.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def merged?
  Taxonomy.load(MERGED)
end

#names?(i = NAMES) ⇒ Boolean

#

names?

Obtain the names.dmp dataset, which are “Taxonomy names”.

Four IDs are known for this set:

taxid
name_txt
unique name
name class
#

Returns:

  • (Boolean)

183
184
185
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 183

# Load a names dataset through Taxonomy.load.
#
# i - the file to load; defaults to the constant NAMES.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def names?(i = NAMES)
  Taxonomy.load(i)
end

#nodes?(i = NODES) ⇒ Boolean

#

nodes?

Use this method in order to obtain the nodes.dmp dataset.

Nodes are “taxonomic nodes”.

The description for each node includes the following fields:

taxid        - node id in GenBank taxonomy database
parent taxid - parent node id in GenBank taxonomy database
rank         - rank of this node (superkingdom, kingdom, ...)
embl code    - locus-name prefix; not unique
division id  - see division.dmp file
inherited div flag (1 or 0) - 1 if node inherits division from parent
genetic code id - see gencode.dmp file
inherited GC flag (1 or 0) - 1 if node inherits genetic code from parent
mitochondrial genetic code id - see gencode.dmp file
inherited MGC flag (1 or 0) - 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) - 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag(1 or 0) - 1 if this subtree has no sequence data yet
comments - free-text comments and citations
#

Returns:

  • (Boolean)

156
157
158
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 156

# Load a nodes dataset through Taxonomy.load.
#
# i - the file to load; defaults to the constant NODES.
#
# Returns whatever Taxonomy.load returns for that file (the loaded
# dataset, not a true/false value).
def nodes?(i = NODES)
  Taxonomy.load(i)
end

#pad(i) ⇒ Object

#

pad

The input is something like:

1457406 | Bionia Mart. ex Benth., 1837 | | authority |

but it could also be an array.

#

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 88

# Turn one raw line (or an Array of fields) into the inner part of a
# quoted SQL value list, i.e. the fields joined by "','".
#
# i - a String such as
#       "1457406 | Bionia Mart. ex Benth., 1837 | | authority |"
#     or an Array of fields. Other objects are returned untouched.
#
# For String input the trailing newline and all "'" characters are
# removed before the line is tokenized. The quotes have to go first:
# after tokenizing, the value is an Array and the quotes inside its
# elements could not be reached by a plain include?/delete any more.
# A String without TOKEN is returned as a String.
#
# Array input is joined as-is; its elements are not modified.
def pad(i)
  if i.is_a? String
    # Newlines aren't really useful and "'" would break the quoting.
    i = i.chomp.delete("'")
    # Next, pad sql.
    # splitted = splitted.map {|entry| pad_sql(entry) }
    # ^^^ This may lead to problems however.
    i = tokenize(i) if i.include?(TOKEN)
  end
  i = i.join("','") if i.is_a? Array
  return i
end

#pad_properly(i = "5\t|\tThe domestic cat: perspective on the nature and diversity of cats.\t|\t0\t|\t8603894\t|\t \t|\t\t|\t9685 \t|\n") ⇒ Object

#

pad_properly

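Input is a String: one raw line of a .dmp file, with the fields separated by the main delimiter. We will split it into its fields and pad these for a proper SQL query.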

#

102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/bioroebe/taxonomy/node.rb', line 102

# Convert one raw .dmp line into the inner part of a quoted SQL value
# list: the fields joined by "', '".
#
# i - one raw line as a String; the default is an example line from
#     citations.dmp.
#
# The line is chomped, sanitized and split via split(). Every field is
# then sanitized again, stripped, and its single quotes are doubled (the
# SQL way of escaping "'" inside a quoted literal).
#
# A field that cannot be processed becomes an empty String rather than
# aborting the whole line.
#
# Returns the joined String covering all fields.
def pad_properly(i = "5\t|\tThe domestic cat: perspective on the nature and diversity of cats.\t|\t0\t|\t8603894\t|\t \t|\t\t|\t9685 \t|\n")
  splitted = split( sanitize(i.chomp) )
  splitted = splitted.map {|field|
    begin
      sanitize(field).strip.gsub("'", "''")
    rescue StandardError
      '' # Best effort: keep the column, drop the unusable content.
    end
  }
  return splitted.join("', '")
end

#pad_sql(i) ⇒ Object

#

pad_sql

Escape ' characters here.

#

92
93
94
95
# File 'lib/bioroebe/taxonomy/node.rb', line 92

# Escape single quotes for use inside a quoted SQL string literal.
#
# i - the String to escape.
#
# Every "'" is doubled, which is the standard SQL escape. The previous
# replacement "\'" is the same String as "'" in Ruby, so nothing was
# escaped at all.
#
# Returns the escaped String; input without quotes is returned as-is.
def pad_sql(i)
  i = i.gsub("'", "''") if i.include? "'"
  return i
end

#project_base_dir?Boolean Also known as: base_dir?

#

project_base_dir?

#

Returns:

  • (Boolean)

129
130
131
# File 'lib/bioroebe/taxonomy/class_methods.rb', line 129

# Instance-level accessor for the base directory of the taxonomy
# sub-project.
#
# Delegates to the class method Taxonomy.project_base_dir?, which builds
# the path. The former target, Taxonomy.base_dir?, is only an alias of
# this very instance method; on the self-extended module it would resolve
# back to this body and call itself again.
def project_base_dir?
  Taxonomy.project_base_dir?
end

#remove_delimiter(i) ⇒ Object

#

remove_delimiter

Get rid of the '|' token.

#

504
505
506
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 504

# Return the result of i.delete(TOKEN).
#
# i - the object to strip TOKEN from (presumably a String, in which case
#     String#delete removes every character listed in TOKEN - confirm).
def remove_delimiter(i)
  return i.delete(TOKEN)
end

#return_current_hours_minutes_secondsObject

#

return_current_hours_minutes_seconds

#

485
486
487
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 485

# Thin delegator: returns whatever
# ::Bioroebe.return_current_hours_minutes_seconds returns.
def return_current_hours_minutes_seconds
  ::Bioroebe.return_current_hours_minutes_seconds
end

#return_full_lineage_of(i = nil) ⇒ Object

#

return_full_lineage_of

This method will return an array (or nil) with the full lineage of the given input (which should be a Taxonomic id).

The logic for this method is as follows:

  • Given an arbitrary tax ID as input, we will keep on searching the postgresql database for parent_id entries, until we eventually reach input number 1, which is the mother of all IDs. So when we have 1 finally, we can stop, and return the result (the array in question).

The full lineage is thus given as part of the returned array. The format is to not only return the ID but to also return the scientific name. In other words, our returned array will have this format:

[parent_id, scientific_name]
#

291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 291

# Collect the full lineage of a taxonomic ID, walking from the given ID up
# to the root (ID 1).
#
# i - the taxonomic ID to start from. Without it, a notice is printed and
#     nil is returned.
#
# Returns an Array of [id, scientific_name] pairs. The first pair belongs
# to the input itself, every following pair to the parent of the previous
# one. The walk stops as soon as the last collected ID is 1, so passing 1
# yields exactly one pair rather than a duplicated root entry.
#
# As a safeguard against broken data, the walk is abandoned after 51
# parent lookups; the pairs collected so far are returned in that case.
def return_full_lineage_of(i = nil)
  unless i
    e 'Missing input. Please provide an ID (a number, like 6).'
    return nil
  end
  lineage = [[ i, get_scientific_name_of(i) ]]
  lookups = 0
  until lineage.last.first.to_i == 1
    if lookups > 50
      e 'It seems as if something is not working properly here. We reached'
      e 'a count of 50 now, without finding a parent id (which should be 1).'
      e 'It is quite unlikely that a lineage will have more than 50 subentries'
      e 'so we will now break out of the loop.'
      break
    end
    parent_id = get_parent_id_of(lineage.last.first)
    lineage << [ parent_id, get_scientific_name_of(parent_id) ]
    lookups += 1
  end
  lineage
end

#search_id(i = '7460') ⇒ Object Also known as: search, id?

#

search_id

Search the Taxonomic ID here.

#

165
166
167
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 165

# Emit, through e(), the String built from BASE_URL, 'id=' and the given
# ID.
#
# i - the taxonomic ID; converted via to_s. Defaults to '7460'.
#
# NOTE(review): no lookup is performed here; only the URL is handed to
# e().
def search_id(i = '7460')
  e BASE_URL+'id='+i.to_s
end

#show_current_time_and_dateObject

#

show_current_time_and_date

#

511
512
513
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 511

# Instance-level convenience wrapper around the class method
# Taxonomy.show_current_time_and_date.
def show_current_time_and_date
  Taxonomy.show_current_time_and_date
end

#show_helpObject

#

show_help (help tag)

Shows the options available in the interactive taxonomy 'shell'.

#

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/bioroebe/taxonomy/help/help.rb', line 19

# Print the list of commands that the interactive taxonomy shell offers.
#
# The whole listing is emitted inside a cliner { } block. Most entries go
# through Helpline[command, description]; continuation lines are printed
# with e() and indented based on Helpline::LJUST.
#
# Two entries are conditional: the "nocolours" hint is shown only while
# use_colours? is truthy, and "last_update?" only when the constant
# SHALL_WE_LOG_LAST_UPDATE is truthy.
def show_help
  cliner {
    e N+'The following options are available (pwd: '+sdir(return_pwd)+'):'+
      N+N
    Helpline[:sql,'# Use this to generate the SQL that can be '+
      'used to populate a SQL database with INSERT statements.']
    e ' '*(Helpline::LJUST+3)+'# You can pass values such as 1 '+
      'or 2 (node or name) to this method.'
    Helpline[:info,  '# Show some info about where we store data.']
    Helpline[:instructions?, '# Show instructions.']
    Helpline[:fasta, '# Use this to populate the fasta table.']
    Helpline[:names, '# Use this to populate the names table.']
    Helpline[:nodes, '# Use this to populate the nodes table.']
    # ===================================================================== #
    # Only display the following line when colours are enabled still.
    # ===================================================================== #
    if use_colours?
      Helpline[:nocolours,'# Use this to disable the '+
        'colours. (Use "yescolours" to enable them again)']
    end
    Helpline[:taxid,'# Find out the name of the organism through '\
      'the input ID from the NCBI dataset. For example: "taxid 33"']
    Helpline[:table_names?,'# Use this to show the SQL command '\
      'that was used to generate the SQL Tables.']
    Helpline[:verify,'# Use this to verify that the '\
      '.sql files (nodes and names) are valid.']
    Helpline[:verbose,'# be verbose, in other words provide '\
      'extra information to us whenever feasible']
    Helpline[:ll, '# Show the content of the current working '\
      'directory.']
    Helpline[:id,'# Query the postgre database to get the ID of '\
      'a given species.']
    Helpline[:download,'# Download the remote NCBI database '\
      '(at '+simp(URL_TO_TAXONOMY_ARCHIVE)+')']
    if SHALL_WE_LOG_LAST_UPDATE
      Helpline[:last_update?,'# When did we last update the database']
    end
    Helpline[:update_database, '# download the remote NCBI database, '+
      'extract it, generate nodes.sql and names.sql,']
    _ = ' ' * Helpline::LJUST
    e _+Helpline::PADDING+' # and then populate the '\
      'postgresql-database with this information'
    e # This here to keep a trailing newline.
  }
end

#split(i = citations? ) ⇒ Object

#

split (split tag)

Split up the input on the default delimiter.

#

57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 57

# Split raw taxonomy data on the default delimiter.
#
# i - a String (one raw line) or an Array of such Strings; defaults to
#     citations?.
#
# Array input: every entry is split on MAIN_DELIMITER and an Array of
# Arrays is returned.
#
# String input: the String is sanitized first. If it contains TOKEN but no
# tabulator it is split on TOKEN, otherwise on MAIN_DELIMITER.
#
# Errors while handling a single value are reported rather than raised:
# the error is pretty-printed, the offending value is shown and also
# stored in the global $error, and nil is returned. The report is built
# with interpolation so that a non-String value (e.g. nil) cannot make
# the error handler itself fail. Only StandardError is rescued, so
# interrupts and exit requests pass through.
def split(
    i = citations?
  )
  return i.map {|entry| entry.split(MAIN_DELIMITER) } if i.is_a? Array
  begin
    i = sanitize(i)
    if i.include?(TOKEN) and ! i.include?("\t")
      i.split(TOKEN)
    else
      i.split(MAIN_DELIMITER)
    end
  rescue StandardError => error
    pp error
    e "The string that failed was #{i}"
    $error = i # Kept so that the failing value can be inspected later.
    nil
  end
end

#test(use_this_url = URL1) ⇒ Object

#

test

#

46
47
48
49
50
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 46

# Announce a URL and return the lines of the DIVISION file.
#
# use_this_url - only used for the printed message; defaults to URL1.
#
# NOTE(review): although the message says "Opening URL", nothing remote is
# opened; the data always comes from the local file named by DIVISION.
#
# Returns an Array with the lines of that file.
def test(use_this_url = URL1)
  e 'Opening URL at "'+sfancy(use_this_url)+'"'
  File.readlines(DIVISION)
end

#to_utf(i) ⇒ Object Also known as: sanitize

#

to_utf

#

39
40
41
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 39

# Thin delegator: hands the argument to ::Bioroebe.to_utf and returns its
# result.
#
# i - the value to convert (presumably a String whose encoding should be
#     normalized - confirm against ::Bioroebe.to_utf).
def to_utf(i)
  ::Bioroebe.to_utf(i)
end

#verify_proper_sql_structuresObject

#

verify_proper_sql_structures (debug tag)

We will try to verify that the SQL commands are accurate.

Can also be called by issuing this:

Taxonomy.verify_proper_sql_structures
#

461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/bioroebe/taxonomy/taxonomy.rb', line 461

# Show the beginning of the generated .sql files (NODES_SQL and NAMES_SQL)
# so that their structure can be checked by eye.
#
# For every file that exists, its size in bytes and at most the first
# 975 characters are printed; a missing file is reported instead.
#
# Only the preview is read from disk (the files can be large), and the
# preview is limited to exactly n_chars_to_show characters.
def verify_proper_sql_structures
  n_chars_to_show = 975
  files = [ NODES_SQL, NAMES_SQL ]
  e 'We will now attempt to verify that the SQL structure is proper.'
  e 'We have these '+sfancy(files.size.to_s)+' .sql files.'
  pp files
  files.each {|entry|
    if File.exist? entry
      size = File.size(entry).to_s # File.size reports bytes.
      # Read lazily, character by character, and stop after the preview.
      chunk = File.open(entry) {|file|
        file.each_char.first(n_chars_to_show).join
      }
      e 'File '+sfile(entry)+' (Filesize: '+sfancy(size)+
        ' bytes) has this content '+
        '(showing up to '+simp(n_chars_to_show.to_s)+' chars):'
      cliner { e chunk }; e
    else
      e 'We can not verify the sql structure because the file'
      e 'at `'+sfile(entry)+'` does not exist.'
    end
  }
end