Class: Update

Inherits:
Object
  • Object
show all
Defined in:
lib/ncbi_taxonomy_update.rb

Overview

Later, I should fix this.

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Update

Returns a new instance of Update.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/ncbi_taxonomy_update.rb', line 9

# Prepares the working directory and decides whether the local taxonomy
# database needs to be refreshed.
#
# All paths live under "$HOME/.ncbi_taxonomy". The process's current working
# directory is changed to that directory, because #download_md5 writes its
# output to a path relative to the current directory.
#
# The outcome is stored in @status (read it through #status):
# * a String message when "$HOME/.ncbi_taxonomy" exists but is not a directory
#   (nothing else is done in that case);
# * +false+ when a checksum file from a previous run exists and the freshly
#   downloaded one has identical content;
# * +true+ in every other case (first run, no previous checksum, or the
#   checksum changed).
#
# @return [Update] a new instance of Update
def initialize
  @home_dir = Dir.home
  @work_dir = @home_dir + "/.ncbi_taxonomy"
  md5_file = @work_dir + "/taxdump.tar.gz.md5"
  md5_old_file = @work_dir + "/taxdump.tar.gz.md5.old"
  @taxdb = @work_dir + "/taxonomy.db.prep"
  @taxdb_release = @work_dir + "/taxonomy.db"
  @status = nil

  # Check the working directory; a plain file with the same name blocks us.
  if File.exist?(@work_dir) && !File.directory?(@work_dir)
    @status = "This software uses $HOME/.ncbi_taxonomy directory. However, in your home directory there is same name of file. We recommend you change that file name to another name."
    return
  end

  # Create the working directory on first use, then work from inside it.
  Dir.mkdir @work_dir unless File.exist?(@work_dir)
  Dir.chdir @work_dir

  # No checksum from an earlier run: fetch one and report "update needed".
  unless File.exist?(md5_file)
    download_md5
    @status = true
    return
  end

  # Keep the previous checksum aside and fetch the current one. File
  # operations are done in-process so that paths containing spaces or shell
  # metacharacters cannot break (or be interpreted by) a shell command.
  File.delete(md5_old_file) if File.exist?(md5_old_file)
  File.rename(md5_file, md5_old_file)
  download_md5

  # A missing new checksum file counts as "different", mirroring the
  # non-zero exit status `diff` reports for an unreadable operand.
  unchanged = File.exist?(md5_file) &&
              File.binread(md5_file) == File.binread(md5_old_file)
  @status = !unchanged
end

Instance Method Details

#do ⇒ Object



87
88
89
90
91
92
# File 'lib/ncbi_taxonomy_update.rb', line 87

# Runs the complete update pipeline, one step after the other:
# fetch and unpack the dump, rewrite the dump files, load them into the
# staging database, and finally promote the staging database.
#
# @return [Object] whatever #release returns
def do
  download_dump   # fetch + unpack the archive
  substitution    # produce the *.dmp.trim files
  load_db         # build the staging database
  release         # swap the staging database into place
end

#download_dump ⇒ Object



59
60
61
# File 'lib/ncbi_taxonomy_update.rb', line 59

# Downloads taxdump.tar.gz from the NCBI FTP server with curl and pipes it
# straight into tar, which unpacks it into the current working directory.
# curl's error output is discarded.
#
# @return [String] the standard output captured from the shell pipeline
def download_dump
  fetch_and_unpack = "curl -s ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 2>/dev/null | tar zxf - "
  %x(#{fetch_and_unpack})
end

#download_md5 ⇒ Object



55
56
57
# File 'lib/ncbi_taxonomy_update.rb', line 55

# Downloads the checksum file for taxdump.tar.gz from the NCBI FTP server
# with curl and stores it as "taxdump.tar.gz.md5" in the current working
# directory. curl's error output is discarded.
#
# @return [String] the standard output captured from the shell command
def download_md5
  fetch_checksum = "curl -s ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz.md5 > taxdump.tar.gz.md5 2>/dev/null"
  %x(#{fetch_checksum})
end

#load_db ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/ncbi_taxonomy_update.rb', line 94

# Builds the staging SQLite database (@taxdb) from the "*.dmp.trim" files in
# the working directory by feeding a script to the sqlite3 command-line shell.
#
# The script creates one table and one covering index per dump file and then
# imports each tab-separated ".dmp.trim" file into its table.
#
# Differences from a naive `echo "#{sql}" | sqlite3 ...` invocation, all of
# which are deliberate:
# * the script is written to sqlite3's standard input directly, so no shell
#   (and no shell-specific `echo` escape handling) sits in between;
# * the column separator is given as "\t" in double quotes, which the sqlite3
#   shell itself expands to a tab character;
# * sqlite3 is started inside @work_dir, because the .import commands use
#   file names relative to the current directory;
# * a staging database left behind by an interrupted run is removed first;
#   otherwise CREATE TABLE would fail and rows would be imported twice.
#
# @return [String] the standard output produced by sqlite3
# @raise [Errno::ENOENT] if the sqlite3 executable cannot be found
def load_db
  # Dot-commands must start in column 0, hence the unindented heredoc body.
  sql = <<-'SQL'
PRAGMA page_size=4096;
PRAGMA main.locking_mode=EXCLUSIVE;

.separator "\t"

CREATE TABLE citations (
cit_id BIGINT,
cit_key VARCHAR(255),
pubmed_id BIGINT,
medline_id BIGINT,
ur LONGTEXT,
text LONGTEXT,
taxid_list LONGTEXT
);

CREATE TABLE delnodes (
tax_id BIGINT
);

CREATE TABLE division (
division_id     BIGINT,
division_cde VARCHAR(255),
division_name VARCHAR(255),
comments VARCHAR(255)
);

CREATE TABLE gencode (
genetic_code_id INT,
abbreviation VARCHAR(255),
name VARCHAR(255),
cde LONGTEXT,
starts LONGTEXT
);

CREATE TABLE merged (
old_tax_id BIGINT,
new_tax_id BIGINT
);

CREATE TABLE names (
tax_id BIGINT,
name_txt VARCHAR(255),
unique_name VARCHAR(255),
name_class VARCHAR(255)
);

CREATE TABLE nodes (
tax_id BIGINT,
parent_tax_id BIGINT,
rank VARCHAR(64),
embl_code VARCHAR(64),
division_id INTEGER,
inherited_div_flag BOOLEAN,
genetic_code_id INTEGER,
inherited_GC_flag BOOLEAN,
mitochondrial_genetic_code_id INTEGER,
inherited_MGC_flag BOOLEAN,
GenBank_hidden_flag BOOLEAN,
hidden_subtree_root_flag BOOLEAN,
comments VARCHAR(255)
);


CREATE INDEX citations_idx ON citations(cit_id,cit_key,pubmed_id,medline_id,ur,text,taxid_list);
CREATE INDEX delnodes_idx ON delnodes(tax_id);
CREATE INDEX division_idx ON division(division_id,division_cde,division_name,comments);
CREATE INDEX gencode_idx ON gencode(genetic_code_id,abbreviation,name,cde,starts);
CREATE INDEX merged_idx ON merged(old_tax_id,new_tax_id);
CREATE INDEX names_idx ON names(tax_id,name_txt,unique_name,name_class);
CREATE INDEX nodes_idx ON nodes(tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments);


.import citations.dmp.trim citations
.import delnodes.dmp.trim delnodes
.import division.dmp.trim division
.import gencode.dmp.trim gencode
.import merged.dmp.trim merged
.import names.dmp.trim names
.import nodes.dmp.trim nodes
  SQL

  # Start from an empty staging database every time.
  File.delete(@taxdb) if File.exist?(@taxdb)

  # Array form: sqlite3 is executed directly, so @taxdb needs no quoting.
  IO.popen(["sqlite3", @taxdb], "r+", chdir: @work_dir) do |sqlite|
    sqlite.write(sql)
    sqlite.close_write # signal end of script so sqlite3 can finish
    sqlite.read
  end
end

#release ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
# File 'lib/ncbi_taxonomy_update.rb', line 75

# Promotes the staging database (@taxdb) to the live database
# (@taxdb_release), keeping the previous live database as "<live>.old".
#
# The staging database is checked first: if it is missing, the method raises
# before anything is moved, so a failed build can never leave the
# installation without a live database.
#
# @return [Integer] 0, the result of the final File.rename
# @raise [Errno::ENOENT] if the staging database does not exist
def release
  raise Errno::ENOENT, @taxdb unless File.exist?(@taxdb)

  backup = "#{@taxdb_release}.old"
  # Drop the backup of the generation before last, if there is one.
  File.delete(backup) if File.exist?(backup)
  # On the very first release there is no live database to back up.
  File.rename(@taxdb_release, backup) if File.exist?(@taxdb_release)
  File.rename(@taxdb, @taxdb_release)
end

#status ⇒ Object



51
52
53
# File 'lib/ncbi_taxonomy_update.rb', line 51

# Reader for the value stored in @status.
#
# @return [Object] the current value of @status (nil when it was never set)
def status
  @status
end

#substitution ⇒ Object

Substitutes some characters in each dump file and writes the result to a ".trim" copy.



64
65
66
67
68
69
70
71
72
73
# File 'lib/ncbi_taxonomy_update.rb', line 64

# Rewrites every file in @work_dir whose name ends in "dmp" into a sibling
# file with ".trim" appended to its name.
#
# For each line the bytes are interpreted as ISO-8859-1 and converted to
# UTF-8, then:
# * the "<TAB>|<TAB>" field delimiter becomes a single tab,
# * the "<TAB>|" terminator at the end of the line is removed,
# * every double quote is replaced by "%22".
#
# Files are streamed line by line (all substitutions are line-local) and both
# handles are closed by the block form of File.open, so large dumps are not
# held in memory and no file descriptor is leaked.
#
# @return [Array<String>] the directory entries that were scanned
def substitution
  Dir.entries(@work_dir).each do |file|
    next unless file =~ /dmp$/

    #STDERR.puts "treating #{file}"
    dump_path = File.join(@work_dir, file)
    File.open("#{dump_path}.trim", "w") do |out|
      # Binary mode: take the raw bytes, untouched by any default encoding.
      File.open(dump_path, "rb") do |dump|
        dump.each_line do |line|
          text = line.force_encoding('iso-8859-1').encode('utf-8')
          # NOTE(review): as written this replaces a space with a space and so
          # changes nothing; the pattern may originally have contained another
          # whitespace character - confirm against the upstream source.
          text = text.gsub(/([^|]) ([^|])/, '\1 \2')
          text = text.gsub(/\t\|\t/, "\t").gsub(/\t\|$/, "").gsub(/\"/, "%22")
          out << text
        end
      end
    end
  end
end