Class: Update

Inherits:
Object
  • Object
show all
Defined in:
lib/ncbi_taxonomy_update.rb

Overview

Later, I should fix this.

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Update

Returns a new instance of Update.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/ncbi_taxonomy_update.rb', line 9

# Prepares the working directory ($HOME/.ncbi_taxonomy) and decides whether
# the taxonomy dump published by NCBI differs from the one seen on the
# previous run, by comparing the freshly downloaded MD5 file with the
# previously stored one.
#
# After construction, #status holds one of:
#   true   - no earlier checksum was stored, or the checksum has changed
#   false  - the downloaded checksum is identical to the stored one
#   String - an error message: the working-directory path exists but is not
#            a directory (nothing else has been done in that case)
#
# Side effect: the process working directory is changed to the working
# directory, because download_md5 writes its output to a relative path.
def initialize
	@home_dir = Dir.home
	@work_dir = @home_dir + "/.ncbi_taxonomy"
	md5_file = @work_dir + "/taxdump.tar.gz.md5"
	md5_old_file = @work_dir + "/taxdump.tar.gz.md5.old"
	@taxdb = @work_dir + "/taxonomy.db.prep"
	@taxdb_release = @work_dir + "/taxonomy.db"
	@status = nil

	# Check the working directory; refuse to continue if a non-directory
	# already occupies its path.
	if File.exist?(@work_dir) && !File.directory?(@work_dir)
		@status = "This software uses $HOME/.ncbi_taxonomy directory. However, in your home directory there is same name of file. We recommend you change that file name to another name."
		return
	end

	Dir.mkdir(@work_dir) unless File.exist?(@work_dir)
	Dir.chdir(@work_dir)

	# First run (no stored checksum): just fetch it and report "update needed".
	unless File.exist?(md5_file)
		download_md5
		@status = true
		return
	end

	# Rotate the stored checksum and fetch the current one. File operations
	# are done in-process rather than through a shell so that paths containing
	# spaces or shell metacharacters are handled correctly.
	File.delete(md5_old_file) if File.exist?(md5_old_file)
	File.rename(md5_file, md5_old_file)
	download_md5

	# A missing new checksum file counts as "changed", as a failing diff did.
	unchanged = File.exist?(md5_file) &&
		File.binread(md5_file) == File.binread(md5_old_file)
	@status = !unchanged
	return
end

Instance Method Details

#do ⇒ Object



87
88
89
90
91
92
# File 'lib/ncbi_taxonomy_update.rb', line 87

# Runs the whole update pipeline in order: download the dump, rewrite the
# dump files, load them into the preparation database, then promote that
# database. Returns whatever #release returns.
def do
	download_dump
	substitution
	load_db
	release
end

#download_dump ⇒ Object



59
60
61
# File 'lib/ncbi_taxonomy_update.rb', line 59

# Streams taxdump.tar.gz from the NCBI server with curl and unpacks it with
# tar into the current working directory. curl's stderr is discarded.
# Returns the captured standard output of the shell pipeline.
def download_dump
	fetch_and_unpack = "curl -s https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 2>/dev/null | tar zxf - "
	`#{fetch_and_unpack}`
end

#download_md5 ⇒ Object



55
56
57
# File 'lib/ncbi_taxonomy_update.rb', line 55

# Downloads the MD5 checksum file of the taxonomy dump with curl and saves it
# as taxdump.tar.gz.md5 in the current working directory. curl's stderr is
# discarded. Returns the captured standard output of the shell command.
def download_md5
	fetch_checksum = "curl -s https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz.md5 > taxdump.tar.gz.md5 2>/dev/null"
	`#{fetch_checksum}`
end

#load_db ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/ncbi_taxonomy_update.rb', line 94

# Builds the preparation database (@taxdb) by feeding a script to the sqlite3
# command-line shell: it creates the seven taxonomy tables, imports the
# tab-separated *.dmp.trim files from the current working directory, and
# finally creates the indexes.
#
# Returns the text sqlite3 printed on standard output.
# Raises Errno::ENOENT when the sqlite3 executable cannot be found.
def load_db
	# Note: this heredoc processes escapes, so '\t' below reaches sqlite3 as a
	# literal tab character between the quotes.
	# Indexes are created after the imports so the bulk load does not have to
	# maintain them row by row; the resulting tables and indexes are the same.
	sql = <<EOF
PRAGMA page_size=4096;
PRAGMA main.locking_mode=EXCLUSIVE;

.separator '\t'

CREATE TABLE citations (
cit_id BIGINT,
cit_key VARCHAR(255),
pubmed_id BIGINT,
medline_id BIGINT,
ur LONGTEXT,
text LONGTEXT,
taxid_list LONGTEXT
);

CREATE TABLE delnodes (
tax_id BIGINT
);

CREATE TABLE division (
division_id     BIGINT,
division_cde VARCHAR(255),
division_name VARCHAR(255),
comments VARCHAR(255)
);

CREATE TABLE gencode (
genetic_code_id INT,
abbreviation VARCHAR(255),
name VARCHAR(255),
cde LONGTEXT,
starts LONGTEXT
);

CREATE TABLE merged (
old_tax_id BIGINT,
new_tax_id BIGINT
);

CREATE TABLE names (
tax_id BIGINT,
name_txt VARCHAR(255),
unique_name VARCHAR(255),
name_class VARCHAR(255)
);

CREATE TABLE nodes (
tax_id BIGINT,
parent_tax_id BIGINT,
rank VARCHAR(64),
embl_code VARCHAR(64),
division_id INTEGER,
inherited_div_flag BOOLEAN,
genetic_code_id INTEGER,
inherited_GC_flag BOOLEAN,
mitochondrial_genetic_code_id INTEGER,
inherited_MGC_flag BOOLEAN,
GenBank_hidden_flag BOOLEAN,
hidden_subtree_root_flag BOOLEAN,
comments VARCHAR(255)
);


.import citations.dmp.trim citations
.import delnodes.dmp.trim delnodes
.import division.dmp.trim division
.import gencode.dmp.trim gencode
.import merged.dmp.trim merged
.import names.dmp.trim names
.import nodes.dmp.trim nodes


CREATE INDEX citations_idx ON citations(cit_id,cit_key,pubmed_id,medline_id,ur,text,taxid_list);
CREATE INDEX delnodes_idx ON delnodes(tax_id);
CREATE INDEX division_idx ON division(division_id,division_cde,division_name,comments);
CREATE INDEX gencode_idx ON gencode(genetic_code_id,abbreviation,name,cde,starts);
CREATE INDEX merged_idx ON merged(old_tax_id,new_tax_id);
CREATE INDEX names_idx ON names(tax_id,name_txt,unique_name,name_class);
CREATE INDEX nodes_idx ON nodes(tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments);

EOF

	# A preparation database left behind by an interrupted run would make the
	# CREATE TABLE statements fail and the imports append duplicate rows, so
	# always start from an empty file.
	File.delete(@taxdb) if File.exist?(@taxdb)

	# Run sqlite3 directly (argv form, no shell) so the database path needs no
	# quoting, and feed the script through its standard input.
	IO.popen(["sqlite3", @taxdb], "r+") do |sqlite|
		sqlite.write(sql)
		sqlite.close_write
		sqlite.read
	end
end

#release ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
# File 'lib/ncbi_taxonomy_update.rb', line 75

# Promotes the preparation database (@taxdb) to the release location
# (@taxdb_release), keeping the previous release as "<release>.old".
#
# Raises Errno::ENOENT when the preparation database does not exist. The check
# is made before anything is rotated, so a failed build never leaves the
# installation without its current release database.
def release
	raise Errno::ENOENT, @taxdb unless File.exist?(@taxdb)

	previous = "#{@taxdb_release}.old"
	begin
		File.delete(previous)
	rescue Errno::ENOENT
		# No earlier backup to discard.
	end
	begin
		File.rename(@taxdb_release, previous)
	rescue Errno::ENOENT
		# First release: there is no current database to back up.
	end
	File.rename(@taxdb, @taxdb_release)
end

#status ⇒ Object



51
52
53
# File 'lib/ncbi_taxonomy_update.rb', line 51

# Returns the state recorded by the constructor in @status: true, false,
# nil, or an error-message String.
def status
	@status
end

#substitution ⇒ Object

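Rewrites every file in the working directory whose name ends in "dmp" into a companion ".trim" file: the text is converted from ISO-8859-1 to UTF-8, tabs inside field text are replaced with spaces, the tab-pipe-tab field delimiter becomes a single tab, the trailing tab-pipe at the end of each line is removed, and double quotes are replaced with %22.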



64
65
66
67
68
69
70
71
72
73
# File 'lib/ncbi_taxonomy_update.rb', line 64

# Converts every regular file in @work_dir whose name ends in "dmp" into a
# tab-separated companion file named "<name>.trim".
#
# For each file the content is read as ISO-8859-1, transcoded to UTF-8, and
# then rewritten as follows:
#   1. a tab that is neither preceded nor followed by "|" is text inside a
#      field and becomes a space;
#   2. the "\t|\t" field delimiter becomes a single tab;
#   3. the "\t|" terminator at the end of each line is removed;
#   4. double quotes become %22.
#
# Returns the array of directory entries that was iterated.
def substitution
	Dir.entries(@work_dir).each do |file|
		next unless file.end_with?("dmp")

		source = File.join(@work_dir, file)
		next unless File.file?(source)

		# File.read closes the file; the previous File.open(...).read left the
		# handle open until garbage collection.
		text = File.read(source).force_encoding('iso-8859-1').encode('utf-8')

		# Lookarounds test the neighbouring characters without consuming them, so
		# runs such as "a\tb\tc" or consecutive embedded tabs are fully replaced.
		# A consuming pattern skipped every second tab, which shifted columns on
		# import.
		without_inner_tabs = text.gsub(/(?<=[^|])\t(?=[^|])/, ' ')
		delimited = without_inner_tabs.gsub(/\t\|\t/, "\t").gsub(/\t\|$/, "")
		File.write(source + ".trim", delimited.gsub('"', "%22"))
	end
end