Module: MiGA::Cli::Action::Download::Base
Overview
Helper module including download functions for the *_get actions
Instance Method Summary collapse
- #cli_base_flags(opt) ⇒ Object
- #cli_filters(opt) ⇒ Object
- #cli_save_actions(opt) ⇒ Object
- #discard_excluded(ds) ⇒ Object
- #download_entries(ds, p) ⇒ Object
- #exclude_newer(ds) ⇒ Object
- #finalize_tasks(d, downloaded) ⇒ Object
- #generic_perform ⇒ Object
- #impose_limit(ds) ⇒ Object
- #load_ncbi_taxonomy_dump ⇒ Object
- #load_tasks ⇒ Object
-
#save_entry(name, body, p) ⇒ Object
Saves the (generic remote) entry identified by
name
withbody
into the projectp
, and returnstrue
on success andfalse
otherwise. - #unlink_entries(p, unlink) ⇒ Object
Instance Method Details
#cli_base_flags(opt) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/miga/cli/action/download/base.rb', line 10
##
# Registers on +opt+ the flags shared by the download actions:
# +--max-download+ (stored in <tt>cli[:max_datasets]</tt>, coerced to
# Integer) and +-m+ / +--metadata+ (stored in <tt>cli[:metadata]</tt>).
# +opt+ is expected to respond to +on+ in the style of OptionParser.
def cli_base_flags(opt)
  opt.on(
    '--max-download INT', Integer,
    'Maximum number of datasets to download (by default: unlimited)'
  ) do |limit|
    cli[:max_datasets] = limit
  end
  opt.on(
    '-m', '--metadata STRING',
    'Metadata as key-value pairs separated by = and delimited by comma',
    'Values are saved as strings except for booleans (true / false) or nil'
  ) do |pairs|
    cli[:metadata] = pairs
  end
end
#cli_filters(opt) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/miga/cli/action/download/base.rb', line 22
##
# Registers on +opt+ the flags that filter which remote datasets are
# processed: +--exclude+, +--dry+, +--ignore-until+, +--ignore-removed+,
# +--get-metadata+ and +--updated-before+. Each value is stored in +cli+
# under the matching key; +--updated-before+ is parsed with DateTime.parse
# before being stored.
def cli_filters(opt)
  opt.on(
    '--exclude PATH',
    'A file with dataset names to exclude'
  ) do |path|
    cli[:exclude] = path
  end
  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
  opt.on(
    '--ignore-until STRING',
    'Ignores all datasets until a name is found (useful for large reruns)'
  ) do |name|
    cli[:ignore_until] = name
  end
  opt.on(
    '--ignore-removed',
    'Ignores entries removed from NCBI (by default fails on removed entries)'
  ) do |flag|
    cli[:ignore_removed] = flag
  end
  cli.opt_flag(
    opt, 'get-metadata',
    'Only download and update metadata for existing datasets', :get_md
  )
  opt.on(
    '--updated-before DATE',
    'Only download metadata for datasets last updated before the given date',
    'Requires --get-metadata, supports date or date-time'
  ) do |date|
    cli[:updated_before] = DateTime.parse(date)
  end
end
#cli_save_actions(opt) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/miga/cli/action/download/base.rb', line 47
##
# Registers on +opt+ the flags controlling how downloaded datasets are
# saved: +--only-metadata+, +--save-every+, +-q+ / +--query+,
# +-u+ / +--unlink+, +-R+ / +--remote-list+ and +--ncbi-taxonomy-dump+.
# Each value is stored in +cli+ under the matching key.
def cli_save_actions(opt)
  cli.opt_flag(
    opt, 'only-metadata',
    'Create datasets without input data but retrieve all metadata',
    :only_md
  )
  opt.on(
    '--save-every INT', Integer,
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ) do |interval|
    cli[:save_every] = interval
  end
  opt.on(
    '-q', '--query',
    'Register the datasets as queries, not reference datasets'
  ) do |flag|
    cli[:query] = flag
  end
  opt.on(
    '-u', '--unlink',
    'Unlink all datasets in the project missing from the download list'
  ) do |flag|
    cli[:unlink] = flag
  end
  opt.on(
    '-R', '--remote-list PATH',
    'Path to an output file with the list of all datasets listed remotely'
  ) do |path|
    cli[:remote_list] = path
  end
  opt.on(
    '--ncbi-taxonomy-dump [path]',
    'Path to an NCBI Taxonomy dump directory to query instead of API calls',
    'If the path is not passed, the dump is automatically downloaded'
  ) do |path|
    # The argument is optional: a bare flag stores +true+ instead of a path
    cli[:ncbi_taxonomy_dump] = path || true
  end
end
#discard_excluded(ds) ⇒ Object
135 136 137 138 139 140 141 142 143 144 |
# File 'lib/miga/cli/action/download/base.rb', line 135
##
# Deletes from +ds+ every key named in the file at <tt>cli[:exclude]</tt>
# (one name per line; lines starting with +#+ are skipped) and returns
# +ds+. When <tt>cli[:exclude]</tt> is nil, +ds+ is returned untouched.
def discard_excluded(ds)
  return ds if cli[:exclude].nil?

  cli.say "Discarding datasets in #{cli[:exclude]}"
  File.readlines(cli[:exclude]).each do |line|
    next if line =~ /^#/

    ds.delete(line.chomp)
  end
  ds
end
#download_entries(ds, p) ⇒ Object
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/miga/cli/action/download/base.rb', line 166
##
# Iterates over the +ds+ entries (name => body) and saves each one into
# project +p+ through +save_entry+, unless <tt>cli[:dry]</tt> is set.
#
# Entries are skipped while the <tt>cli[:ignore_until]</tt> name has not
# been reached, and whenever the dataset's absence from +p+ equals
# <tt>cli[:get_md]</tt>. Skipped entries still appear in the listed names.
#
# Returns a two-element Array: the listed names (minus those for which
# +save_entry+ returned false) and the number of processed entries.
def download_entries(ds, p)
  noun = ds.size == 1 ? 'entry' : 'entries'
  cli.say "Downloading #{ds.size} #{noun}"

  # Unless saving after every dataset, suspend project auto-saving and
  # save explicitly (periodically and/or at the end)
  deferred_save = cli[:save_every] != 1
  p.do_not_save = true if deferred_save

  skipping = !cli[:ignore_until].nil?
  count = 0
  listed = []
  ds.each do |name, body|
    listed << name
    cli.puts name
    skipping = false if skipping && name == cli[:ignore_until]
    next if skipping || p.dataset(name).nil? == cli[:get_md]

    count += 1
    next if cli[:dry]

    if save_entry(name, body, p)
      p.save! if cli[:save_every] > 1 && (count % cli[:save_every]).zero?
    else
      # The entry could not be saved: take it back out of both tallies
      count -= 1
      listed.pop
    end
  end

  p.do_not_save = false
  p.save! if deferred_save
  [listed, count]
end
#exclude_newer(ds) ⇒ Object
146 147 148 149 150 151 152 153 154 |
# File 'lib/miga/cli/action/download/base.rb', line 146
##
# Filters +ds+ down to the entries whose dataset already exists in the
# project and whose +:updated+ metadata value parses (DateTime.parse) to a
# moment earlier than <tt>cli[:updated_before]</tt>.
#
# Returns +ds+ itself when <tt>cli[:updated_before]</tt> is not set;
# otherwise returns the result of +ds.select+.
def exclude_newer(ds)
  return ds unless cli[:updated_before]

  project = cli.load_project
  ds.select do |name|
    d = project.dataset(name)
    # The update timestamp is read from the dataset's metadata
    d && DateTime.parse(d.metadata[:updated]) < cli[:updated_before]
  end
end
#finalize_tasks(d, downloaded) ⇒ Object
119 120 121 122 123 124 125 126 127 128 |
# File 'lib/miga/cli/action/download/base.rb', line 119
##
# Reports how many datasets were listed (+d.size+) and how many were
# downloaded (or would be, under <tt>cli[:dry]</tt>). When
# <tt>cli[:remote_list]</tt> is set, writes the names in +d+ to that file,
# one per line.
def finalize_tasks(d, downloaded)
  cli.say "Datasets listed: #{d.size}"
  verb = cli[:dry] ? 'to download' : 'downloaded'
  cli.say "Datasets #{verb}: #{downloaded}"
  return if cli[:remote_list].nil?

  File.open(cli[:remote_list], 'w') do |fh|
    d.each do |entry|
      fh.puts entry
    end
  end
end
#generic_perform ⇒ Object
78 79 80 81 82 83 84 85 |
# File 'lib/miga/cli/action/download/base.rb', line 78
##
# Runs the complete download workflow: loads the project and the filtered
# remote list (+load_tasks+), downloads the entries, reports the outcome
# (+finalize_tasks+) and, when <tt>cli[:unlink]</tt> is set, unlinks every
# project dataset that was not in the download list.
def generic_perform
  project, remote = load_tasks
  listed, count = download_entries(remote, project)

  # Finalize
  finalize_tasks(listed, count)
  return unless cli[:unlink]

  unlink_entries(project, project.dataset_names - listed)
end
#impose_limit(ds) ⇒ Object
156 157 158 159 160 161 162 163 164 |
# File 'lib/miga/cli/action/download/base.rb', line 156
##
# Randomly subsamples +ds+ in place down to <tt>cli[:max_datasets]</tt>
# entries when that limit is non-zero and smaller than +ds.size+.
# The surviving entries keep their original order. Returns +ds+.
def impose_limit(ds)
  max = cli[:max_datasets].to_i
  if !max.zero? && max < ds.size
    cli.say "Subsampling list from #{ds.size} to #{max} datasets"
    # Index the sampled keys in a Hash so each membership test is O(1)
    # rather than a linear scan of the sample for every entry of +ds+
    keep = ds.keys.sample(max).each_with_object({}) { |k, h| h[k] = true }
    ds.select! { |k, _| keep.key?(k) }
  end
  ds
end
#load_ncbi_taxonomy_dump ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/miga/cli/action/download/base.rb', line 98
##
# Hands an NCBI Taxonomy dump directory to
# MiGA::RemoteDataset.use_ncbi_taxonomy_dump when
# <tt>cli[:ncbi_taxonomy_dump]</tt> is set; does nothing otherwise.
#
# If the option holds a path, that directory is used as is. If it is
# exactly +true+, +taxdump.tar.gz+ is fetched from the NCBI FTP site into a
# temporary directory and extracted there with +tar+ before being used.
def load_ncbi_taxonomy_dump
  dump = cli[:ncbi_taxonomy_dump]
  return unless dump

  unless dump == true
    cli.say "Reading NCBI Taxonomy dump: #{dump}"
    return MiGA::RemoteDataset.use_ncbi_taxonomy_dump(dump, cli)
  end

  cli.say 'Downloading and reading NCBI Taxonomy dump'
  Dir.mktmpdir do |dir|
    file = 'taxdump.tar.gz'
    path = File.join(dir, file)
    url = format('https://ftp.ncbi.nih.gov/pub/taxonomy/%s', file)
    File.open(path, 'wb') do |fh|
      fh.print MiGA::MiGA.net_method(:get, url)
    end
    MiGA::MiGA.run_cmd(format('cd "%s" && tar -zxf "%s"', dir, file))
    # Must run inside the block: +dir+ is removed when the block exits
    MiGA::RemoteDataset.use_ncbi_taxonomy_dump(dir, cli)
  end
end
#load_tasks ⇒ Object
87 88 89 90 91 92 93 94 95 96 |
# File 'lib/miga/cli/action/download/base.rb', line 87
##
# Prepares a download run: sanitizes the CLI options, loads the project,
# loads the NCBI Taxonomy dump (if requested) and builds the remote list,
# which is then passed through +discard_excluded+, +exclude_newer+ and
# +impose_limit+, in that order.
#
# Returns a two-element Array: the project and the filtered remote list.
def load_tasks
  sanitize_cli
  project = cli.load_project
  load_ncbi_taxonomy_dump
  remote = impose_limit(exclude_newer(discard_excluded(remote_list)))
  [project, remote]
end
#save_entry(name, body, p) ⇒ Object
Saves the (generic remote) entry identified by name
with body
into the project p
, and returns true
on success and false
otherwise
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
# File 'lib/miga/cli/action/download/base.rb', line 196
##
# Saves the (generic remote) entry identified by +name+ with +body+ into
# the project +p+, and returns +true+ on success and +false+ otherwise.
#
# +body+ is read as a Hash with the keys +:ids+, +:db+, +:universe+ (passed
# to MiGA::RemoteDataset.new) and +:md+ (the metadata Hash, flagged with
# <tt>metadata_only: true</tt> when <tt>cli[:only_md]</tt> is set).
#
# With <tt>cli[:get_md]</tt> only the metadata of the existing dataset is
# updated; otherwise the dataset is created (as reference unless
# <tt>cli[:query]</tt>) and the CLI-provided metadata is added to it.
#
# Raises MiGA::RemoteDataMissingError when the remote entry is missing,
# unless <tt>cli[:ignore_removed]</tt> is set, in which case it returns
# +false+.
def save_entry(name, body, p)
  cli.say " Locating remote dataset: #{name}"
  body[:md][:metadata_only] = true if cli[:only_md]
  rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
  if cli[:get_md]
    cli.say ' Updating dataset'
    rd.update_metadata(p.dataset(name), body[:md])
  else
    cli.say ' Creating dataset'
    rd.save_to(p, name, !cli[:query], body[:md])
    cli.add_metadata(p.add_dataset(name))
  end
  true
rescue MiGA::RemoteDataMissingError
  # Re-raise the original error unless removed entries are to be ignored
  raise unless cli[:ignore_removed]

  cli.say " Removed dataset ignored: #{name}"
  false
end
#unlink_entries(p, unlink) ⇒ Object
130 131 132 133 |
# File 'lib/miga/cli/action/download/base.rb', line 130
##
# Unlinks from project +p+ every dataset named in +unlink+, calls
# +remove!+ on each unlinked dataset, and reports how many names were
# processed.
def unlink_entries(p, unlink)
  unlink.each do |name|
    dataset = p.unlink_dataset(name)
    dataset.remove!
  end
  cli.say "Datasets unlinked: #{unlink.size}"
end