Module: MiGA::Cli::Action::NcbiGet::Downloads
- Included in:
- MiGA::Cli::Action::NcbiGet
- Defined in:
- lib/miga/cli/action/ncbi_get/downloads.rb
Overview
Helper module including download functions for the ncbi_get action
Instance Method Summary
- #cli_filters(opt) ⇒ Object
- #cli_name_modifiers(opt) ⇒ Object
- #cli_save_actions(opt) ⇒ Object
- #cli_task_flags(opt) ⇒ Object
- #discard_blacklisted(ds) ⇒ Object
- #download_entries(ds, p) ⇒ Object
- #impose_limit(ds) ⇒ Object
- #remote_list ⇒ Object
- #remote_list_url ⇒ Object
- #remote_row_name(r, rep, asm) ⇒ Object
- #remote_row_replicons(r) ⇒ Object
- #sanitize_cli ⇒ Object
- #save_entry(name, body, p) ⇒ Object
Instance Method Details
#cli_filters(opt) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 42

# Registers the dataset-filtering options on the option parser.
#
# Adds +--blacklist PATH+ and +--ignore-until STRING+ directly on +opt+
# (storing the values in +cli[:blacklist]+ and +cli[:ignore_until]+), and
# registers the +dry+ and +get-metadata+ flags through +cli.opt_flag+
# (the latter under the key +:get_md+).
#
# @param opt [OptionParser] parser on which the options are defined
# @return [Object] whatever the last +cli.opt_flag+ call returns
def cli_filters(opt)
  opt.on('--blacklist PATH', 'A file with dataset names to blacklist') do |path|
    cli[:blacklist] = path
  end
  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
  opt.on(
    '--ignore-until STRING',
    'Ignores all datasets until a name is found (useful for large reruns)'
  ) do |first_name|
    cli[:ignore_until] = first_name
  end
  metadata_help = 'Only download and update metadata for existing datasets'
  cli.opt_flag(opt, 'get-metadata', metadata_help, :get_md)
end
#cli_name_modifiers(opt) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 29

# Registers the options that alter how dataset names are built.
#
# +--no-version-name+ stores the parsed switch value in +cli[:add_version]+;
# the +legacy-name+ flag is registered through +cli.opt_flag+ under the key
# +:legacy_name+.
#
# @param opt [OptionParser] parser on which the options are defined
# @return [Object] whatever +cli.opt_flag+ returns
def cli_name_modifiers(opt)
  version_help = [
    'Do not add sequence version to the dataset name',
    'Only affects --complete and --chromosome'
  ]
  opt.on('--no-version-name', *version_help) do |add_version|
    cli[:add_version] = add_version
  end
  legacy_help =
    'Use dataset names based on chromosome entries instead of assembly'
  cli.opt_flag(opt, 'legacy-name', legacy_help, :legacy_name)
end
#cli_save_actions(opt) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 58

# Registers the options that control how downloaded datasets are saved.
#
# Registers the +only-metadata+ flag through +cli.opt_flag+ (key +:only_md+)
# and defines +--save-every INT+, +-q/--query+, +-u/--unlink+ and
# +-R/--remote-list PATH+ on +opt+, storing each value in +cli+ under
# +:save_every+, +:query+, +:unlink+ and +:remote_list+ respectively.
# The help text for +--save-every+ embeds the value of +cli[:save_every]+ at
# the time this method is called.
#
# @param opt [OptionParser] parser on which the options are defined
# @return [Object] the value returned by the last +opt.on+ call
def cli_save_actions(opt)
  only_md_help = 'Create datasets without input data but retrieve all metadata'
  cli.opt_flag(opt, 'only-metadata', only_md_help, :only_md)

  save_every_help = [
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ]
  opt.on('--save-every INT', Integer, *save_every_help) do |count|
    cli[:save_every] = count
  end

  opt.on(
    '-q', '--query',
    'Register the datasets as queries, not reference datasets'
  ) do |flag|
    cli[:query] = flag
  end

  opt.on(
    '-u', '--unlink',
    'Unlink all datasets in the project missing from the download list'
  ) do |flag|
    cli[:unlink] = flag
  end

  opt.on(
    '-R', '--remote-list PATH',
    'Path to an output file with the list of all datasets listed remotely'
  ) do |path|
    cli[:remote_list] = path
  end
end
#cli_task_flags(opt) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 9

# Registers the flags that select which kinds of genomes to download.
#
# The +reference+, +complete+, +chromosome+, +scaffold+ and +contig+ flags
# are registered through +cli.opt_flag+, in that order. +--all+ is defined
# on +opt+ and sets +cli[:complete]+, +cli[:chromosome]+, +cli[:scaffold]+
# and +cli[:contig]+ to +true+ (it does not touch +cli[:reference]+).
#
# @param opt [OptionParser] parser on which the options are defined
# @return [Object] the value returned by the +opt.on+ call for +--all+
def cli_task_flags(opt)
  [
    ['reference', 'Download all reference genomes (ignore any other status)'],
    ['complete', 'Download complete genomes'],
    ['chromosome', 'Download complete chromosomes'],
    ['scaffold', 'Download genomes in scaffolds'],
    ['contig', 'Download genomes in contigs']
  ].each { |flag, help| cli.opt_flag(opt, flag, help) }

  opt.on('--all', 'Download all genomes (in any status)') do
    %i[complete chromosome scaffold contig].each { |level| cli[level] = true }
  end
end
#discard_blacklisted(ds) ⇒ Object
173 174 175 176 177 178 179 180 181 182 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 173

# Removes from +ds+ every dataset named in the blacklist file.
#
# Does nothing when +cli[:blacklist]+ is +nil+. Otherwise the file is read
# line by line: lines starting with +#+ are treated as comments, surrounding
# whitespace is stripped from each name, and blank lines are ignored, so a
# stray trailing space or tab no longer makes an entry silently fail to match.
#
# @param ds [Hash] datasets keyed by name; modified in place
# @return [Hash] the same +ds+ object
def discard_blacklisted(ds)
  return ds if cli[:blacklist].nil?

  cli.say "Discarding datasets in #{cli[:blacklist]}"
  # Stream the file instead of loading every line into memory first
  File.foreach(cli[:blacklist]) do |line|
    next if line.start_with?('#')

    name = line.strip
    ds.delete(name) unless name.empty?
  end
  ds
end
#download_entries(ds, p) ⇒ Object
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 194

# Walks the download list and saves each selected entry into project +p+.
#
# While +cli[:ignore_until]+ is set, entries are skipped until that name is
# reached (the matching entry itself is processed). An entry is also skipped
# when +p.dataset(name).nil?+ equals +cli[:get_md]+. Entries that pass are
# counted, and unless +cli[:dry]+ is set they are handed to +save_entry+.
# When +cli[:save_every]+ is not 1, +p.do_not_save+ is switched on for the
# duration of the loop, the project is saved every +cli[:save_every]+ counted
# entries (only if that value is greater than 1), and once more at the end.
#
# @param ds [Hash] entries to download, keyed by dataset name
# @param p [Object] project; must respond to +dataset+, +save!+ and
#   +do_not_save=+
# @return [Array] two items: every name visited, and the count of entries
#   selected for download (counted even in dry mode)
def download_entries(ds, p)
  noun = ds.size == 1 ? 'entry' : 'entries'
  cli.say "Downloading #{ds.size} #{noun}"
  p.do_not_save = true if cli[:save_every] != 1

  skipping = !cli[:ignore_until].nil?
  visited = []
  count = 0
  ds.each do |name, body|
    visited << name
    cli.puts name
    # Stop skipping as soon as the requested name shows up
    skipping &&= name != cli[:ignore_until]
    next if skipping
    next if p.dataset(name).nil? == cli[:get_md]

    count += 1
    next if cli[:dry]

    save_entry(name, body, p)
    p.save! if cli[:save_every] > 1 && (count % cli[:save_every]).zero?
  end

  p.do_not_save = false
  p.save! if cli[:save_every] != 1
  [visited, count]
end
#impose_limit(ds) ⇒ Object
184 185 186 187 188 189 190 191 192 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 184

# Randomly subsamples +ds+ down to at most +cli[:max_datasets]+ entries.
#
# The limit is read with +to_i+, so +nil+ counts as zero. A limit that is
# zero, negative, or not smaller than the current size leaves +ds+
# untouched (a negative value previously reached +Array#sample+ and raised
# ArgumentError). Membership of the sampled names is checked through a hash
# lookup rather than +Array#include?+, keeping the filtering step linear in
# the size of +ds+.
#
# @param ds [Hash] datasets keyed by name; modified in place
# @return [Hash] the same +ds+ object
def impose_limit(ds)
  max = cli[:max_datasets].to_i
  return ds unless max > 0 && max < ds.size

  cli.say "Subsampling list from #{ds.size} to #{max} datasets"
  keep = ds.keys.sample(max).each_with_object({}) { |k, h| h[k] = true }
  ds.select! { |k, _| keep.key?(k) }
  ds
end
#remote_list ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 94

# Downloads the remote genome listing and turns it into a download table.
#
# The document fetched from +remote_list_url+ is parsed as CSV with headers.
# Rows whose +assembly+ is missing, empty or +-+, and rows without
# +ftp_path_genbank+, are skipped. Each remaining row is registered under
# the name given by +remote_row_name+, pointing at the
# +<ftp_path>/<basename>_genomic.fna.gz+ file.
#
# @return [Hash] dataset name => Hash with keys +:ids+, +:db+, +:universe+
#   and +:md+ (metadata: +:type+, +:ncbi_asm+, +:strain+, plus
#   +:ncbi_nuccore+ and +:release_date+ when available)
def remote_list
  cli.say 'Downloading genome list'
  listing = MiGA::RemoteDataset.download_url(remote_list_url)
  CSV.parse(listing, headers: true).each_with_object({}) do |row, datasets|
    assembly = row['assembly']
    next if assembly.nil? || assembly.empty? || assembly == '-'

    ftp_path = row['ftp_path_genbank']
    next unless ftp_path

    replicons = remote_row_replicons(row)
    name = remote_row_name(row, replicons, assembly)

    # Metadata keys are added in a fixed order: type, ncbi_asm, strain,
    # then the optional ncbi_nuccore and release_date
    metadata = { type: :genome, ncbi_asm: assembly, strain: row['strain'] }
    metadata[:ncbi_nuccore] = replicons.join(',') unless replicons.nil?
    released = row['release_date']
    metadata[:release_date] = Time.parse(released).to_s unless released.nil?

    # Register for download
    datasets[name] = {
      ids: ["#{ftp_path}/#{File.basename(ftp_path)}_genomic.fna.gz"],
      db: :assembly_gz,
      universe: :web,
      md: metadata
    }
  end
end
#remote_list_url ⇒ Object
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 145

# Builds the URL used to request the genome listing from NCBI.
#
# The query matches prokaryotic genome assemblies for +cli[:taxon]+ (any
# double quotes in the taxon are replaced by single quotes). With
# +cli[:reference]+ it is restricted to the "representative" RefSeq
# category; otherwise it is restricted to the assembly levels whose flags
# (+:complete+, +:chromosome+, +:scaffold+, +:contig+) are set in +cli+.
#
# @return [String] the base URL followed by the form-encoded parameters
#   +q+, +fields+ and +nolimit+
def remote_list_url
  taxon = cli[:taxon]&.tr('"', "'")
  query = '[display()].' \
          'from(GenomeAssemblies).' \
          'usingschema(/schema/GenomeAssemblies).' \
          "matching(tab==[\"Prokaryotes\"] and q==\"#{taxon}\""

  if cli[:reference]
    query += ' and refseq_category==["representative"]'
  else
    levels = {
      complete: 'Complete',
      chromosome: ' Chromosome', # <- The leading space is *VERY* important!
      scaffold: 'Scaffold',
      contig: 'Contig'
    }
    requested = levels.select { |flag, _| cli[flag] }.values
    quoted = requested.map { |level| "\"#{level}\"" }
    query += " and level==[#{quoted.join(',')}]"
  end
  query += ')'

  params = {
    q: query,
    fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
            'level|level,ftp_path_genbank|ftp_path_genbank,' \
            'release_date|release_date,strain|strain',
    nolimit: 'on'
  }
  'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?' +
    URI.encode_www_form(params)
end
#remote_row_name(r, rep, asm) ⇒ Object
133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 133

# Derives the dataset name for one row of the remote listing.
#
# With both +cli[:legacy_name]+ and +cli[:reference]+ the name is just the
# organism. Otherwise the organism is joined with an accession: the first
# replicon when +cli[:legacy_name]+ is set and the row level is +Complete+
# or + Chromosome+ (empty if +rep+ is nil), or the assembly accession in
# every other case. Unless +cli[:add_version]+ is set, a trailing +.N+
# version suffix is dropped from the accession.
#
# The suffix is removed on a copy: +asm+ and the strings in +rep+ are never
# modified, so callers that reuse them afterwards (e.g. to record the
# assembly accession) still see the full, versioned value. This also makes
# the method safe for frozen strings and for an empty +rep+.
#
# @param r [#[]] row of the listing; read at +#organism+ and +level+
# @param rep [Array<String>, nil] replicon accessions of the row
# @param asm [String] assembly accession of the row
# @return [Object] result of calling +miga_name+ on the assembled string
def remote_row_name(r, rep, asm)
  return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]

  acc =
    if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
      rep.nil? ? '' : rep.first
    else
      asm
    end
  # Non-destructive on purpose: +acc+ aliases the caller's own string
  acc = acc.to_s.sub(/\.\d+\Z/, '') unless cli[:add_version]
  "#{r['#organism']}_#{acc}".miga_name
end
#remote_row_replicons(r) ⇒ Object
124 125 126 127 128 129 130 131 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 124

# Extracts the replicon accessions from the +replicons+ field of a row.
#
# The field is split on +"; "+; from each item everything up to and
# including the last colon is removed, and then everything from the first
# slash onwards.
#
# @param r [#[]] row of the listing; read at +replicons+
# @return [Array<String>, nil] the accessions, or +nil+ when the field is nil
def remote_row_replicons(r)
  field = r['replicons']
  return if field.nil?

  field.split('; ').map do |entry|
    without_label = entry.gsub(/.*:/, '')
    without_label.gsub(%r{/.*}, '')
  end
end
#sanitize_cli ⇒ Object
84 85 86 87 88 89 90 91 92 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 84

# Validates the parsed options before any download starts.
#
# Calls +cli.ensure_par(taxon: '-T')+, requires at least one of the
# +:reference+, +:complete+, +:chromosome+, +:scaffold+ or +:contig+
# options to be set, and forces +cli[:save_every]+ to 1 when +cli[:dry]+
# is set.
#
# @raise [RuntimeError] if none of the genome-type options is set
# @return [Integer, nil] 1 when running dry, +nil+ otherwise
def sanitize_cli
  cli.ensure_par(taxon: '-T')

  task_keys = %i[reference complete chromosome scaffold contig]
  requested = task_keys.any? { |key| cli[key] }
  raise 'No action requested: pick at least one type of genome' unless requested

  cli[:save_every] = 1 if cli[:dry]
end
#save_entry(name, body, p) ⇒ Object
217 218 219 220 221 222 223 224 225 226 227 228 229 |
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 217 def save_entry(name, body, p) cli.say ' Locating remote dataset' body[:md][:metadata_only] = true if cli[:only_md] rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe]) if cli[:get_md] cli.say ' Updating dataset' rd.(p.dataset(name), body[:md]) else cli.say ' Creating dataset' rd.save_to(p, name, !cli[:query], body[:md]) cli.(p.add_dataset(name)) end end |