Module: MiGA::Cli::Action::NcbiGet::Downloads

Included in:
MiGA::Cli::Action::NcbiGet
Defined in:
lib/miga/cli/action/ncbi_get/downloads.rb

Overview

Helper module including download functions for the ncbi_get action

Instance Method Summary collapse

Instance Method Details

#cli_filters(opt) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 42

##
# Registers on +opt+ the options that filter which datasets are processed:
# +--blacklist+, +--dry+, +--ignore-until+, and +--get-metadata+.
#
# Values are stored in +cli+ under +:blacklist+, +:ignore_until+, and (through
# +cli.opt_flag+) the +dry+ and +:get_md+ flags.
def cli_filters(opt)
  opt.on('--blacklist PATH', 'A file with dataset names to blacklist') do |path|
    cli[:blacklist] = path
  end
  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')

  ignore_help =
    'Ignores all datasets until a name is found (useful for large reruns)'
  opt.on('--ignore-until STRING', ignore_help) do |dataset_name|
    cli[:ignore_until] = dataset_name
  end

  metadata_help = 'Only download and update metadata for existing datasets'
  cli.opt_flag(opt, 'get-metadata', metadata_help, :get_md)
end

#cli_name_modifiers(opt) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 29

##
# Registers on +opt+ the options that alter how dataset names are built:
# +--no-version-name+ (stored in +cli[:add_version]+) and +--legacy-name+
# (registered through +cli.opt_flag+ as +:legacy_name+).
def cli_name_modifiers(opt)
  version_help = [
    'Do not add sequence version to the dataset name',
    'Only affects --complete and --chromosome'
  ]
  opt.on('--no-version-name', *version_help) do |flag|
    cli[:add_version] = flag
  end

  legacy_help =
    'Use dataset names based on chromosome entries instead of assembly'
  cli.opt_flag(opt, 'legacy-name', legacy_help, :legacy_name)
end

#cli_save_actions(opt) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 58

##
# Registers on +opt+ the options that control how downloaded datasets are
# saved: +--only-metadata+, +--save-every+, +--query+, +--unlink+, and
# +--remote-list+. Parsed values are stored in +cli+ under +:only_md+ (via
# +cli.opt_flag+), +:save_every+, +:query+, +:unlink+, and +:remote_list+.
#
# The current value of +cli[:save_every]+ is embedded in the help text at the
# time this method is called.
def cli_save_actions(opt)
  only_md_help = 'Create datasets without input data but retrieve all metadata'
  cli.opt_flag(opt, 'only-metadata', only_md_help, :only_md)

  save_every_help = [
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ]
  opt.on('--save-every INT', Integer, *save_every_help) do |count|
    cli[:save_every] = count
  end

  query_help = 'Register the datasets as queries, not reference datasets'
  opt.on('-q', '--query', query_help) do |flag|
    cli[:query] = flag
  end

  unlink_help =
    'Unlink all datasets in the project missing from the download list'
  opt.on('-u', '--unlink', unlink_help) do |flag|
    cli[:unlink] = flag
  end

  list_help =
    'Path to an output file with the list of all datasets listed remotely'
  opt.on('-R', '--remote-list PATH', list_help) do |path|
    cli[:remote_list] = path
  end
end

#cli_task_flags(opt) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 9

##
# Registers on +opt+ the flags that select which genome statuses to download
# (+reference+, +complete+, +chromosome+, +scaffold+, +contig+), plus +--all+,
# which switches on the four assembly-level flags at once.
def cli_task_flags(opt)
  cli.opt_flag(
    opt, 'reference', 'Download all reference genomes (ignore any other status)'
  )

  # Registration order is kept: complete, chromosome, scaffold, contig
  [
    ['complete', 'Download complete genomes'],
    ['chromosome', 'Download complete chromosomes'],
    ['scaffold', 'Download genomes in scaffolds'],
    ['contig', 'Download genomes in contigs']
  ].each { |flag, help| cli.opt_flag(opt, flag, help) }

  opt.on('--all', 'Download all genomes (in any status)') do
    %i[complete chromosome scaffold contig].each { |level| cli[level] = true }
  end
end

#discard_blacklisted(ds) ⇒ Object



173
174
175
176
177
178
179
180
181
182
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 173

##
# Removes from the Hash +ds+ every key listed in the file at
# +cli[:blacklist]+, one name per line. Lines starting with +#+ are skipped.
# Does nothing when no blacklist is set.
#
# +ds+ is modified in place and also returned.
def discard_blacklisted(ds)
  blacklist_path = cli[:blacklist]
  return ds if blacklist_path.nil?

  cli.say "Discarding datasets in #{blacklist_path}"
  File.readlines(blacklist_path).each do |line|
    next if line =~ /^#/

    ds.delete(line.chomp)
  end
  ds
end

#download_entries(ds, p) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 194

##
# Walks the entries of the Hash +ds+ (name => body) and saves each eligible
# one into project +p+ via +save_entry+.
#
# An entry is skipped while the +cli[:ignore_until]+ name has not yet been
# seen, or when its presence in +p+ does not match the mode: with
# +cli[:get_md]+ only datasets already in +p+ are processed, without it only
# datasets not yet in +p+. With +cli[:dry]+, eligible entries are counted but
# not saved.
#
# Unless +cli[:save_every]+ is 1, +p.do_not_save+ is set during the loop and
# the project is saved explicitly: every +cli[:save_every]+ processed entries
# (when greater than 1) and once at the end.
#
# Returns an Array with two elements: the list of all names visited, and the
# number of eligible entries.
def download_entries(ds, p)
  noun = ds.size == 1 ? 'entry' : 'entries'
  cli.say "Downloading #{ds.size} #{noun}"

  deferred_saving = cli[:save_every] != 1
  p.do_not_save = true if deferred_saving

  skipping = !cli[:ignore_until].nil?
  visited = []
  processed = 0
  ds.each do |name, body|
    visited << name
    cli.puts name

    # Stop skipping once the requested name shows up (that entry is included)
    skipping = false if skipping && name == cli[:ignore_until]
    next if skipping
    # Absent datasets are skipped with get_md, present ones without it
    next if p.dataset(name).nil? == cli[:get_md]

    processed += 1
    next if cli[:dry]

    save_entry(name, body, p)
    every = cli[:save_every]
    p.save! if every > 1 && (processed % every).zero?
  end

  p.do_not_save = false
  p.save! if deferred_saving
  [visited, processed]
end

#impose_limit(ds) ⇒ Object



184
185
186
187
188
189
190
191
192
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 184

##
# Randomly subsamples the Hash +ds+ down to at most +cli[:max_datasets]+
# entries. A missing, zero, or negative limit means no limit.
#
# +ds+ is modified in place (surviving entries keep their relative order) and
# also returned.
def impose_limit(ds)
  max = cli[:max_datasets].to_i
  if max.positive? && max < ds.size
    cli.say "Subsampling list from #{ds.size} to #{max} datasets"
    kept = ds.keys.sample(max)
    # Array#- is hash-based, avoiding a linear scan of the sample per entry
    (ds.keys - kept).each { |name| ds.delete(name) }
  end
  ds
end

#remote_list ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 94

##
# Downloads the remote genome list from the URL given by +remote_list_url+,
# parses it as CSV with headers, and returns a Hash of download definitions
# keyed by dataset name (as built by +remote_row_name+).
#
# Rows with a missing, empty, or '-' assembly, or without a GenBank FTP path,
# are skipped. Each value has the keys +:ids+ (URL of the gzipped genomic
# FastA), +:db+, +:universe+, and +:md+ (metadata).
def remote_list
  cli.say 'Downloading genome list'
  listing = MiGA::RemoteDataset.download_url(remote_list_url)

  CSV.parse(listing, headers: true).each_with_object({}) do |row, entries|
    asm = row['assembly']
    ftp = row['ftp_path_genbank']
    next if asm.nil? || asm.empty? || asm == '-' || !ftp

    replicons = remote_row_replicons(row)
    name = remote_row_name(row, replicons, asm)

    # Metadata keys are inserted in a fixed order
    metadata = { type: :genome, ncbi_asm: asm, strain: row['strain'] }
    metadata[:ncbi_nuccore] = replicons.join(',') unless replicons.nil?
    released = row['release_date']
    metadata[:release_date] = Time.parse(released).to_s unless released.nil?

    # Register for download
    entries[name] = {
      ids: ["#{ftp}/#{File.basename(ftp)}_genomic.fna.gz"],
      db: :assembly_gz, universe: :web, md: metadata
    }
  end
end

#remote_list_url ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 145

##
# Builds the URL used to request the remote genome list for +cli[:taxon]+.
#
# Double quotes in the taxon are replaced by single quotes before it is
# embedded in the query. With +cli[:reference]+ the query is restricted to
# the "representative" RefSeq category; otherwise it is restricted to the
# assembly levels whose flags (+:complete+, +:chromosome+, +:scaffold+,
# +:contig+) are set in +cli+.
#
# Returns the URL as a String.
def remote_list_url
  taxon = cli[:taxon]&.tr('"', "'")
  conditions = ['tab==["Prokaryotes"]', "q==\"#{taxon}\""]

  if cli[:reference]
    conditions << 'refseq_category==["representative"]'
  else
    level_names = {
      complete: 'Complete',
      chromosome: ' Chromosome', # <- The leading space is *VERY* important!
      scaffold: 'Scaffold',
      contig: 'Contig'
    }
    requested = level_names.select { |flag, _| cli[flag] }.values
    quoted = requested.map { |level| "\"#{level}\"" }
    conditions << "level==[#{quoted.join(',')}]"
  end

  query = '[display()].from(GenomeAssemblies).' \
          'usingschema(/schema/GenomeAssemblies).' \
          "matching(#{conditions.join(' and ')})"
  columns = %w[
    organism assembly replicons level ftp_path_genbank release_date strain
  ]
  params = {
    q: query,
    fields: columns.map { |col| "#{col}|#{col}" }.join(','),
    nolimit: 'on'
  }
  "https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?#{URI.encode_www_form(params)}"
end

#remote_row_name(r, rep, asm) ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 133

##
# Builds the dataset name for the remote list row +r+.
#
# - +r+: row with the keys '#organism' and 'level'.
# - +rep+: Array of replicon accessions for the row, or +nil+.
# - +asm+: assembly accession of the row.
#
# With both +cli[:legacy_name]+ and +cli[:reference]+ the name is the
# organism alone. Otherwise the organism is followed by an accession: the
# first replicon when +cli[:legacy_name]+ is set and the level is 'Complete'
# or ' Chromosome', the assembly in any other case. The trailing version
# (e.g. '.1') is dropped from the accession unless +cli[:add_version]+.
#
# Neither +asm+ nor the elements of +rep+ are modified.
def remote_row_name(r, rep, asm)
  return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]

  acc =
    if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
      # Empty accession when there are no replicons (nil or empty list)
      Array(rep).first.to_s
    else
      asm
    end
  # Non-destructive on purpose: +acc+ is the caller's own String, which the
  # caller may still use with the version included
  acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
  "#{r['#organism']}_#{acc}".miga_name
end

#remote_row_replicons(r) ⇒ Object



124
125
126
127
128
129
130
131
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 124

##
# Extracts the replicon accessions from the 'replicons' field of the row +r+.
#
# The field is split on '; '. In each piece, everything up to the last colon
# and everything from the first slash onwards is removed.
#
# Returns an Array of Strings, or +nil+ if the field is +nil+.
def remote_row_replicons(r)
  field = r['replicons']
  return if field.nil?

  field.split('; ').map do |replicon|
    replicon.gsub(/.*:/, '').gsub(%r{/.*}, '')
  end
end

#sanitize_cli ⇒ Object



84
85
86
87
88
89
90
91
92
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 84

##
# Validates the parsed options: asks +cli+ to ensure that the taxon parameter
# is present, requires at least one genome status flag, and sets
# +cli[:save_every]+ to 1 on dry runs.
#
# Raises a RuntimeError if none of +:reference+, +:complete+, +:chromosome+,
# +:scaffold+, or +:contig+ is set.
def sanitize_cli
  cli.ensure_par(taxon: '-T')

  statuses = %i[reference complete chromosome scaffold contig]
  if statuses.none? { |status| cli[status] }
    raise 'No action requested: pick at least one type of genome'
  end

  cli[:save_every] = 1 if cli[:dry]
end

#save_entry(name, body, p) ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/miga/cli/action/ncbi_get/downloads.rb', line 217

##
# Saves the remote entry +body+ as dataset +name+ in project +p+, or updates
# the existing dataset when +cli[:get_md]+ is set.
#
# +body+ is a Hash with the keys +:ids+, +:db+, +:universe+, and +:md+. With
# +cli[:only_md]+, +body[:md]+ is modified in place to flag the dataset as
# metadata-only.
def save_entry(name, body, p)
  cli.say '  Locating remote dataset'
  body[:md][:metadata_only] = true if cli[:only_md]
  rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
  if cli[:get_md]
    cli.say '  Updating dataset'
    # NOTE(review): `rd.(...)` is Ruby shorthand for `rd.call(...)`. A method
    # name appears to be missing between the dot and the parenthesis
    # (presumably a metadata-update method) -- TODO confirm against the
    # original source file
    rd.(p.dataset(name), body[:md])
  else
    cli.say '  Creating dataset'
    # The third argument is true unless the --query option was given
    rd.save_to(p, name, !cli[:query], body[:md])
    # NOTE(review): as above, `cli.(...)` resolves to `cli.call(...)`; the
    # intended method name seems to have been lost -- TODO confirm
    cli.(p.add_dataset(name))
  end
end