Class: MiGA::RemoteDataset

Inherits:
MiGA
  • Object
show all
Defined in:
lib/miga/remote_dataset.rb

Overview

MiGA representation of datasets with data in remote locations.

Constant Summary collapse

@@_EUTILS =

Class-level

"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
@@UNIVERSE =
# Registry of supported universes. Each entry lists its databases (+dbs+),
# the +url+ pattern filled in with sprintf, and the download +method+.
{
  # +web+: the pattern is just "%2$s", so the joined IDs are used verbatim
  # as the URL to fetch.
  web:{
    dbs: {
      assembly:{stage: :assembly, format: :fasta},
      assembly_gz:{stage: :assembly, format: :fasta_gz}
    },
    url: "%2$s",
    method: :net
  },
  # +ebi+: dbfetch endpoint; path segments are database, IDs and format.
  ebi:{
    dbs: { embl:{stage: :assembly, format: :fasta} },
    url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
    method: :rest
  },
  # +ncbi+: E-utilities efetch; database, IDs and rettype are query
  # parameters.
  ncbi:{
    dbs: { nuccore:{stage: :assembly, format: :fasta} },
    url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
    method: :rest
  },
  # +ncbi_map+: E-utilities elink. The only entry declaring +map_to+ and
  # +map_to_universe+.
  ncbi_map:{
    dbs: { assembly:{map_to: :nuccore, format: :text} },
      # FIXME ncbi_map is intended to do internal NCBI mapping between
      # databases.
    url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
    method: :rest,
    map_to_universe: :ncbi
  }
}

Constants included from MiGA

CITATION, VERSION, VERSION_DATE, VERSION_NAME

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from MiGA

CITATION, DEBUG, DEBUG_OFF, DEBUG_ON, DEBUG_TRACE_OFF, DEBUG_TRACE_ON, FULL_VERSION, LONG_VERSION, VERSION, VERSION_DATE, clean_fasta_file, initialized?, #result_files_exist?, root_path, script_path, seqs_length, tabulate

Constructor Details

#initialize(ids, db, universe) ⇒ RemoteDataset

Initialize MiGA::RemoteDataset with ids in database db from universe.



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/miga/remote_dataset.rb', line 119

# Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
#
# +ids+ may be a single ID or an Array of IDs; a single ID is wrapped in an
# Array. +db+ and +universe+ are converted to Symbol.
#
# Raises RuntimeError if +universe+ is not a key of +@@UNIVERSE+, or if +db+
# is not one of the databases registered for that universe.
def initialize(ids, db, universe)
  @ids = ids.is_a?(Array) ? ids : [ids]
  @db = db.to_sym
  @universe = universe.to_sym
  unless @@UNIVERSE.key? @universe
    raise "Unknown Universe: #{@universe}. Try one of: #{@@UNIVERSE.keys}"
  end
  dbs = @@UNIVERSE[@universe][:dbs]
  # List only the database names, mirroring the universe message above,
  # rather than dumping every database's property Hash.
  unless dbs.key? @db
    raise "Unknown Database: #{@db}. Try one of: #{dbs.keys}"
  end
  # FIXME Part of the +map_to+ support:
  #unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
  #  MiGA::RemoteDataset.download
  #end
end

Instance Attribute Details

#dbObject (readonly)

Database storing the dataset.



113
114
115
# File 'lib/miga/remote_dataset.rb', line 113

# Database storing the dataset (read-only accessor for +@db+).
def db ; @db ; end

#idsObject (readonly)

IDs of the entries composing the dataset.



115
116
117
# File 'lib/miga/remote_dataset.rb', line 115

# IDs of the entries composing the dataset (read-only accessor for +@ids+).
def ids ; @ids ; end

#universeObject (readonly)

Universe of the dataset.



111
112
113
# File 'lib/miga/remote_dataset.rb', line 111

# Universe of the dataset (read-only accessor for +@universe+).
def universe ; @universe ; end

Class Method Details

.download(universe, db, ids, format, file = nil) ⇒ Object

Download data from the universe in the database db with IDs ids and in format. If passed, it saves the result in file. Returns String.



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/miga/remote_dataset.rb', line 57

# Download data from the +universe+ in the database +db+ with IDs +ids+ and
# in +format+. If +file+ is passed, the result is also written to it.
#
# +ids+ may be a single ID or an Array; a single ID is wrapped in an Array.
# The transfer is delegated to +download_rest+ or +download_net+ depending
# on the universe's +:method+. Returns the downloaded document, or +nil+ if
# the universe declares a method other than +:rest+ or +:net+.
def self.download(universe, db, ids, format, file=nil)
  ids = [ids] unless ids.is_a? Array
  case @@UNIVERSE[universe][:method]
  when :rest
    doc = download_rest(universe, db, ids, format)
  when :net
    doc = download_net(universe, db, ids, format)
  end
  # Block form guarantees the handle is closed even if the write raises.
  File.open(file, "w") { |ofh| ofh.print doc } unless file.nil?
  doc
end

.download_net(universe, db, ids, format) ⇒ Object

Download data using a GET request from the universe in the database db with IDs ids and in format. Returns the doc as String.



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/miga/remote_dataset.rb', line 90

# Download data using a GET request from the +universe+ in the database +db+
# with IDs +ids+ (an Array) and in +format+. Returns the doc as String.
#
# A Net::ReadTimeout is retried up to three times; the fourth consecutive
# timeout is re-raised to the caller.
def self.download_net(universe, db, ids, format)
  u = @@UNIVERSE[universe]
  map_to = u[:dbs][db].nil? ? nil : u[:dbs][db][:map_to]
  url = sprintf(u[:url], db, ids.join(","), format, map_to)
  doc = ""
  # Keep the retry counter local: an instance variable here would live on
  # the class itself and be shared by every call.
  timeout_try = 0
  begin
    # NOTE(review): opening a URL through Kernel#open depends on open-uri's
    # patch, which Ruby 3.0 removed -- confirm the supported Ruby versions
    # or switch to URI.open.
    open(url) { |f| doc = f.read }
  rescue Net::ReadTimeout
    timeout_try += 1
    # Bare raise re-raises the original exception, keeping its backtrace.
    raise if timeout_try > 3
    retry
  end
  doc
end

.download_rest(universe, db, ids, format) ⇒ Object

Download data using a REST method from the universe in the database db with IDs ids and in format. Returns the doc as String.



76
77
78
79
80
81
82
83
84
85
# File 'lib/miga/remote_dataset.rb', line 76

# Download data using a REST method from the +universe+ in the database +db+
# with IDs +ids+ (an Array) and in +format+. Returns the doc as String.
#
# Raises RuntimeError if the response code is anything other than 200.
def self.download_rest(universe, db, ids, format)
  settings = @@UNIVERSE[universe]
  db_props = settings[:dbs][db]
  target = db_props.nil? ? nil : db_props[:map_to]
  url = sprintf(settings[:url], db, ids.join(","), format, target)
  reply = RestClient::Request.execute(method: :get, url: url, timeout: 600)
  if reply.code != 200
    raise "Unable to reach #{universe} client, error code #{reply.code}."
  end
  reply.to_s
end

.UNIVERSEObject

Structure of the different database Universes or containers. The structure is a Hash with universe names as keys as Symbol and values being a Hash with supported keys as Symbol:

  • :dbs => Hash with keys being the database name and the values a Hash of properties such as stage, format, and map_to.

  • url => Pattern of the URL where the data can be obtained, where %1$s is the name of the database, %2$s is the IDs, and %3$s is format.

  • method => Method used to query the URL. Supported values are :rest and :net.

  • map_to_universe => Universe where results map to. Currently unsupported.



24
# File 'lib/miga/remote_dataset.rb', line 24

# Class-level accessor for the Hash describing the supported universes.
def self.UNIVERSE
  @@UNIVERSE
end

Instance Method Details

#download(file) ⇒ Object

Download data into file.



189
190
191
192
# File 'lib/miga/remote_dataset.rb', line 189

# Download data into +file+, using the format registered for this dataset's
# database in its universe. Returns whatever the class-level +download+
# returns.
def download(file)
  fmt = @@UNIVERSE[universe][:dbs][db][:format]
  MiGA::RemoteDataset.download(universe, db, ids, fmt, file)
end

#get_metadata(metadata = {}) ⇒ Object

Get metadata from the remote location.



178
179
180
181
182
183
184
185
# File 'lib/miga/remote_dataset.rb', line 178

# Get metadata from the remote location.
#
# +metadata+ is a Hash that is updated in place and returned. For the +:ebi+
# and +:ncbi+ universes the +:tax+ key is set to the result of
# +get_ncbi_taxonomy+; for any other universe the Hash is returned untouched.
def get_metadata(metadata={})
  case universe
  when :ebi, :ncbi
    # Get taxonomy
    metadata[:tax] = get_ncbi_taxonomy
  end
  metadata
end

#get_ncbi_taxidObject

Get NCBI Taxonomy ID.



196
197
198
# File 'lib/miga/remote_dataset.rb', line 196

# Get NCBI Taxonomy ID by dispatching to the universe-specific method
# named +get_ncbi_taxid_from_<universe>+.
def get_ncbi_taxid
  resolver = "get_ncbi_taxid_from_#{universe}"
  send(resolver)
end

#get_ncbi_taxonomyObject

Get NCBI taxonomy as MiGA::Taxonomy.



202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/miga/remote_dataset.rb', line 202

# Get NCBI taxonomy as MiGA::Taxonomy.
#
# Starting from +get_ncbi_taxid+, each taxonomy entry is downloaded from the
# +:ebi+ universe (+:taxonomy+ database) and the chain of PARENT ID fields is
# followed until it is missing, "0" or "1". The first entry is recorded under
# the rank "dataset" when its own rank is "no rank".
def get_ncbi_taxonomy
  ranks = {}
  current = get_ncbi_taxid
  until current.nil? || %w{0 1}.include?(current)
    entry = MiGA::RemoteDataset.download(:ebi, :taxonomy, current, "")
    sci_name = entry[/SCIENTIFIC NAME\s+:\s+(.+)/, 1]
    level = entry[/RANK\s+:\s+(.+)/, 1]
    level = "dataset" if ranks.empty? && level == "no rank"
    ranks[level] = sci_name unless level.nil?
    current = entry[/PARENT ID\s+:\s+(.+)/, 1]
  end
  MiGA::Taxonomy.new(ranks)
end

#save_to(project, name = nil, is_ref = true, metadata = {}) ⇒ Object

Save dataset to the MiGA::Project project identified with name. is_ref indicates if it should be a reference dataset, and metadata is a Hash of additional metadata passed on to the new dataset.



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/miga/remote_dataset.rb', line 138

# Save dataset to the MiGA::Project +project+ identified with +name+.
#
# +project+ may be a MiGA::Project or a String, which is passed to
# MiGA::Project.new. +name+ defaults to the IDs joined by "_" and passed
# through +miga_name+. +is_ref+ indicates if it should be a reference
# dataset. +metadata+ is a Hash that is completed by +get_metadata+, tagged
# with the source IDs under the key "<universe>_<db>", and handed to the new
# MiGA::Dataset.
#
# Returns the new MiGA::Dataset. Raises RuntimeError if the dataset already
# exists in the project, if the database's stage is not +:assembly+, or if
# the seed result could not be added.
def save_to(project, name=nil, is_ref=true, metadata={})
  name ||= ids.join("_").miga_name
  project = MiGA::Project.new(project) if project.is_a? String
  if MiGA::Dataset.exist?(project, name)
    raise "Dataset #{name} exists in the project, aborting..."
  end
  metadata = get_metadata(metadata)
  udb = @@UNIVERSE[universe][:dbs][db]
  metadata["#{universe}_#{db}"] = ids.join(",")
  case udb[:stage]
  when :assembly
    dir = MiGA::Dataset.RESULT_DIRS[:assembly]
    base = "#{project.path}/data/#{dir}/#{name}"
    l_ctg = "#{base}.LargeContigs.fna"
    a_ctg = "#{base}.AllContigs.fna"
    # The .start/.done files bracket the download with timestamps.
    File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
    if udb[:format] == :fasta_gz
      download "#{l_ctg}.gz"
      # Argument-list form bypasses the shell, so quotes or other special
      # characters in the path cannot break or alter the command.
      system "gzip", "-d", "#{l_ctg}.gz"
    else
      download l_ctg
    end
    # AllContigs is a relative symlink to LargeContigs in the same folder.
    File.unlink(a_ctg) if File.exist? a_ctg
    File.symlink(File.basename(l_ctg), a_ctg)
    File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
  else
    raise "Unexpected error: Unsupported result for database #{db}."
  end
  dataset = MiGA::Dataset.new(project, name, is_ref, metadata)
  project.add_dataset(dataset.name)
  result = dataset.add_result(udb[:stage], true, is_clean:true)
  raise "Empty dataset created: seed result was not added due to " +
    "incomplete files." if result.nil?
  result.clean!
  result.save
  dataset
end