Class: MiGA::RemoteDataset

Inherits:
MiGA
  • Object
show all
Defined in:
lib/miga/remote_dataset.rb

Overview

MiGA representation of datasets with data in remote locations.

Constant Summary collapse

# Registry of the supported remote universes, keyed by universe name (Symbol).
# Each universe is a Hash with the keys:
# - :dbs: databases available in the universe, each mapped to a Hash of
#   properties such as :stage, :format, and :map_to.
# - :url: sprintf pattern of the download URL, where %1$s is the database
#   name, %2$s is the comma-joined IDs, and %3$s is the format.
# - :method: how the URL is queried (:rest or :net).
# - :map_to_universe: universe where results map to (see FIXME below).
@@UNIVERSE =
{
  # Generic web locations: the URL pattern expands to the IDs (%2$s) alone.
  web:{
    dbs: {
      assembly:{stage: :assembly, format: :fasta},
      assembly_gz:{stage: :assembly, format: :fasta_gz}
    },
    url: "%2$s",
    method: :net
  },
  # EBI dbfetch endpoint, queried as database/IDs/format.
  ebi:{
    dbs: { embl:{stage: :assembly, format: :fasta} },
    url: "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
    method: :rest
  },
  # NCBI E-utilities efetch endpoint, returning text in the requested rettype.
  ncbi:{
    dbs: { nuccore:{stage: :assembly, format: :fasta} },
    url: "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +
      "efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
    method: :rest
  },
  # NCBI E-utilities elink endpoint; unfinished, see the FIXME in the URL.
  ncbi_map:{
    dbs: { assembly:{map_to: :nuccore, format: :text} },
    url: "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +
      # FIXME ncbi_map is intended to do internal NCBI mapping between
      # databases.
      "elink.fcgi?dbfrom=%1$s&id=%2$s&db=%3$s - - - - -",
    method: :rest,
    map_to_universe: :ncbi
  }
}

Constants included from MiGA

CITATION, VERSION, VERSION_DATE, VERSION_NAME

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from MiGA

CITATION, DEBUG, DEBUG_OFF, DEBUG_ON, DEBUG_TRACE_OFF, DEBUG_TRACE_ON, FULL_VERSION, LONG_VERSION, VERSION, VERSION_DATE, initialized?, #result_files_exist?, root_path, tabulate

Constructor Details

#initialize(ids, db, universe) ⇒ RemoteDataset

Initialize MiGA::RemoteDataset with ids in database db from universe.



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/miga/remote_dataset.rb', line 97

# Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
#
# +ids+ may be a single ID or an Array of IDs; a single ID is wrapped into an
# Array. +db+ and +universe+ are converted to Symbol with +to_sym+.
#
# Raises RuntimeError if +universe+ is not a key of @@UNIVERSE, or if +db+ is
# not one of the databases registered for that universe.
def initialize(ids, db, universe)
  @ids = ids.is_a?(Array) ? ids : [ids]
  @db = db.to_sym
  @universe = universe.to_sym
  unless @@UNIVERSE.key? @universe
    raise "Unknown Universe: #{@universe}. Try one of: #{@@UNIVERSE.keys}"
  end
  dbs = @@UNIVERSE[@universe][:dbs]
  # List only the database names, mirroring the universe message above.
  unless dbs.key? @db
    raise "Unknown Database: #{@db}. Try one of: #{dbs.keys}"
  end
  # FIXME Part of the +map_to+ support:
  #unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
  #  MiGA::RemoteDataset.download
  #end
end

Instance Attribute Details

#db ⇒ Object (readonly)

Database storing the dataset.



91
92
93
# File 'lib/miga/remote_dataset.rb', line 91

# Reader for the database storing the dataset, as set by the constructor.
def db; @db; end

#ids ⇒ Object (readonly)

IDs of the entries composing the dataset.



93
94
95
# File 'lib/miga/remote_dataset.rb', line 93

# Reader for the IDs of the entries composing the dataset.
def ids; @ids; end

#universe ⇒ Object (readonly)

Universe of the dataset.



89
90
91
# File 'lib/miga/remote_dataset.rb', line 89

# Reader for the universe of the dataset, as set by the constructor.
def universe; @universe; end

Class Method Details

.download(universe, db, ids, format, file = nil) ⇒ Object

Download data from the universe in the database db with IDs ids and in format. If passed, it saves the result in file. Returns String.



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/miga/remote_dataset.rb', line 58

# Download data from the +universe+ in the database +db+ with IDs +ids+ and
# in +format+. If passed, it saves the result in +file+. Returns String.
#
# +ids+ may be a single ID or an Array; multiple IDs are joined with commas
# in the URL. +db+ does not need to be registered in the universe: when it is
# not, no +map_to+ value is passed to the URL pattern.
#
# Raises RuntimeError if +universe+ is not a key of @@UNIVERSE, or if a :rest
# request answers with a code other than 200.
def self.download(universe, db, ids, format, file=nil)
  ids = [ids] unless ids.is_a? Array
  raise "Unknown Universe: #{universe}. Try one of: #{@@UNIVERSE.keys}" unless
    @@UNIVERSE.key? universe
  settings = @@UNIVERSE[universe]
  db_settings = settings[:dbs][db]
  map_to = db_settings.nil? ? nil : db_settings[:map_to]
  # The URL is built the same way for every method, so build it once here
  # instead of relying on a local assigned in a different +when+ branch.
  url = sprintf settings[:url], db, ids.join(","), format, map_to
  case settings[:method]
  when :rest
    response = RestClient::Request.execute(:method=>:get, :url=>url,
      :timeout=>600)
    raise "Unable to reach #{universe} client, error code "+
      "#{response.code}." unless response.code == 200
    doc = response.to_s
  when :net
    # NOTE(review): Kernel#open only fetches URLs when open-uri is loaded (and
    # not at all on Ruby >= 3.0), and treats a leading "|" as a command to
    # run. Confirm the IDs reaching this branch are trusted.
    doc = ""
    open(url) { |f| doc = f.read }
  end
  # Block form closes the handle even if the write raises.
  File.open(file, "w") { |ofh| ofh.print doc } unless file.nil?
  doc
end

.UNIVERSE ⇒ Object

Structure of the different database Universes or containers. The structure is a Hash with universe names as keys as Symbol and values being a Hash with supported keys as Symbol:

  • :dbs => Hash with keys being the database name and the values a Hash of properties such as stage, format, and map_to.

  • :url => Pattern of the URL where the data can be obtained, where %1$s is the name of the database, %2$s is the IDs, and %3$s is format.

  • :method => Method used to query the URL. Both :rest and :net are currently supported.

  • :map_to_universe => Universe where results map to. Currently unsupported.



23
# File 'lib/miga/remote_dataset.rb', line 23

# Class-level accessor returning the @@UNIVERSE Hash describing the
# supported universes.
def self.UNIVERSE
  @@UNIVERSE
end

Instance Method Details

#download(file) ⇒ Object

Download data into file.



159
160
161
162
# File 'lib/miga/remote_dataset.rb', line 159

# Download the data of this dataset into +file+, using the format registered
# in @@UNIVERSE for its database. Delegates to MiGA::RemoteDataset.download
# and returns its result.
def download(file)
  fmt = @@UNIVERSE[universe][:dbs][db][:format]
  MiGA::RemoteDataset.download(universe, db, ids, fmt, file)
end

#get_metadata(metadata = {}) ⇒ Object

Get metadata from the remote location.



148
149
150
151
152
153
154
155
# File 'lib/miga/remote_dataset.rb', line 148

# Get metadata from the remote location.
#
# +metadata+ is a Hash that is updated in place and returned. For the :ebi
# and :ncbi universes, the NCBI taxonomy is stored under the key :tax; for
# any other universe the Hash is returned untouched.
def get_metadata(metadata={})
  case universe
  when :ebi, :ncbi
    # Get taxonomy
    metadata[:tax] = get_ncbi_taxonomy
  end
  metadata
end

#get_ncbi_taxid ⇒ Object

Get NCBI Taxonomy ID.



166
167
168
# File 'lib/miga/remote_dataset.rb', line 166

# Get NCBI Taxonomy ID by dispatching to the method named
# get_ncbi_taxid_from_<universe> for the universe of this dataset.
def get_ncbi_taxid
  resolver = "get_ncbi_taxid_from_#{universe}"
  send(resolver)
end

#get_ncbi_taxonomy ⇒ Object

Get NCBI taxonomy as MiGA::Taxonomy.



172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/miga/remote_dataset.rb', line 172

# Get NCBI taxonomy as MiGA::Taxonomy.
#
# Starts at the ID returned by +get_ncbi_taxid+ and follows the PARENT ID of
# each taxonomy entry downloaded from the :ebi universe, until the ID is nil,
# "0", or "1". Each entry contributes its rank => scientific name; a first
# entry with rank "no rank" is recorded under "dataset".
def get_ncbi_taxonomy
  ranks = {}
  current = get_ncbi_taxid
  until current.nil? || %w{0 1}.include?(current)
    entry = MiGA::RemoteDataset.download(:ebi, :taxonomy, current, "")
    sci_name = entry[/SCIENTIFIC NAME\s+:\s+(.+)/, 1]
    rank = entry[/RANK\s+:\s+(.+)/, 1]
    rank = "dataset" if ranks.empty? && rank == "no rank"
    ranks[rank] = sci_name unless rank.nil?
    current = entry[/PARENT ID\s+:\s+(.+)/, 1]
  end
  MiGA::Taxonomy.new(ranks)
end

#save_to(project, name = nil, is_ref = true, metadata = {}) ⇒ Object

Save dataset to the MiGA::Project project identified with name. is_ref indicates if it should be a reference dataset, and metadata contains the metadata to store with it.



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/miga/remote_dataset.rb', line 116

# Save dataset to the MiGA::Project +project+ identified with +name+.
# +is_ref+ indicates if it should be a reference dataset, and +metadata+
# contains the initial metadata, which is completed with +get_metadata+.
#
# +project+ may be a MiGA::Project or a String, in which case it is passed to
# MiGA::Project.new. When +name+ is nil, it is derived from the IDs joined
# with "_". Returns the MiGA::Dataset created.
#
# Raises RuntimeError if the dataset already exists in the project, if the
# stage of the database is not :assembly, or if the seed result cannot be
# added to the new dataset.
def save_to(project, name=nil, is_ref=true, metadata={})
  name = ids.join("_").miga_name if name.nil?
  project = MiGA::Project.new(project) if project.is_a? String
  raise "Dataset #{name} exists in the project, aborting..." if
    MiGA::Dataset.exist?(project, name)
  metadata = get_metadata(metadata)
  db_settings = @@UNIVERSE[universe][:dbs][db]
  case db_settings[:stage]
  when :assembly
    base = project.path + "/data/" + MiGA::Dataset.RESULT_DIRS[:assembly] +
      "/" + name
    File.open("#{base}.start", "w") { |ofh| ofh.puts Time.now.to_s }
    if db_settings[:format] == :fasta_gz
      download("#{base}.LargeContigs.fna.gz")
      # Argument-list form bypasses the shell, so paths containing spaces or
      # shell metacharacters reach gzip unchanged.
      system("gzip", "-d", "#{base}.LargeContigs.fna.gz")
    else
      download("#{base}.LargeContigs.fna")
    end
    File.symlink("#{base}.LargeContigs.fna", "#{base}.AllContigs.fna")
    File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
  else
    raise "Unexpected error: Unsupported result for database #{db}."
  end
  dataset = MiGA::Dataset.new(project, name, is_ref, metadata)
  project.add_dataset(dataset.name)
  result = dataset.add_result db_settings[:stage]
  raise "Empty dataset created: seed result was not added due to "+
    "incomplete files." if result.nil?
  dataset
end