Module: MiGA::Project::Dataset

Included in:
MiGA::Project
Defined in:
lib/miga/project/dataset.rb

Overview

Helper module including specific functions to handle datasets.

Instance Method Summary collapse

Instance Method Details

#add_dataset(name) ⇒ Object

Add dataset identified by name and return MiGA::Dataset.



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/miga/project/dataset.rb', line 73

# Register the dataset identified by +name+ in the project and return the
# result of dataset(name).
#
# Nothing is registered when +name+ is already listed in
# @metadata[:datasets]. Otherwise the name is appended to that list, the
# name caches that are already loaded are extended, the reference/active
# cache is reset, the project is saved, tasks are recalculated if the new
# dataset is both reference and active, and the :on_add_dataset hook is
# pulled.
def add_dataset(name)
  # The membership test must look at the registered names; testing against
  # a literal [:datasets] array would register every name again.
  unless @metadata[:datasets].include?(name)
    d = MiGA::Dataset.new(self, name)
    @metadata[:datasets] << name
    # Only keep caches in sync if they have been built already
    @dataset_names_hash[name] = true if @dataset_names_hash
    @dataset_names_set << name if @dataset_names_set
    # Force dataset_ref_active to be recalculated on next use
    @dataset_ref_active = nil
    save
    if d.ref? && d.active?
      recalculate_tasks("Reference dataset added: #{d.name}")
    end
    pull_hook(:on_add_dataset, name)
  end
  dataset(name)
end

#dataset(name) ⇒ Object

Returns MiGA::Dataset



52
53
54
55
56
57
58
59
# File 'lib/miga/project/dataset.rb', line 52

# Look up the dataset called +name+ (normalized with to_s.miga_name).
#
# Returns nil when MiGA::Dataset.exist? reports that the project has no such
# dataset. Otherwise returns a MiGA::Dataset, built once per normalized name
# and memoized in @datasets for subsequent calls.
def dataset(name)
  key = name.to_s.miga_name
  if MiGA::Dataset.exist?(self, key)
    @datasets ||= {}
    @datasets[key] ||= MiGA::Dataset.new(self, key)
  end
end

#dataset_names ⇒ Object

Returns Array of String (without evaluating dataset objects)



15
16
17
# File 'lib/miga/project/dataset.rb', line 15

# Names of the datasets registered in the project, as stored in
# @metadata[:datasets]. No dataset objects are built.
#
# The stored collection itself is returned (not a copy), so mutating the
# result mutates the project metadata.
def dataset_names
  @metadata[:datasets]
end

#dataset_names_hash ⇒ Object

Returns Hash of { String => true }. Similar to dataset_names but as Hash for efficiency



22
23
24
25
# File 'lib/miga/project/dataset.rb', line 22

# Hash of { name => true } with one key per entry of dataset_names.
#
# Emits a deprecation warning on every call. The Hash is built on first use
# and memoized in @dataset_names_hash.
def dataset_names_hash
  warn 'The Project#dataset_names_hash method will be deprecated soon'
  @dataset_names_hash ||=
    dataset_names.each_with_object({}) do |ds_name, index|
      index[ds_name] = true
    end
end

#dataset_names_set ⇒ Object

Returns Set of Strings. Similar to dataset_names but as Set for efficiency



30
31
32
# File 'lib/miga/project/dataset.rb', line 30

# Set built from dataset_names, for fast membership checks.
#
# The Set is created on first use and memoized in @dataset_names_set.
def dataset_names_set
  return @dataset_names_set if @dataset_names_set

  @dataset_names_set = Set.new(dataset_names)
end

#dataset_ref_active ⇒ Object

Cache for the special set of datasets which are both reference and active, returned as an Array. Use carefully, as it doesn’t get recalculated upon dataset (in)activation once loaded. To force recalculating, use dataset_ref_active!



39
40
41
# File 'lib/miga/project/dataset.rb', line 39

# Cached result of dataset_ref_active!.
#
# The value is computed on first use and kept in @dataset_ref_active; it is
# only refreshed after that variable is reset or dataset_ref_active! is
# called.
def dataset_ref_active
  return @dataset_ref_active if @dataset_ref_active

  @dataset_ref_active = dataset_ref_active!
end

#dataset_ref_active! ⇒ Object

Force recalculation of dataset_ref_active and returns the Array of MiGA::Dataset objects



46
47
48
# File 'lib/miga/project/dataset.rb', line 46

# Recompute the list of datasets that answer true to both ref? and active?,
# store it in @dataset_ref_active and return it.
def dataset_ref_active!
  @dataset_ref_active = datasets.select { |ds| ds.ref? && ds.active? }
end

#datasets ⇒ Object

Returns Array of MiGA::Dataset



9
10
11
# File 'lib/miga/project/dataset.rb', line 9

# Array with the result of dataset(name) for every name registered in
# @metadata[:datasets], in registration order.
#
# Entries are whatever dataset returns for each name, so an entry is nil
# when dataset returns nil for that name.
def datasets
  @metadata[:datasets].map { |name| dataset(name) }
end

#done_preprocessing?(save = false) ⇒ Boolean

Are all the datasets in the project preprocessed? Save intermediate results if save (until the first incomplete dataset is reached).

Returns:

  • (Boolean)


172
173
174
175
176
# File 'lib/miga/project/dataset.rb', line 172

# Are all the reference and active datasets in the project preprocessed?
#
# +save+ is forwarded to each dataset's done_preprocessing?. The scan stops
# at the first reference, active dataset that reports it is not done.
# Datasets that are not reference or not active are not queried.
def done_preprocessing?(save = false)
  each_dataset.none? do |ds|
    ds.ref? && ds.active? && !ds.done_preprocessing?(save)
  end
end

#each_dataset(&blk) ⇒ Object

Iterate through datasets (MiGA::Dataset)



63
64
65
66
67
68
69
# File 'lib/miga/project/dataset.rb', line 63

# Iterate through the registered datasets, calling +blk+ with the result of
# dataset(name) for every name in @metadata[:datasets].
#
# Without a block, returns an Enumerator over the same sequence.
def each_dataset(&blk)
  return to_enum(:each_dataset) unless block_given?

  @metadata[:datasets].each { |name| blk.call(dataset(name)) }
end

#each_dataset_profile_advance(&blk) ⇒ Object

Call blk passing the result of MiGA::Dataset#profile_advance for each registered dataset.



194
195
196
# File 'lib/miga/project/dataset.rb', line 194

# Call +blk+ once per dataset yielded by each_dataset, passing that
# dataset's profile_advance.
def each_dataset_profile_advance(&blk)
  each_dataset do |entry|
    advance = entry.profile_advance
    blk.call(advance)
  end
end

#import_dataset(ds, method = :hardlink) ⇒ Object

Import the dataset ds, a MiGA::Dataset, using method which is any method supported by File#generic_transfer.



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/miga/project/dataset.rb', line 110

# Import the dataset +ds+ into this project, moving files with
# File.generic_transfer and the given +method+.
#
# Raises a RuntimeError if MiGA::Dataset.exist? reports a dataset with the
# same name in this project. For every result of +ds+, each result file and
# any existing "<name>.json", "<name>.start" and "<name>.done" file in the
# result directory is transferred to this project's
# data/<MiGA::Dataset.RESULT_DIRS[task]> folder. The dataset metadata JSON is
# transferred last, and the dataset is registered with add_dataset, whose
# result is returned.
def import_dataset(ds, method = :hardlink)
  if MiGA::Dataset.exist?(self, ds.name)
    raise "Impossible to import dataset, it already exists: #{ds.name}."
  end

  ds.each_result do |task, result|
    # Files listed by the result
    result.each_file do |file|
      source = File.join(result.dir, file)
      target = File.join(
        path, 'data', MiGA::Dataset.RESULT_DIRS[task], file
      )
      File.generic_transfer(source, target, method)
    end

    # Bookkeeping files of the result, only those that exist
    %w[json start done].each do |suffix|
      basename = "#{ds.name}.#{suffix}"
      source = File.join(result.dir, basename)
      next unless File.exist?(source)

      target = File.join(
        path, 'data', MiGA::Dataset.RESULT_DIRS[task], basename
      )
      File.generic_transfer(source, target, method)
    end
  end

  # Dataset metadata
  metadata_file = "#{ds.name}.json"
  File.generic_transfer(
    File.join(ds.project.path, 'metadata', metadata_file),
    File.join(path, 'metadata', metadata_file),
    method
  )

  # Register the dataset in this project
  add_dataset(ds.name)
end

#profile_datasets_advance ⇒ Object

Returns a two-dimensional matrix (Array of Array) where the first index corresponds to the dataset, the second index corresponds to the dataset task, and the value corresponds to:

  • 0: Before execution.

  • 1: Done (or not required).

  • 2: To do.



185
186
187
188
189
# File 'lib/miga/project/dataset.rb', line 185

# Collect every value yielded by each_dataset_profile_advance into a new
# Array, in the order they are yielded, and return it.
def profile_datasets_advance
  [].tap do |rows|
    each_dataset_profile_advance { |row| rows << row }
  end
end

#unlink_dataset(name) ⇒ Object

Unlink dataset identified by name and return MiGA::Dataset.



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/miga/project/dataset.rb', line 91

# Unlink the dataset identified by +name+ from the project and return the
# unlinked dataset object, or nil (with no side effects) if dataset(name)
# finds nothing.
#
# Unlinking resets the name and reference/active caches, removes +name+ from
# @metadata[:datasets], saves the project, recalculates tasks if the dataset
# was both reference and active, and pulls the :on_unlink_dataset hook.
def unlink_dataset(name)
  d = dataset(name)
  return nil if d.nil?

  # Drop every derived cache so it is rebuilt from the updated list
  @dataset_names_hash = nil
  @dataset_names_set  = nil
  @dataset_ref_active = nil
  # NOTE(review): this deletes the raw +name+, whereas #dataset looks names
  # up after to_s.miga_name; confirm callers pass the registered String.
  @metadata[:datasets].delete(name)
  save
  if d.ref? && d.active?
    recalculate_tasks("Reference dataset unlinked: #{d.name}")
  end
  pull_hook(:on_unlink_dataset, name)
  d
end

#unregistered_datasets ⇒ Object

Find all datasets that have (potential) result files but are not yet registered.



150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/miga/project/dataset.rb', line 150

# Find the names of all datasets with (potential) result files that are not
# registered in @metadata[:datasets].
#
# Every existing directory "<path>/data/<dir>", for each value of
# MiGA::Dataset.RESULT_DIRS, is scanned for entries whose name ends in one of
# the recognized result extensions (optionally followed by ".gz"). The
# candidate name is the text before the first dot of the entry, and
# "miga-project" is always ignored. Returns an Array of unique names.
def unregistered_datasets
  found = []
  MiGA::Dataset.RESULT_DIRS.each_value do |dir|
    dir_p = "#{path}/data/#{dir}"
    next unless Dir.exist? dir_p

    Dir.entries(dir_p).each do |file|
      next unless
        file =~ %r{
          \.(fa(a|sta|stqc?)?|fna|solexaqa|gff[23]?|done|ess)(\.gz)?$
        }x

      # First run of non-dot characters in the entry name
      ds_name = file[/[^.]+/]
      found << ds_name unless ds_name.nil? || ds_name == 'miga-project'
    end
  end
  # Subtract the registered names; a literal [:datasets] would filter nothing
  found.uniq - @metadata[:datasets]
end