Class: PubliSci::Dataset

Inherits:
Object
  • Object
show all
Extended by:
Interactive, Registry
Defined in:
lib/publisci/dataset/dataset.rb,
lib/publisci/dsl/dataset_dsl.rb,
lib/publisci/dataset/data_cube.rb,
lib/publisci/dataset/dataset_for.rb,
lib/publisci/dataset/configuration.rb

Defined Under Namespace

Modules: DSL, DataCube Classes: Configuration

Class Method Summary collapse

Methods included from Interactive

interact

Methods included from Registry

register, registry, symbol_for

Class Method Details

.configuration ⇒ Object



6
7
8
# File 'lib/publisci/dataset/dataset.rb', line 6

# Returns the shared Dataset::Configuration, building it on first access
# and handing back the same instance on every later call.
def self.configuration
  @config || (@config = Dataset::Configuration.new)
end

.download(uri) ⇒ Object



66
67
68
69
70
71
# File 'lib/publisci/dataset/dataset_for.rb', line 66

# Fetches +uri+ into a temporary file and returns the closed Tempfile.
#
# The temp file keeps the extension of the last path segment of +uri+
# (Tempfile inserts its unique token between stem and extension), so
# File.extname on the returned path reflects the remote file type.
#
# The caller must keep a reference to the returned Tempfile for as long as
# the file is needed: Tempfile unlinks the file once the object is
# garbage-collected.
#
# @param uri [String] location to read; anything URI.open accepts
# @return [Tempfile] closed temp file holding the downloaded bytes
def self.download(uri)
  name = uri.split('/').last
  ext = File.extname(name)
  out = Tempfile.new([File.basename(name, ext), ext])
  # Payloads may be binary (e.g. .RData); avoid newline translation.
  out.binmode
  begin
    # URI.open (open-uri, Ruby >= 2.5) instead of bare Kernel#open, which no
    # longer handles URLs on Ruby 3.0+. The block form closes the remote
    # stream as soon as it has been read.
    URI.open(uri) { |remote| out.write remote.read }
  ensure
    out.close
  end
  out
end

.for(object, options = {}, ask_on_ambiguous = true) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/publisci/dataset/dataset_for.rb', line 14

# Hands +object+ to the reader that matches it and returns that reader's
# result.
#
# Dispatch order for a String:
# 1. An existing file path: the extension (or the whole name for a bare
#    dotfile such as ".RData") is looked up in reader_registry, then in the
#    built-in .RData / csv / arff cases.
# 2. A string containing an http(s) URL: the resource is downloaded once and
#    the temp file is dispatched as in (1). If that yields nil/false, a
#    single RDF statement (as a String) carrying the downloaded body is
#    returned instead.
# An Rserve::REXP is passed to r_object.
#
# @param object [String, Rserve::REXP] file path, URL, or R object
# @param options [Hash, Boolean] forwarded to the reader; a bare true/false
#   in this position is taken as +ask_on_ambiguous+
# @param ask_on_ambiguous [Boolean] forwarded to the reader
# @return [Object] whatever the selected reader returns
# @raise [RuntimeError] when no reader can be chosen for +object+
def self.for(object, options={}, ask_on_ambiguous=true)
  # Support the two-argument form `for(object, false)`.
  if options == false || options == true
    ask_on_ambiguous = options
    options = {}
  end

  if object.is_a? String
    if File.exist? object
      name = File.basename(object)
      if File.extname(object).size > 0
        extension = File.extname(object)
      elsif name[0] == '.' && name.count('.') == 1
        # File.extname is empty for a bare dotfile; use the whole name.
        extension = name
      else
        raise "Can't load file #{object}; file type inference not yet implemented"
      end

      if reader_registry.key? extension
        reader = reader_registry[extension]
        # A registered reader may expose `automatic` itself or on its instances.
        reader = reader.new unless reader.respond_to? "automatic"
        reader.automatic(object,options,ask_on_ambiguous)
      else
        # NOTE(review): the csv/arff patterns are unanchored and the dot is a
        # wildcard, so an extname that merely contains "csv" also matches.
        # Tighten only after confirming no caller relies on that.
        case extension
        when ".RData"
          r_object(object, options, ask_on_ambiguous)
        when /.csv/i
          PubliSci::Readers::CSV.new.automatic(object,nil,options,ask_on_ambiguous)
        when /.arff/i
          PubliSci::Readers::ARFF.new.generate_n3(object)
        else
          # false
          raise "Unkown Extension #{extension}"
        end
      end
    elsif object =~ %r{htt(p|ps)://.+}
      # Keep the download result referenced until we are done with its path
      # (a Tempfile is unlinked once garbage-collected), and reuse it for the
      # fallback instead of fetching the URL a second time.
      local = download(object)
      self.for(local.path, options, ask_on_ambiguous) ||
        RDF::Statement.new(RDF::URI(object), RDF::URI('http://semanticscience.org/resource/hasValue'), IO.read(local.path)).to_s
    else
      # TODO: better handling of missing readers; need this way for raw strings for now
      raise "Unable to find reader for String '#{object}'"
    end
  elsif object.is_a? Rserve::REXP
    r_object(object, options, ask_on_ambiguous)
  else
    raise "not recognize Ruby objects of this type yet (#{object})"
  end
end

.r_object(object, options = {}, ask_on_ambiguous = true) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/publisci/dataset/dataset_for.rb', line 73

# Converts an R object to n3 through the matching PubliSci reader.
#
# +object+ is either the path of an .RData file, which is loaded through a
# new Rserve connection, or an Rserve::REXP already in hand.
#
# For a path, the class of the loaded variable selects the reader:
# "data.frame" -> Readers::Dataframe, "cross" -> Readers::RCross,
# "matrix" -> Readers::RMatrix. When +ask_on_ambiguous+ is true, missing
# choices (variable, dimensions, measures, labels, output base) are read
# from stdin and stored into +options+, which is modified in place.
#
# @param object [String, Rserve::REXP] .RData path or R object
# @param options [Hash] reader options; :dimensions / :measures given here
#   suppress the corresponding prompts
# @param ask_on_ambiguous [Boolean] whether to prompt for missing choices
# @return [Object] the value returned by the reader's generate_n3
# @raise [RuntimeError] when +object+ is not an R object, nothing was loaded,
#   or no reader handles its class
def self.r_object(object, options={}, ask_on_ambiguous=true)
  if object.is_a? String
    con = Rserve::Connection.new

    # Escape backslashes and single quotes so the path cannot terminate the
    # R string literal early (e.g. a directory named "o'brien").
    r_path = File.absolute_path(object).gsub(/[\\']/) { |ch| "\\#{ch}" }

    # NOTE(review): Array() guards against the client handing back a bare
    # String for a one-element character vector, which would make [0] pick a
    # single character — confirm against rserve-client's to_ruby.
    loaded = Array(con.eval("load('#{r_path}')").to_ruby)
    raise "no variables loaded from #{object}" if loaded.empty?

    if loaded.size > 1 && ask_on_ambiguous
      puts "Which variable? #{loaded}"
      var = loaded[gets.to_i]
    else
      var = loaded[0]
    end

    # Names produced by an R expression, always as an Array.
    r_names = lambda { |expr| Array(con.eval(expr).to_ruby) }

    # Reads comma-separated indices from stdin and maps them onto +names+;
    # nil when the answer is blank (or stdin is at EOF).
    pick_from = lambda do |names|
      answer = gets.to_s.chomp
      answer.split(',').map(&:to_i).map { |i| names[i] } if answer.size > 0
    end

    # Prints +prompt+ and returns the typed line, or +default+ when blank.
    ask_text = lambda do |prompt, default|
      puts prompt
      answer = gets.to_s.chomp
      answer.size > 0 ? answer : default
    end

    r_classes = con.eval("class(#{var})").to_ruby

    if r_classes.include? "data.frame"
      df = PubliSci::Readers::Dataframe.new

      if ask_on_ambiguous && !options[:dimensions]
        dims = r_names.call("names(#{var})")
        puts "Which dimensions? #{dims}"
        picked = pick_from.call(dims)
        options[:dimensions] = picked if picked
      end

      if ask_on_ambiguous && !options[:measures]
        meas = r_names.call("names(#{var})")
        puts "Which measures? #{meas} "
        picked = pick_from.call(meas)
        options[:measures] = picked if picked
      end

      df.generate_n3(con.eval(var),var,options)

    elsif r_classes.include? "cross"
      bc = PubliSci::Readers::RCross.new

      if ask_on_ambiguous && !options[:measures]
        pheno_names = r_names.call("names(#{var}$pheno)")
        puts "Which phenotype traits? #{pheno_names}"
        picked = pick_from.call(pheno_names)
        options[:measures] = picked if picked
      end

      # The variable name doubles as the output base unless one is typed.
      base = ask_on_ambiguous ? ask_text.call("Output file base?", var) : var

      bc.generate_n3(con, var, base, options)

    elsif r_classes.include? "matrix"
      mat = PubliSci::Readers::RMatrix.new

      if ask_on_ambiguous && !options[:measures]
        rows = ask_text.call("Row label", "row")
        cols = ask_text.call("Column label", "column")
        vals = ask_text.call("Entry label", "value")

        options[:measures] = [cols,rows,vals]
      end

      base = ask_on_ambiguous ? ask_text.call("Output file base?", var) : var

      mat.generate_n3(con, var, base, options)
    else
      raise "no PubliSci::Readers found for #{r_classes}"
    end

  elsif object.is_a? Rserve::REXP
    # Any REXP with a non-empty "class" attribute goes to the Dataframe
    # reader; the class value itself is not inspected here.
    if object.attr.payload["class"].payload.first

      df = PubliSci::Readers::Dataframe.new

      var = ask_on_ambiguous ? interact("Dataset name?",nil) : nil

      if ask_on_ambiguous && !options[:dimensions]
        dims = object.payload.names
        selection = interact("Which dimensions?","row",dims){|s| puts s; nil}
        options[:dimensions] = selection if selection
      end

      if ask_on_ambiguous && !options[:measures]
        meas = object.payload.names
        options[:measures] = interact("Which measures?",meas,meas)
      end

      df.generate_n3(object,var,options)
    else
      raise "support for other Rserve objects coming shortly"
    end

  else
    raise "#{object} is not an R object"
  end
end

.reader_registry ⇒ Object



6
7
8
# File 'lib/publisci/dataset/dataset_for.rb', line 6

# Returns the extension => reader table, creating an empty Hash on first use.
def self.reader_registry
  @reader_registry || (@reader_registry = {})
end

.register_reader(extension, klass) ⇒ Object



10
11
12
# File 'lib/publisci/dataset/dataset_for.rb', line 10

# Associates +extension+ with +klass+ in reader_registry, replacing any
# earlier entry for the same extension. Returns +klass+.
def self.register_reader(extension,klass)
  reader_registry.store(extension, klass)
end