Class: Matrix

Inherits:
Object
  • Object
show all
Extended by:
Resource
Defined in:
lib/rbbt/matrix.rb,
lib/rbbt/matrix/barcode.rb,
lib/rbbt/matrix/differential.rb,
lib/rbbt/expression_old/matrix.rb

Constant Summary collapse

MATRIX_DIR =
Matrix.root.find

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_file, labels = nil, value_type = nil, format = nil, organism = nil, identifiers = nil) ⇒ Matrix

Returns a new instance of Matrix.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/rbbt/matrix.rb', line 14

# Builds a matrix description around +data_file+.
#
# +value_type+ falls back to 'count'. When +format+ or +organism+ is not
# given, the header of the data file is parsed (at most once) and its
# key_field / namespace is used, falling back to "ID" and
# Organism.default_code("Hsa") respectively.
def initialize(data_file, labels = nil, value_type = nil, format = nil, organism=nil, identifiers=nil)
  @data_file  = data_file
  @labels     = labels
  @value_type = value_type ? value_type : 'count'

  # Parsed lazily so callers that supply both format and organism never
  # touch the file; shared so it is read only once when both are missing.
  header = nil
  read_header = lambda { header ||= TSV.parse_header(@data_file) }

  @format   = format   || (read_header.call.key_field || "ID")
  @organism = organism || (read_header.call.namespace || Organism.default_code("Hsa"))

  @identifiers = identifiers
end

Class Attribute Details

.matrix_dir ⇒ Object

Returns the value of attribute matrix_dir.



7
8
9
# File 'lib/rbbt/matrix.rb', line 7

# Reader for @matrix_dir (listed on this page as a class attribute).
# Returns whatever is currently stored there.
def matrix_dir
  @matrix_dir
end

Instance Attribute Details

#channel ⇒ Object

Returns the value of attribute channel.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for @channel (legacy expression_old matrix attribute).
def channel
  @channel
end

#data ⇒ Object

Returns the value of attribute data.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for @data (legacy expression_old matrix attribute).
def data
  @data
end

#data_file ⇒ Object

Returns the value of attribute data_file.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @data_file, the first constructor argument.
def data_file
  @data_file
end

#format ⇒ Object

Returns the value of attribute format.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @format. The constructor fills it from the header's key field
# (or "ID") when no format is supplied.
def format
  @format
end

#identifiers ⇒ Object

Returns the value of attribute identifiers.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @identifiers, as passed to the constructor (may be nil).
def identifiers
  @identifiers
end

#key_field ⇒ Object

Returns the value of attribute key_field.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for @key_field (legacy expression_old matrix attribute).
def key_field
  @key_field
end

#labels ⇒ Object

Returns the value of attribute labels.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @labels, as passed to the constructor (may be nil).
def labels
  @labels
end

#log2 ⇒ Object

Returns the value of attribute log2.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for @log2 (legacy expression_old matrix attribute).
def log2
  @log2
end

#organism ⇒ Object

Returns the value of attribute organism.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @organism. The constructor fills it from the header namespace
# (or Organism.default_code("Hsa")) when no organism is supplied.
def organism
  @organism
end

#samples ⇒ Object

Returns the value of attribute samples.



32
33
34
# File 'lib/rbbt/matrix.rb', line 32

# Field names read from the header of @data_file.
# The header is parsed on first use and the result is kept in @samples.
def samples
  return @samples if @samples

  @samples = TSV.parse_header(@data_file).fields
end

#value_type ⇒ Object

Returns the value of attribute value_type.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for @value_type; the constructor defaults it to 'count'.
def value_type
  @value_type
end

Class Method Details

.geo_matrix_for(gds, key_field = nil, organism = nil) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/expression_old/matrix.rb', line 15

# Builds a (legacy) Matrix for the GEO dataset +gds+.
#
# Locates the dataset's values and samples files, reads its 'info.yaml'
# to find the platform, locates that platform's codes file, and passes
# everything to Matrix.new. The log2 flag is true only when the
# dataset's :value_type is "count".
def self.geo_matrix_for(gds, key_field = nil, organism = nil)
  values_file  = GEO[gds].values.produce.find
  samples_file = GEO[gds].samples.produce.find

  info       = GEO[gds]['info.yaml'].produce.yaml
  codes_file = GEO[info[:platform]].codes.produce.find

  needs_log2 = "count" == info[:value_type]

  Matrix.new(values_file, codes_file, samples_file, key_field, organism, needs_log2)
end

Instance Method Details

#activity_cluster(outfile, factor = 2) ⇒ Object



48
49
50
51
52
53
54
55
56
57
# File 'lib/rbbt/matrix/barcode.rb', line 48

# Runs the R function rbbt.GE.activity_cluster (loaded from the shared
# barcode.R script) on this matrix's data file, writing to +outfile+.
#
# The parent directory of +outfile+ is created when it does not exist.
# Returns whatever R.run returns.
#
# NOTE(review): +factor+ is accepted but never used; value_type is passed
# to R as the third argument instead. Confirm against the R signature.
def activity_cluster(outfile, factor = 2)
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless outfile.nil?
    outdir = File.dirname(outfile)
    FileUtils.mkdir_p outdir unless File.exist? outdir
  end

  cmd =<<-EOF
source('#{Rbbt.share.R['barcode.R'].find}')
rbbt.GE.activity_cluster(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{R.ruby2R value_type})
  EOF

  R.run(cmd)
end

#average_label(value, field = nil) ⇒ Object



69
70
71
72
73
# File 'lib/rbbt/expression_old/matrix.rb', line 69

# Averages the samples whose label matches +value+ (optionally restricted
# to +field+): find the samples, drop those missing from the matrix, and
# hand the remainder to average_samples. Returns its result.
def average_label(value, field = nil)
  average_samples(remove_missing(find_samples(value, field)))
end

#average_samples(samples) ⇒ Object



51
52
53
54
55
56
57
# File 'lib/rbbt/expression_old/matrix.rb', line 51

# Persists the average of +samples+ computed by Expression.average_samples
# over the matrix file and returns the path of the persisted result.
# The path is derived from the matrix file and the sample list, under
# the 'averaged_samples' directory of Matrix::MATRIX_DIR.
def average_samples(samples)
  path = Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'averaged_samples')},
    {:samples => samples}
  )

  Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) {
    Expression.average_samples(matrix_file, samples)
  }

  path
end

#barcode(outfile, factor = 2) ⇒ Object



4
5
6
7
8
9
10
11
12
13
# File 'lib/rbbt/matrix/barcode.rb', line 4

# Runs the R function rbbt.GE.barcode.mode (loaded from the shared
# barcode.R script) on this matrix's data file with the given +factor+,
# writing to +outfile+.
#
# The parent directory of +outfile+ is created when it does not exist.
# Returns whatever R.run returns.
def barcode(outfile, factor = 2)
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless outfile.nil?
    outdir = File.dirname(outfile)
    FileUtils.mkdir_p outdir unless File.exist? outdir
  end

  cmd =<<-EOF
source('#{Rbbt.share.R['barcode.R'].find}')
rbbt.GE.barcode.mode(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{ R.ruby2R factor })
  EOF

  R.run(cmd)
end

#barcode_ruby(outfile, factor = 2) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/rbbt/matrix/barcode.rb', line 15

# Computes a 0/1 barcode for every row of the data file in Ruby (asking R
# only for the mode of each row) and writes the result to +outfile+.
#
# For each row: the values are flattened to floats, the mode is obtained
# from rbbt.get.modes, the values at or below the mode are extended with
# the same values shifted by the mode, and their standard deviation gives
# the threshold mode + sd. Each entry becomes 1 when above the threshold
# (using the mean for list entries) and 0 otherwise. Rows for which
# Misc.sd returns nil yield a row of nils.
#
# NOTE(review): +factor+ is accepted but never used here.
# NOTE(review): the "shifted" values are v + mode, not a reflection around
# the mode (2 * mode - v) — confirm this is intended.
# NOTE(review): an Array key is unwrapped only on the non-nil sd path.
def barcode_ruby(outfile, factor = 2)
  parser = TSV::Parser.new self.data_file
  dumper = TSV::Dumper.new parser.options.merge(:type => :list, :cast => :to_i)
  dumper.init

  TSV.traverse parser, :into => dumper, :bar => "Barcoding #{self.data_file}" do |key,values|
    numbers = values.flatten.compact.map(&:to_f)
    modes   = R.eval("rbbt.get.modes(#{R.ruby2R numbers})$modes")
    mode    = Array === modes ? modes.first : modes

    below_mode  = numbers.select { |number| number <= mode }
    null_values = below_mode + below_mode.map { |number| number + mode }
    spread      = Misc.sd null_values

    next [key, [nil] * values.length] if spread.nil?

    cutoff = mode + spread
    nested = Array === values.compact.first
    bars = values.map do |entry|
      level = nested ? Misc.mean(entry.compact.map(&:to_f)) : entry.to_f
      level > cutoff ? 1 : 0
    end

    key = key.first if Array === key
    [key, bars]
  end

  Misc.sensiblewrite(outfile, dumper.stream)
end

#comparison(main, contrast, subsets = nil) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/rbbt/matrix.rb', line 97

# Resolves a main / contrast selection into two lists of sample names.
#
# Each selection is either "factor=value", looked up in +subsets+
# (defaulting to self.subsets), or a list of sample names separated by
# '|', ',' or newlines. When +contrast+ is nil, the contrast group is
# every other sample of the main factor (if main was a factor selection)
# or every other sample of the matrix.
#
# Returns [main_samples, contrast_samples], both stripped of surrounding
# whitespace and of blank entries.
#
# Raises ParameterException when a "factor=value" selection is not found
# in +subsets+.
def comparison(main, contrast, subsets = nil)
  subsets ||= self.subsets

  # Strip before rejecting blanks so whitespace-only entries are dropped too.
  tidy = lambda { |names| names.compact.collect { |name| name.strip }.reject { |name| name.empty? } }

  main_factor = nil
  if main.index "="
    # Limit of 2 keeps any further '=' as part of the value.
    main_factor, main_value = main.split "=", 2
    raise ParameterException, "Main selection not understood" if subsets[main_factor].nil? || subsets[main_factor][main_value].nil?
    value = subsets[main_factor][main_value]
    main_samples = String === value ? value.split(',') : value
  else
    main_samples = main.split(/[|,\n]/)
  end

  # Normalise the main group first: the implicit contrast below is computed
  # by subtraction, which only works on already-stripped names.
  main_samples = tidy.call(main_samples)

  if contrast
    if contrast.index "="
      contrast_factor, contrast_value = contrast.split "=", 2
      raise ParameterException, "Contrast selection not understood" if subsets[contrast_factor].nil? || subsets[contrast_factor][contrast_value].nil?
      value = subsets[contrast_factor][contrast_value]
      contrast_samples = String === value ? value.split(',') : value
    else
      contrast_samples = contrast.split(/[|,\n]/)
    end
    contrast_samples = tidy.call(contrast_samples)
  elsif subsets && main_factor
    factor_samples = subsets[main_factor].values.flatten.collect{|s| s.split ',' }.flatten
    contrast_samples = tidy.call(factor_samples).uniq - main_samples
  else
    contrast_samples = tidy.call(samples) - main_samples
  end

  [main_samples, contrast_samples]
end

#differential(main, contrast, path = nil) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/rbbt/matrix/differential.rb', line 4

# Runs a differential analysis between two groups of samples through the
# R function rbbt.dm.matrix.differential (loaded from the shared MA.R
# script), persisting the result with Persist.persist.
#
# +main+ and +contrast+ are either both Arrays of sample names or
# selections understood by #comparison. Samples not present in the matrix
# are ignored. +path+ is forwarded to Persist as the :file option.
#
# The value_type decides the R flags:
#   'two_channel'                     log2, two-channel
#   nil, 'count', 'counts'            log2
#   'fpkm'                            log2, eBayes trend
#   'log2 ratio', 'transformed count' none
# Any other value_type logs a warning and is treated like 'count'.
#
# Returns the result of Persist.persist. Raises RuntimeError when either
# group ends up empty or when Persist provides no output file.
def differential(main, contrast, path = nil)
  all_samples = self.samples
  if Array === main && Array === contrast
    main_samples, contrast_samples = main, contrast
  else
    main_samples, contrast_samples = comparison main, contrast
  end

  name = data_file =~ /:>/ ? File.basename(data_file) : data_file
  main_samples = main_samples & all_samples
  contrast_samples = contrast_samples & all_samples
  raise "No main samples found" if main_samples.empty?
  raise "No contrast samples found" if contrast_samples.empty?

  Persist.persist(name, :tsv, :persist => true, :file => path,
                  :other => {:main => main_samples, :contrast => contrast_samples}, 
                  :prefix => "Diff", :dir => Matrix.matrix_dir.differential, :no_load => true) do |file|

    raise "No output file provided for the differential analysis" if file.nil?

    log2, trend, two_channel =
      case value_type
      when 'two_channel'
        [true, false, true]
      when nil, 'count', 'counts'
        [true, false, false]
      when 'fpkm'
        [true, true, false]
      when 'log2 ratio', 'transformed count'
        [false, false, false]
      else
        Log.warn "Unkown value_type: #{value_type}"
        [true, false, false]
      end

    file = file.find if Path === file
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    FileUtils.mkdir_p File.dirname(file) unless file.nil? || File.exist?(File.dirname(file))

    cmd = <<-EOS

source('#{Rbbt.share.R["MA.R"].find(:lib)}')

data = rbbt.dm.matrix.differential(#{ R.ruby2R data_file }, 
main = #{R.ruby2R(main_samples)}, 
contrast = #{R.ruby2R(contrast_samples)}, 
log2=#{ R.ruby2R log2 }, 
outfile = #{R.ruby2R file}, 
key.field = #{R.ruby2R format}, 
two.channel = #{R.ruby2R two_channel},
namespace = #{R.ruby2R organism},
eBayes.trend = #{R.ruby2R trend}
)
    EOS

    R.run(cmd, :monitor => true)
  end
end

#find_samples(value, field = nil) ⇒ Object



59
60
61
62
63
# File 'lib/rbbt/expression_old/matrix.rb', line 59

# Returns the keys of the label entries matching +value+.
# Array-valued labels match when +value+ appears anywhere in the
# flattened array; other labels match on equality. +field+ is forwarded
# to labels.select.
def find_samples(value, field = nil)
  matching = labels.select(field) do |_sample, label|
    if Array === label
      label.flatten.include?(value)
    else
      label == value
    end
  end

  matching.keys
end

#label_differences(main, contrast = nil, field = nil) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/rbbt/expression_old/matrix.rb', line 92

# Computes sample differences between the samples labelled +main+ and
# those labelled +contrast+ (or, without a contrast, every other labelled
# sample). Samples missing from the matrix are dropped from both groups
# before delegating to sample_differences, whose result is returned.
def label_differences(main, contrast = nil, field = nil)
  every_sample = labels.keys
  selected     = find_samples(main, field)
  others       = contrast ? find_samples(contrast, field) : every_sample - selected

  sample_differences(remove_missing(selected), remove_missing(others))
end

#matrix_file(path = nil) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/rbbt/expression_old/matrix.rb', line 41

# Persists the loaded expression matrix and returns the path it is
# persisted at. When +path+ is not given it is derived from the data and
# the identifiers, labels, key field and organism, under
# Matrix::MATRIX_DIR. Matrices keyed by "Ensembl Gene ID" are restricted
# to the organism's sanctioned genes before being persisted.
def matrix_file(path = nil)
  unless path
    key_info = {:identifiers => identifiers, :labels => labels, :key_field => key_field, :organism => organism}
    path = Persist.persistence_path(data, {:dir => Matrix::MATRIX_DIR}, key_info)
  end

  Persist.persist(data, :tsv, :file => path, :check => [data], :no_load => true) do
    loaded = Expression.load_matrix(data, identifiers, key_field, organism)
    if loaded.key_field == "Ensembl Gene ID"
      loaded.select(:key => Organism.sanctioned_genes(organism).list)
    else
      loaded
    end
  end

  path
end

#random_forest_importance(main, contrast = nil, field = nil, options = {}) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/rbbt/expression_old/matrix.rb', line 125

# Persists and returns the random-forest feature importance separating
# the samples labelled +main+ from those labelled +contrast+ (or from
# every other labelled sample when no contrast is given).
#
# The :features option (removed from +options+ via Misc.process_options)
# lists the rows to use; it defaults to an empty list. The R script loads
# the matrix file, keeps the requested features that exist in it, tags
# main samples as Class 1 and the rest as 0, fits randomForest and writes
# rf$importance to a temporary file, which is then opened as a :single
# TSV cast to floats.
def random_forest_importance(main, contrast = nil, field = nil, options = {})
  features = Misc.process_options(options, :features) || []

  path = Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'random_forest_importance')},
    {:main => main, :contrast => contrast, :field => field, :features => features}
  )

  Persist.persist(data, :tsv, :file => path, :no_load => false, :check => [matrix_file]) do
    every_sample     = labels.keys
    main_samples     = find_samples(main, field)
    contrast_samples = contrast ? find_samples(contrast, field) : every_sample - main_samples

    # Only samples actually present in the matrix can be indexed in R.
    main_samples     = remove_missing(main_samples)
    contrast_samples = remove_missing(contrast_samples)

    TmpFile.with_file do |result|
      R.run <<-EOF
library(randomForest);
orig = rbbt.tsv('#{matrix_file}');
main = c('#{main_samples * "', '"}')
contrast = c('#{contrast_samples * "', '"}')
features = c('#{features * "', '"}')

features = intersect(features, rownames(orig));
data = t(orig[features, c(main, contrast)])
data = cbind(data, Class = 0)
data[main, "Class"] = 1

rf = randomForest(factor(Class) ~ ., data, na.action = na.exclude)
rbbt.tsv.write(rf$importance, filename='#{ result }', key.field = '#{@key_field}')
      EOF

      TSV.open(result, :type => :single, :cast => :to_f)
    end
  end
end

#remove_missing(samples) ⇒ Object



65
66
67
# File 'lib/rbbt/expression_old/matrix.rb', line 65

# Keeps only the entries of +samples+ that are also in @samples
# (array intersection, so the result follows the order of @samples and
# has no duplicates).
# NOTE(review): reads @samples directly, so it must already be set.
def remove_missing(samples)
  @samples & samples
end

#sample_differences(main, contrast) ⇒ Object



84
85
86
87
88
89
90
# File 'lib/rbbt/expression_old/matrix.rb', line 84

# Persists the result of Expression.differential between the +main+ and
# +contrast+ sample lists and returns the path it is persisted at. The
# path is derived from the matrix file, both sample lists and the
# log2 / channel settings, under the 'sample_differences' directory of
# Matrix::MATRIX_DIR.
def sample_differences(main, contrast)
  path = Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'sample_differences')},
    {:main => main, :contrast => contrast, :log2 => log2, :channel => channel}
  )

  Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) {
    Expression.differential(matrix_file, main, contrast, log2, channel)
  }

  path
end

#signature_set(field, cast = nil) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/rbbt/expression_old/matrix.rb', line 107

# Persists a table with one signature column per distinct label value and
# returns the path it is persisted at.
#
# For every label value (flattened, unique, sorted) the label differences
# are computed, the +field+ column is extracted with Signature.tsv_field
# (using +cast+), renamed to the label value, and attached to a table
# keyed like the matrix file. A label whose signature fails to compute is
# logged and skipped.
def signature_set(field, cast = nil)
  path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'signature_set')}, {:field => field, :cast => cast})
  Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
    signatures = TSV.open(matrix_file, :fields => [], :type => :list, :cast => cast)
    labels.values.flatten.uniq.sort.each do |value|
      begin
        s = Signature.tsv_field(label_differences(value), field, cast)
        s.fields = [value]
        signatures.attach s
      rescue StandardError
        # Best effort per label, but only for ordinary errors: rescuing
        # Exception would also swallow Interrupt and SystemExit.
        Log.warn("Signature for #{ value } did not compute")
      end
    end
    signatures
  end
  path
end

#subsets ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/rbbt/matrix.rb', line 40

# Groups samples by factor and level, based on @labels, and memoises the
# result in @subsets.
#
# @labels may be:
# * a Path: its TSV is loaded if the file exists (otherwise no subsets);
# * a TSV: each row is a sample, each field a factor, each value a level;
# * a Hash of factor => { level => samples }, where samples is an Array
#   or a comma-separated String.
#
# Returns { factor => { level => [samples] } }. Factors with fewer than
# two levels, and levels with fewer than two samples, are left out.
#
# Raises RuntimeError when a Hash entry holds samples that are neither an
# Array nor a String.
def subsets
  @subsets ||= begin
                 subsets = {}

                 # Shared by the Path and TSV cases: one row per sample,
                 # one column per factor.
                 collect_from_tsv = lambda do |tsv|
                   factors = tsv.fields
                   tsv.through do |sample,values|
                     factors.zip(values).each do |factor,value|
                       subsets[factor] ||= {}
                       subsets[factor][value] ||= []
                       subsets[factor][value] << sample
                     end
                   end
                 end

                 case @labels
                 when Path
                   collect_from_tsv.call(@labels.tsv) if @labels.exists?
                 when TSV
                   collect_from_tsv.call(@labels)
                 when Hash
                   @labels.each do |factor,info|
                     # Keyed by this entry's own factor, so distinct
                     # factors are not merged together.
                     subsets[factor] ||= {}
                     info.each do |value, samples|
                       subsets[factor][value] = case samples
                                                when Array
                                                  samples
                                                when String
                                                  samples.split ','
                                                else
                                                  raise "Format of samples not understood: #{Misc.fingerprint samples}"
                                                end
                     end
                   end
                 end

                 clean_subsets = {}
                 subsets.each do |factor,values|
                   next if values.nil? || values.size < 2
                   values.each do |level,samples|
                     next if samples.nil? || samples.length < 2
                     clean_subsets[factor] ||= {}
                     clean_subsets[factor][level] = samples
                   end
                 end

                 clean_subsets
               end
end

#subsets=(subsets) ⇒ Object



36
37
38
# File 'lib/rbbt/matrix.rb', line 36

# Overrides the memoised subsets: the #subsets reader returns this value
# instead of deriving one from @labels (as long as it is truthy).
def subsets=(subsets)
  @subsets = subsets
end

#to_activity(factor = 2) ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/rbbt/matrix.rb', line 177

# Returns a new Matrix backed by the activity-cluster version of this
# one. The file is produced through Persist (prefix "Activity <factor>",
# under Matrix.matrix_dir.barcode) by calling #activity_cluster. The new
# matrix keeps this matrix's labels, value type, organism and subsets and
# uses "Ensembl Gene ID" as its format.
def to_activity(factor = 2)
  require 'rbbt/tsv/change_id'

  file = Persist.persist(data_file, :tsv, :prefix => "Activity #{factor}", :dir => Matrix.matrix_dir.barcode, :no_load => true) do |filename|
    activity_cluster(filename, factor)
  end

  current_subsets = self.subsets
  matrix = Matrix.new file, labels, value_type, "Ensembl Gene ID", organism
  matrix.subsets = current_subsets
  matrix
end

#to_barcode(factor = 2) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
# File 'lib/rbbt/matrix.rb', line 165

# Returns a new Matrix backed by the R-computed barcode of this one. The
# file is produced through Persist (prefix "Barcode R <factor>", under
# Matrix.matrix_dir.barcode) by calling #barcode. The new matrix keeps
# this matrix's labels, value type, organism and subsets and uses
# "Ensembl Gene ID" as its format.
def to_barcode(factor = 2)
  file = Persist.persist(data_file, :tsv, :prefix => "Barcode R #{factor}", :dir => Matrix.matrix_dir.barcode, :no_load => true) do |filename|
    barcode(filename, factor)
  end

  current_subsets = self.subsets
  matrix = Matrix.new file, labels, value_type, "Ensembl Gene ID", organism
  matrix.subsets = current_subsets
  matrix
end

#to_barcode_ruby(factor = 2) ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
# File 'lib/rbbt/matrix.rb', line 153

# Returns a new Matrix backed by the Ruby-computed barcode of this one.
# The file is produced through Persist (prefix "Barcode <factor>", under
# Matrix.matrix_dir.barcode) by calling #barcode_ruby. The new matrix
# keeps this matrix's labels, value type, organism and subsets and uses
# "Ensembl Gene ID" as its format.
def to_barcode_ruby(factor = 2)
  file = Persist.persist(data_file, :tsv, :prefix => "Barcode #{factor}", :dir => Matrix.matrix_dir.barcode, :no_load => true) do |filename|
    barcode_ruby(filename, factor)
  end

  current_subsets = self.subsets
  matrix = Matrix.new file, labels, value_type, "Ensembl Gene ID", organism
  matrix.subsets = current_subsets
  matrix
end

#to_gene(identifiers = nil) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/rbbt/matrix.rb', line 132

# Returns a new Matrix whose rows are keyed by "Ensembl Gene ID".
#
# The data file is loaded with float values and re-keyed with
# change_key; rows that map to the same gene are combined by taking the
# mean of their non-nil values. Identifier sources are gathered from
# +identifiers+, @identifiers, the data's own identifiers and the
# organism's identifiers, and are passed to change_key in reverse order.
# The result is persisted (prefix "Gene", under Matrix.matrix_dir.values).
# The new matrix keeps this matrix's labels, value type, organism and
# subsets.
def to_gene(identifiers = nil)
  require 'rbbt/tsv/change_id'

  file = Persist.persist(data_file, :tsv, :prefix => "Gene", :dir => Matrix.matrix_dir.values, :no_load => true) do
    data = data_file.tsv(:cast => :to_f)

    # Kept in a separate local so the method argument is not overwritten.
    sources = [identifiers, @identifiers, data.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

    data.change_key("Ensembl Gene ID", :identifiers => sources.reverse) do |v|
      Misc.mean(v.compact)
    end
  end

  current_subsets = self.subsets
  matrix = Matrix.new file, labels, value_type, "Ensembl Gene ID", organism
  matrix.subsets = current_subsets
  matrix
end

#tsv(to_gene = true, identifiers = nil) ⇒ Object



191
192
193
194
195
196
197
198
# File 'lib/rbbt/matrix.rb', line 191

# Loads the matrix as a persisted TSV (persist dir:
# Matrix.matrix_dir.persist, with :merge enabled).
#
# When +to_gene+ is truthy and the data file is not already keyed by
# "Ensembl Gene ID", the gene-level matrix from #to_gene is loaded
# instead (as :double); +identifiers+ is forwarded to #to_gene.
def tsv(to_gene=true, identifiers = nil)
  needs_translation = to_gene && TSV.parse_header(self.data_file).key_field != "Ensembl Gene ID"

  unless needs_translation
    return self.data_file.tsv(:persist => true, :persist_dir => Matrix.matrix_dir.persist, :merge => true)
  end

  gene_file = self.to_gene(identifiers).data_file
  gene_file.tsv(:persist => true, :persist_dir => Matrix.matrix_dir.persist, :type => :double, :merge => true)
end