Module: TreeClusters

Defined in:
lib/tree_clusters.rb,
lib/tree_clusters/attrs.rb,
lib/tree_clusters/clade.rb,
lib/tree_clusters/version.rb,
lib/tree_clusters/attr_array.rb

Overview

Top level namespace of the Gem.

Defined Under Namespace

Classes: AttrArray, Attrs, Clade

Constant Summary collapse

VERSION =
"0.8.0"

Instance Method Summary collapse

Instance Method Details

#all_clades(tree, metadata = nil) {|clade| ... } ⇒ Enumerator<Clade>

Given a NewickTree, return an array of all Clades in that tree.

Parameters:

Yield Parameters:

  • clade (Clade)

    a clade of the tree

Returns:

  • (Enumerator<Clade>)

    enumerator of Clade objects



155
156
157
158
159
160
161
# File 'lib/tree_clusters.rb', line 155

def all_clades tree, =nil
  return enum_for(:all_clades, tree, ) unless block_given?

  tree.clade_nodes.reverse.each do |node|
    yield Clade.new node, tree, 
  end
end

#check_ids(tree, mapping, aln) ⇒ Object

Note:

If there are quoted names in the tree file, they are unquoted first.



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/tree_clusters.rb', line 113

def check_ids tree, mapping, aln
  tree_ids = Set.new(NewickTree.fromFile(tree).unquoted_taxa)

  mapping_ids = Set.new
  File.open(mapping, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      id, *rest = line.chomp.split

      mapping_ids << id
    end
  end

  aln_ids = Set.new
  ParseFasta::SeqFile.open(aln).each_record do |rec|
    aln_ids << rec.id
  end

  if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
    AbortIf::logger.error { "Seq IDs did not match in all input files" }

    tree_ids = tree_ids.to_a.sort
    mapping_ids = mapping_ids.to_a.sort
    aln_ids = aln_ids.to_a.sort

    AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
    AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
    AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }

    raise AbortIf::Exit
  else
    true
  end
end

#consensus(bases) ⇒ Object

Note:

Each string is upcase’d before frequencies are calculated.

Given an ary of strings, find the most common string in the ary.

Examples:

Upper case and lower case count as the same.

TreeClusters::consensus %w[a A C T] #=> "A"

Ties take the one closest to the end

TreeClusters::consensus %w[a c T t C t g] #=> "T"

Parameters:

  • bases (Array<String>)

    an array of strings



49
50
51
52
53
54
55
56
57
# File 'lib/tree_clusters.rb', line 49

def consensus bases
  bases.
    map(&:upcase).
    group_by(&:itself).
    sort_by { |_, bases| bases.count }.
    reverse.
    first.
    first
end

#low_ent_cols(leaves, leaf2attrs, entropy_cutoff) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/tree_clusters.rb', line 74

def low_ent_cols leaves, leaf2attrs, entropy_cutoff
  low_ent_cols = []
  alns = leaf2attrs.attrs leaves, :aln
  aln_cols = alns.transpose

  aln_cols.each_with_index do |aln_col, aln_col_idx|
    has_gaps = aln_col.any? { |aa| aa == "-" }
    low_entropy =
      Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

    if !has_gaps && low_entropy
      low_ent_cols << (aln_col_idx + 1)
    end
  end

  Set.new low_ent_cols
end

#low_ent_cols_with_bases(leaves, leaf2attrs, entropy_cutoff) ⇒ Object

Like low_ent_cols method but also returns the bases at the positions.



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/tree_clusters.rb', line 93

def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
  low_ent_cols = []
  alns = leaf2attrs.attrs leaves, :aln
  aln_cols = alns.transpose

  aln_cols.each_with_index do |aln_col, aln_col_idx|
    has_gaps = aln_col.any? { |aa| aa == "-" }
    low_entropy =
        Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

    if !has_gaps && low_entropy
      low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
    end
  end

  Set.new low_ent_cols
end

#read_alignment(aln_fname) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/tree_clusters.rb', line 59

def read_alignment aln_fname
  leaf2attrs = TreeClusters::Attrs.new
  aln_len = nil
  ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
    leaf2attrs[rec.id] = { aln: rec.seq.chars }

    aln_len ||= rec.seq.length

    abort_unless aln_len == rec.seq.length,
                 "Aln len mismatch for #{rec.id}"
  end

  leaf2attrs
end

#read_attrs_file(fname) ⇒ Object



309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/tree_clusters.rb', line 309

def read_attrs_file fname

  attr_names = Set.new
  File.open(fname, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      _, attr_name, _ = line.chomp.split "\t"

      attr_names << attr_name
    end
  end

  attr_names = attr_names.to_a.sort

  attrs = TreeClusters::Attrs.new

  File.open(fname, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      leaf, attr_name, attr_val = line.chomp.split "\t"

      if attrs.has_key? leaf
        if attrs[leaf].has_key? attr_name
          attrs[leaf][attr_name] << attr_val
        else
          attrs[leaf][attr_name] = Set.new([attr_val])
        end
      else
        attrs[leaf] = {}

        attr_names.each do |name|
          attrs[leaf][name] = Set.new
        end
        attrs[leaf][attr_name] << attr_val
      end
    end
  end

    [attr_names, attrs]
end

#read_mapping_file(fname) ⇒ Object



290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/tree_clusters.rb', line 290

def read_mapping_file fname
  md_cat_names = nil
   = TreeClusters::Attrs.new

  File.open(fname, "rt").each_line.with_index do |line, idx|
    leaf_name, * = line.chomp.split "\t"

    if idx.zero?
      md_cat_names = 
    else
      .each_with_index do |val, val_idx|
        .add md_cat_names[val_idx], leaf_name, val
      end
    end
  end

  
end

#snazzy_clades(tree, metadata) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/tree_clusters.rb', line 163

def snazzy_clades tree, 
  snazzy_clades = {}

  clades = self.
           all_clades(tree, ).
           sort_by { |clade| clade.all_leaves.count }.
           reverse

  .each do |md_cat, leaf2mdtag|
    already_checked = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      assert clade.all_leaves.count > 1,
             "A clade cannot also be a leaf"

      unless clade.all_leaves.all? do |leaf|
               already_checked.include? leaf
             end
        md_tags = clade.all_leaves.map do |leaf|
          assert leaf2mdtag.has_key?(leaf),
                 "leaf #{leaf} is missing from leaf2mdtag ht"

          leaf2mdtag[leaf]
        end

        # this clade is mono-phyletic w.r.t. this metadata category.
        if md_tags.uniq.count == 1
          clade.all_leaves.each do |leaf|
            already_checked << leaf
          end

          assert !single_tag_clades.has_key?(clade),
                 "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

          single_tag_clades[clade] = md_tags.first
        end
      end
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = tree.unquoted_taxa - clade.all_leaves

      non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
        [leaf, leaf2mdtag[leaf]]
      end.select { |ary| ary.last == md_tag }

      if non_clade_leaves_with_this_md_tag.count.zero?
        if snazzy_clades.has_key? clade
          snazzy_clades[clade][md_cat] = md_tag
        else
          snazzy_clades[clade] = { md_cat => md_tag }
        end
      end
    end
  end

  snazzy_clades
end

#snazzy_info(tree, metadata) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/tree_clusters.rb', line 223

def snazzy_info tree, 
  snazzy_info = {}

  clades = self.
           all_clades(tree, ).
           sort_by { |clade| clade.all_leaves.count }.
           reverse

  # Non snazzy clades have a value of nil, so set all to nil and the
  # snazzy ones will be overwritten.
  clades.each do |clade|
    snazzy_info[clade] = nil
  end

  .each do |md_cat, leaf2mdtag|
    already_checked = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      assert clade.all_leaves.count > 1,
             "A clade cannot also be a leaf"

      unless clade.all_leaves.all? do |leaf|
               already_checked.include? leaf
             end
        md_tags = clade.all_leaves.map do |leaf|
          assert leaf2mdtag.has_key?(leaf),
                 "leaf #{leaf} is missing from leaf2mdtag ht"

          leaf2mdtag[leaf]
        end

        # this clade is mono-phyletic w.r.t. this metadata category.
        if md_tags.uniq.count == 1
          clade.all_leaves.each do |leaf|
            already_checked << leaf
          end

          assert !single_tag_clades.has_key?(clade),
                 "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

          single_tag_clades[clade] = md_tags.first
        end
      end
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = tree.unquoted_taxa - clade.all_leaves

      non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
        [leaf, leaf2mdtag[leaf]]
      end.select { |ary| ary.last == md_tag }

      is_snazzy_clade = non_clade_leaves_with_this_md_tag.count.zero?
      if is_snazzy_clade
        if !snazzy_info[clade].nil?
          snazzy_info[clade][md_cat] = md_tag
        else
          snazzy_info[clade] = { md_cat => md_tag }
        end
      end
    end
  end

  snazzy_info
end