Module: TreeClusters

Defined in:
lib/tree_clusters.rb,
lib/tree_clusters/version.rb

Overview

Top level namespace of the Gem.

Defined Under Namespace

Classes: AttrArray, Attrs, Clade

Constant Summary collapse

VERSION =
"0.5.0"

Instance Method Summary collapse

Instance Method Details

#all_clades(tree, metadata = nil) {|clade| ... } ⇒ Enumerator<Clade>

Given a NewickTree, return an array of all Clades in that tree.

Parameters:

Yield Parameters:

  • clade (Clade)

    a clade of the tree

Returns:

  • (Enumerator<Clade>)

    enumerator of Clade objects



127
128
129
130
131
132
133
# File 'lib/tree_clusters.rb', line 127

def all_clades tree, =nil
  return enum_for(:all_clades, tree, ) unless block_given?

  tree.clade_nodes.reverse.each do |node|
    yield Clade.new node, tree, 
  end
end

#check_ids(tree, mapping, aln) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/tree_clusters.rb', line 85

def check_ids tree, mapping, aln
  tree_ids = Set.new(NewickTree.fromFile(tree).taxa)

  mapping_ids = Set.new
  File.open(mapping, "rt").each_line.with_index do |line, idx|
    unless idx.zero?
      id, *rest = line.chomp.split

      mapping_ids << id
    end
  end

  aln_ids = Set.new
  ParseFasta::SeqFile.open(aln).each_record do |rec|
    aln_ids << rec.id
  end

  if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
    AbortIf::logger.error { "Seq IDs did not match in all input files" }

    tree_ids = tree_ids.to_a.sort
    mapping_ids = mapping_ids.to_a.sort
    aln_ids = aln_ids.to_a.sort

    AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
    AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
    AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }

    raise AbortIf::Exit
  else
    true
  end
end

#consensus(bases) ⇒ Object

Note:

Each string is upcase’d before frequencies are calculated.

Given an ary of strings, find the most common string in the ary.

Examples:

Upper case and lower case count as the same.

TreeClusters::consensus %w[a A C T] #=> "A"

Ties take the one closest to the end

TreeClusters::consensus %w[a c T t C t g] #=> "T"

Parameters:

  • bases (Array<String>)

    an array of strings



42
43
44
45
46
47
48
49
50
# File 'lib/tree_clusters.rb', line 42

def consensus bases
  bases.
    map(&:upcase).
    group_by(&:itself).
    sort_by { |_, bases| bases.count }.
    reverse.
    first.
    first
end

#low_ent_cols(leaves, leaf2attrs, entropy_cutoff) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/tree_clusters.rb', line 67

def low_ent_cols leaves, leaf2attrs, entropy_cutoff
  low_ent_cols = []
  alns = leaf2attrs.attrs leaves, :aln
  aln_cols = alns.transpose

  aln_cols.each_with_index do |aln_col, aln_col_idx|
    has_gaps = aln_col.any? { |aa| aa == "-" }
    low_entropy =
      Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

    if !has_gaps && low_entropy
      low_ent_cols << (aln_col_idx + 1)
    end
  end

  Set.new low_ent_cols
end

#read_alignment(aln_fname) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/tree_clusters.rb', line 52

def read_alignment aln_fname
  leaf2attrs = TreeClusters::Attrs.new
  aln_len = nil
  ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
    leaf2attrs[rec.id] = { aln: rec.seq.chars }

    aln_len ||= rec.seq.length

    abort_unless aln_len == rec.seq.length,
                 "Aln len mismatch for #{rec.id}"
  end

  leaf2attrs
end

#read_mapping_file(fname) ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/tree_clusters.rb', line 195

def read_mapping_file fname
  md_cat_names = nil
   = TreeClusters::Attrs.new

  File.open(fname, "rt").each_line.with_index do |line, idx|
    leaf_name, * = line.chomp.split "\t"

    if idx.zero?
      md_cat_names = 
    else
      .each_with_index do |val, val_idx|
        .add md_cat_names[val_idx], leaf_name, val
      end
    end
  end

  
end

#snazzy_clades(tree, metadata) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/tree_clusters.rb', line 135

def snazzy_clades tree, 
  snazzy_clades = {}

  clades = self.
           all_clades(tree, ).
           sort_by { |clade| clade.all_leaves.count }.
           reverse

  .each do |md_cat, leaf2mdtag|
    already_checked = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      assert clade.all_leaves.count > 1,
             "A clade cannot also be a leaf"

      unless clade.all_leaves.all? do |leaf|
               already_checked.include? leaf
             end
        md_tags = clade.all_leaves.map do |leaf|
          assert leaf2mdtag.has_key?(leaf),
                 "leaf #{leaf} is missing from leaf2mdtag ht"

          leaf2mdtag[leaf]
        end

        # this clade is mono-phyletic w.r.t. this metadata category.
        if md_tags.uniq.count == 1
          clade.all_leaves.each do |leaf|
            already_checked << leaf
          end

          assert !single_tag_clades.has_key?(clade),
                 "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

          single_tag_clades[clade] = md_tags.first
        end
      end
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = tree.taxa - clade.all_leaves

      non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
        [leaf, leaf2mdtag[leaf]]
      end.select { |ary| ary.last == md_tag }

      if non_clade_leaves_with_this_md_tag.count.zero?
        if snazzy_clades.has_key? clade
          snazzy_clades[clade][md_cat] = md_tag
        else
          snazzy_clades[clade] = { md_cat => md_tag }
        end
      end
    end
  end

  snazzy_clades
end