Class: Bio::MAF::Tiler

Inherits:
Object
  • Object
show all
Defined in:
lib/bio/maf/tiler.rb

Overview

Tiles a given genomic interval. Inspired by: lib/bx/align/tools/tile.py in bx-python

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Tiler

Returns a new instance of Tiler.



35
36
37
38
39
# File 'lib/bio/maf/tiler.rb', line 35

# Set up a new Tiler with its defaults: '*' as the fill character, an empty
# species map, and removal of absent species enabled.
def initialize
  self.fill_char = '*'
  @species_map = {}
  self.remove_absent_species = true
end

Instance Attribute Details

#fill_char ⇒ String

The character used to fill regions where no sequence data is available for a particular species. Defaults to *.

Returns:

  • (String)


31
32
33
# File 'lib/bio/maf/tiler.rb', line 31

# Reader for the fill character: the string written into the output wherever
# no sequence data is available for a species. The constructor assigns '*'.
#
# @return [String] the current fill character
def fill_char
  @fill_char
end

#index ⇒ Object

Returns the value of attribute index.



11
12
13
# File 'lib/bio/maf/tiler.rb', line 11

# Reader for the index used to locate alignment blocks. #tile calls
# `index.find([interval], parser)` on it.
#
# @return [Object] the configured index, or nil if none has been assigned
def index
  @index
end

#interval ⇒ Object

GenomicInterval



15
16
17
# File 'lib/bio/maf/tiler.rb', line 15

# Reader for the genomic interval to tile (documented as a GenomicInterval).
# #tile reads `length`, `zero_start` and `zero_end` from it.
#
# @return [Object] the configured interval, or nil if none has been assigned
def interval
  @interval
end

#parser ⇒ Object

Returns the value of attribute parser.



12
13
14
# File 'lib/bio/maf/tiler.rb', line 12

# Reader for the MAF parser. #tile writes `sequence_filter[:only_species]`
# and `opts[:remove_gaps]` on it and then passes it to `index.find`.
#
# @return [Object] the configured parser, or nil if none has been assigned
def parser
  @parser
end

#reference ⇒ Object

Returns the value of attribute reference.



13
14
15
# File 'lib/bio/maf/tiler.rb', line 13

# Reader for the optional reference sequence source. #ref_data accepts either
# an object responding to `read_interval` or a String, and raises for
# anything else; when this is nil, #tile writes 'N' for uncovered positions.
#
# @return [Object, nil] the configured reference source
def reference
  @reference
end

#remove_absent_species ⇒ Object

Returns the value of attribute remove_absent_species.



33
34
35
# File 'lib/bio/maf/tiler.rb', line 33

# Reader for the flag controlling whether #tile replaces a species' tiled
# text with nil when that text contains nothing but fill characters.
# The constructor assigns true.
#
# @return [Object] the flag value; #tile treats it as a boolean
def remove_absent_species
  @remove_absent_species
end

#species ⇒ Array<String>

The species of interest to extract from the MAF file. Will be set as a Parser#sequence_filter for parsing. Defaults to the keys of #species_map.

Returns:

  • (Array<String>)


22
23
24
# File 'lib/bio/maf/tiler.rb', line 22

# Reader for the species of interest. This returns only the explicitly
# assigned value; the fallback to the keys of #species_map is applied by
# #species_to_use, not here.
#
# @return [Array<String>, nil] the assigned species list, or nil if unset
def species
  @species
end

#species_map ⇒ Hash

A hash mapping species to their desired output names.

Returns:

  • (Hash)


27
28
29
# File 'lib/bio/maf/tiler.rb', line 27

# Reader for the hash mapping species to their desired output names.
# The constructor assigns an empty hash; #species_for_output falls back to
# the species identifier itself when a species has no entry here.
#
# @return [Hash] species => output name
def species_map
  @species_map
end

Instance Method Details

#build_bio_alignment ⇒ Bio::BioAlignment::Alignment

Tile sequences to build a new Alignment object. This will have one Sequence per entry in #species or #species_map, in the same order. Each sequence will have an id given by #species_map or, if none is present, the identifier from #species.

Returns:

  • (Bio::BioAlignment::Alignment)


181
182
183
184
185
# File 'lib/bio/maf/tiler.rb', line 181

# Tile the sequences and wrap them in a new alignment object.
#
# Takes the [output name, text] pairs from #output_text and passes the texts
# and the names, in matching order, to Bio::BioAlignment::Alignment.new.
#
# @return [Bio::BioAlignment::Alignment]
def build_bio_alignment
  pairs = output_text.to_a
  sequences = pairs.map { |_id, seq| seq }
  ids = pairs.map { |id, _seq| id }
  Bio::BioAlignment::Alignment.new(sequences, ids)
end

#non_fill_re ⇒ Object



163
164
165
166
# File 'lib/bio/maf/tiler.rb', line 163

# Build a regexp matching any single character that is not part of
# #fill_char. The fill character is escaped so that regexp metacharacters
# such as '*' are taken literally inside the character class.
#
# @return [Regexp]
def non_fill_re
  /[^#{Regexp.escape(fill_char)}]/
end

#output_text ⇒ Object



168
169
170
# File 'lib/bio/maf/tiler.rb', line 168

# Pair each output name from #species_for_output with its tiled text from
# #tile, dropping the pairs whose text is nil.
#
# @return [Array<Array>] [output name, text] pairs
def output_text
  names = species_for_output
  tiled = tile
  names.zip(tiled).reject { |_name, seq| seq.nil? }
end

#ref_data(range) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/bio/maf/tiler.rb', line 73

# Fetch reference sequence data for the given range from #reference.
#
# An object responding to `read_interval` is asked for
# `read_interval(range.begin, range.end)`; a String is sliced with the range.
#
# @param range [Range] the positions to fetch
# @return [Object, nil] the data returned by the reference source, or nil
#   when no reference is set
# @raise [RuntimeError] if the reference is neither of the supported kinds
def ref_data(range)
  source = reference
  return nil unless source

  if source.respond_to?(:read_interval)
    source.read_interval(range.begin, range.end)
  elsif source.is_a?(String)
    source.slice(range)
  else
    raise "Unhandled reference data source: #{source}"
  end
end

#runs(mask) {|cur_start...mask.size, cur| ... } ⇒ Object

Yields:

  • (cur_start...mask.size, cur)


199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/bio/maf/tiler.rb', line 199

# Split +mask+ into maximal runs of consecutive identical elements and yield
# each run in order.
#
# Elements are compared by identity (`equal?`), so two distinct objects that
# are merely `==` start separate runs. An empty mask yields nothing. A run
# whose value is nil or false is reported like any other, because the open
# run is tracked by its start index rather than by the value's truthiness.
#
# @param mask [Array] the elements to scan
# @yieldparam range [Range] end-exclusive index range of the run in +mask+
# @yieldparam value [Object] the element shared by every position in the run
# @return [Object, nil] the result of the final yield, or nil for an empty mask
def runs(mask)
  # Without this guard the trailing yield would emit a bogus (nil...0, nil) run.
  return if mask.empty?

  cur = nil
  cur_start = nil
  mask.each_with_index do |obj, i|
    next if cur_start && cur.equal?(obj)

    # Close the previous run, if one is open, before starting a new one.
    yield(cur_start...i, cur) if cur_start
    cur = obj
    cur_start = i
  end
  yield(cur_start...mask.size, cur)
end

#species_for_output ⇒ Object



91
92
93
# File 'lib/bio/maf/tiler.rb', line 91

# Translate each species in #species_to_use into its output name: the entry
# from #species_map when there is one, otherwise the species itself.
#
# @return [Array] output names, in the order of #species_to_use
def species_for_output
  species_to_use.map do |name|
    display = species_map[name]
    display || name
  end
end

#species_to_use ⇒ Object



87
88
89
# File 'lib/bio/maf/tiler.rb', line 87

# The species list that drives tiling: #species when it has been set,
# otherwise the keys of #species_map.
#
# @return [Array] the species to tile, in order
def species_to_use
  explicit = species
  return explicit if explicit

  species_map.keys
end

#tile ⇒ Array<String>

Return an array of tiled sequence data, in the order given by #species_to_use.

Returns:

  • (Array<String>)


98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/bio/maf/tiler.rb', line 98

# Return an array of tiled sequence data, in the order given by
# #species_to_use.
#
# Outline:
# 1. Configure the parser and fetch the blocks covering #interval from
#    #index, sorted by ascending score.
# 2. Paint each block onto a per-position mask in that order, so a block
#    sorted later overwrites earlier ones where they overlap.
# 3. Walk the mask run by run (see #runs), appending text for every species:
#    reference data or 'N' plus fill characters where no block covers the
#    run, alignment text or fill characters where one does.
# 4. If #remove_absent_species is set, replace any entry that contains only
#    fill characters with nil.
#
# Side effects: overwrites `parser.sequence_filter[:only_species]` and
# `parser.opts[:remove_gaps]`.
#
# NOTE(review): the first entry of #species_to_use is treated as the
# reference row (it alone receives reference data / 'N') -- confirm that
# callers always list the reference species first.
#
# @return [Array<String, nil>] one entry per species in #species_to_use;
#   an entry is nil only when #remove_absent_species is set and the species
#   had nothing but fill characters
def tile
  parser.sequence_filter[:only_species] = species_to_use
  parser.opts[:remove_gaps] = true
  LOG.debug { "finding blocks covering interval #{interval}." }
  # Ascending sort: blocks later in the list win overlaps when painted below.
  blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
  # One slot per interval position; :ref marks "not covered by any block".
  mask = Array.new(interval.length, :ref)
  i_start = interval.zero_start
  i_end = interval.zero_end
  if reference
    LOG.debug { "using a #{reference.class} reference." }
    # ref_region stays nil when no reference is configured.
    ref_region = ref_data(i_start...i_end)
  end
  LOG.debug "tiling #{blocks.count} blocks."
  blocks.each do |block|
    ref = block.ref_seq
    LOG.debug { "tiling with block #{ref.start}-#{ref.end}" }
    # Clip the block's reference extent to the interval, then convert to
    # mask-relative offsets by subtracting i_start.
    slice_start = [i_start, ref.start].max
    slice_end = [i_end, ref.end].min
    mask.fill(block,
              (slice_start - i_start)...(slice_end - i_start))
  end
  # One output buffer per species, appended to in place below.
  text = []
  species_to_use.each { |s| text << '' }
  # Holds the same String objects as text[1..], so appending here also
  # updates the corresponding entries of text.
  nonref_text = text[1...text.size]
  runs(mask) do |range, block|
    # range is mask-relative; g_range is the same span offset by i_start.
    g_range = (range.begin + i_start)...(range.end + i_start)
    if block == :ref
      # not covered by an alignment block
      # use the reference sequence if given, otherwise 'N'
      range_size = range.end - range.begin
      # ref_region was requested for i_start...i_end, so it is indexed with
      # the mask-relative range -- presumably it is string-like with offset 0
      # at i_start; verify for read_interval-based references.
      text[0] << if ref_region
                   ref_region.slice(range)
                 else
                   'N' * range_size
                 end
      fill_text = fill_char * range_size
      nonref_text.each { |t| t << fill_text }
    else
      # covered by an alignment block
      # presumably text_range maps the genomic range to the matching span of
      # the block's alignment text -- TODO confirm
      t_range = block.ref_seq.text_range(g_range)
      species_to_use.each_with_index do |species, i|
        sp_text = text[i]
        # A sequence matches on either its source or its species.
        seq = block.sequences.find { |s| s.source == species || s.species == species }
        if seq
          # got alignment text
          sp_text << seq.text.slice(t_range)
        else
          # no alignment for this one here, use the fill char
          sp_text << fill_char * (t_range.end - t_range.begin)
        end
      end
    end
  end
  if remove_absent_species
    non_fill = non_fill_re
    LOG.debug { "searching for non-fill characters with #{non_fill}" }
    # Entries with no non-fill character are set to nil; positions are kept
    # so the array stays aligned with species_to_use.
    text.each_with_index do |seq, i|
      unless non_fill.match(seq)
        text[i] = nil
      end
    end
  end
  text
end

#write_fasta(f) ⇒ Object

Write a FASTA representation of the tiled sequences to the given output stream.

Parameters:

  • f (#puts)

    the output stream to write the FASTA data to.



192
193
194
195
196
197
# File 'lib/bio/maf/tiler.rb', line 192

# Write a FASTA representation of the tiled sequences to the given output
# stream: for each pair from #output_text, a ">name" header line followed by
# the sequence text.
#
# @param f [#puts] the output stream to write the FASTA data to
# @return [Object] whatever `output_text.each` returns
def write_fasta(f)
  output_text.each do |entry|
    header, sequence = entry
    f.puts ">#{header}"
    f.puts sequence
  end
end