Class: Bio::MAF::Tiler

Inherits:
Object
  • Object
show all
Defined in:
lib/bio/maf/tiler.rb

Overview

Tiles a given genomic interval. Inspired by: lib/bx/align/tools/tile.py in bx-python

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Tiler

Returns a new instance of Tiler



33
34
35
36
# File 'lib/bio/maf/tiler.rb', line 33

# Sets up a tiler with an empty species map and '*' as the fill character.
#
# @return [Tiler] a new instance of Tiler
def initialize
  @species_map = {}
  # Assigned through the fill_char= writer rather than the ivar directly.
  self.fill_char = '*'
end

Instance Attribute Details

#fill_char ⇒ String

The character used to fill regions where no sequence data is available for a particular species. Defaults to *.

Returns:

  • (String)


31
32
33
# File 'lib/bio/maf/tiler.rb', line 31

# The character used to fill regions where no sequence data is available
# for a particular species. The constructor sets it to '*'.
#
# @return [String]
def fill_char
  @fill_char
end

#index ⇒ Object

Returns the value of attribute index



11
12
13
# File 'lib/bio/maf/tiler.rb', line 11

# Reader for the +index+ attribute. #tile calls
# +index.find([interval], parser)+ on it to obtain the alignment blocks
# for the interval.
#
# @return [Object] the value of @index
def index
  @index
end

#interval ⇒ Object

GenomicInterval



15
16
17
# File 'lib/bio/maf/tiler.rb', line 15

# The GenomicInterval to be tiled. #tile reads its +zero_start+,
# +zero_end+ and +length+.
#
# @return [Object] the value of @interval
def interval
  @interval
end

#parser ⇒ Object

Returns the value of attribute parser



12
13
14
# File 'lib/bio/maf/tiler.rb', line 12

# Reader for the +parser+ attribute. #tile sets
# +parser.sequence_filter[:only_species]+ on it and passes it to
# +index.find+.
#
# @return [Object] the value of @parser
def parser
  @parser
end

#reference ⇒ Object

Returns the value of attribute reference



13
14
15
# File 'lib/bio/maf/tiler.rb', line 13

# Reader for the +reference+ attribute, the optional source of reference
# sequence data. #ref_data accepts either an object responding to
# +read_interval+ or a String; when this is unset, #tile writes 'N' for
# positions not covered by any alignment block.
#
# @return [Object, nil] the value of @reference
def reference
  @reference
end

#species ⇒ Array<String>

The species of interest to extract from the MAF file. Will be set as a Parser#sequence_filter for parsing. Defaults to the keys of #species_map.

Returns:

  • (Array<String>)


22
23
24
# File 'lib/bio/maf/tiler.rb', line 22

# The species of interest to extract from the MAF file. When this is
# unset, #species_to_use falls back to the keys of #species_map; #tile
# assigns that list to the parser's +sequence_filter[:only_species]+.
#
# @return [Array<String>]
def species
  @species
end

#species_map ⇒ Hash

A hash mapping species to their desired output names.

Returns:

  • (Hash)


27
28
29
# File 'lib/bio/maf/tiler.rb', line 27

# A hash mapping species to their desired output names. Species without
# an entry keep their own identifier in #species_for_output.
#
# @return [Hash]
def species_map
  @species_map
end

Instance Method Details

#build_bio_alignment ⇒ Bio::BioAlignment::Alignment

Tile sequences to build a new Alignment object. This will have one Sequence per entry in #species or #species_map, in the same order. Each sequence will have an id given by #species_map or, if none is present, the identifier from #species.

Returns:

  • (Bio::BioAlignment::Alignment)


156
157
158
# File 'lib/bio/maf/tiler.rb', line 156

# Tile the interval and wrap the result in a new Alignment object.
#
# The alignment is constructed from the tiled sequence strings and the
# output names from #species_for_output, which are in the same order.
#
# @return [Bio::BioAlignment::Alignment]
def build_bio_alignment
  tiled_text = tile
  output_ids = species_for_output
  Bio::BioAlignment::Alignment.new(tiled_text, output_ids)
end

#ref_data(range) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/bio/maf/tiler.rb', line 70

# Fetch reference sequence data for the given range.
#
# A reference that responds to +read_interval+ is asked for
# +range.begin+ to +range.end+; a String reference is sliced with the
# range directly.
#
# @param range [Range] the range of reference positions wanted
# @return [Object, nil] the data from the reference, or nil when no
#   reference is set
# @raise [RuntimeError] if the reference is neither of the supported kinds
def ref_data(range)
  source = reference
  return nil unless source
  return source.read_interval(range.begin, range.end) if source.respond_to?(:read_interval)
  return source.slice(range) if source.is_a?(String)

  raise "Unhandled reference data source: #{source}"
end

#runs(mask) {|cur_start...mask.size, cur| ... } ⇒ Object

Yields:

  • (cur_start...mask.size, cur)


172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/bio/maf/tiler.rb', line 172

# Yield each run of consecutive identical objects in +mask+.
#
# Elements are compared by identity (+equal?+), not by value. An empty
# mask yields nothing; previously it fell through to the final yield with
# a nil start index.
#
# NOTE(review): a run is only flushed mid-array when its object is truthy,
# so the mask is presumably expected to contain no nil/false entries.
#
# @param mask [Array] per-position markers
# @yield [range, obj] an exclusive index range into +mask+ and the object
#   occupying every position in it
# @return [Object, nil] the value of the final yield, or nil for an empty mask
def runs(mask)
  return if mask.empty?

  cur = nil
  cur_start = nil
  mask.each_with_index do |obj, i|
    next if cur.equal?(obj)

    # A new object starts here: flush the run that just ended, if any.
    yield(cur_start...i, cur) if cur
    cur = obj
    cur_start = i
  end
  # The last run extends to the end of the mask.
  yield(cur_start...mask.size, cur)
end

#species_for_output ⇒ Object



88
89
90
# File 'lib/bio/maf/tiler.rb', line 88

# Output names for the species being tiled, in #species_to_use order.
#
# Each species is replaced by its entry in #species_map when one exists,
# and is otherwise passed through unchanged.
#
# @return [Array] the names to use in output
def species_for_output
  species_to_use.map do |name|
    mapped = species_map[name]
    mapped || name
  end
end

#species_to_use ⇒ Object



84
85
86
# File 'lib/bio/maf/tiler.rb', line 84

# The species list that drives tiling: #species when it is set, otherwise
# the keys of #species_map.
#
# @return [Array] the species to tile
def species_to_use
  explicit = species
  return explicit if explicit

  species_map.keys
end

#tile ⇒ Array<String>

Return an array of tiled sequence data, in the order given by #species_to_use.

Returns:

  • (Array<String>)


95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/bio/maf/tiler.rb', line 95

# Return an array of tiled sequence data, in the order given by
# #species_to_use.
#
# Every position of the interval is assigned either to an alignment block
# returned by the index or to the :ref placeholder. Blocks are applied in
# ascending score order, so where blocks overlap the highest-scoring one
# wins. Positions with no block take the reference data for the first
# species (or 'N' when no reference is set) and #fill_char for the others.
# Within a block, a species with no sequence is filled with #fill_char.
#
# @return [Array<String>] one string per entry of #species_to_use
def tile
  parser.sequence_filter[:only_species] = species_to_use
  # TODO: remove gaps
  blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
  mask = Array.new(interval.length, :ref)
  i_start = interval.zero_start
  i_end = interval.zero_end
  ref_region = ref_data(i_start...i_end) if reference
  blocks.each do |block|
    ref = block.ref_seq
    slice_start = [i_start, ref.start].max
    slice_end = [i_end, ref.end].min
    # Skip blocks that do not overlap the interval. Without this, a block
    # ending before the interval start produces a negative range end, which
    # Array#fill counts from the end of the mask, marking positions the
    # block does not cover.
    next unless slice_start < slice_end

    mask.fill(block,
              (slice_start - i_start)...(slice_end - i_start))
  end
  # One independent output buffer per species; the first is the reference.
  text = species_to_use.map { '' }
  nonref_text = text.drop(1)
  runs(mask) do |range, block|
    # Translate mask offsets back into genomic coordinates.
    g_range = (range.begin + i_start)...(range.end + i_start)
    if block == :ref
      # not covered by an alignment block
      # use the reference sequence if given, otherwise 'N'
      range_size = range.end - range.begin
      text[0] << if ref_region
                   ref_region.slice(range)
                 else
                   'N' * range_size
                 end
      fill_text = fill_char * range_size
      nonref_text.each { |t| t << fill_text }
    else
      # covered by an alignment block
      t_range = block.ref_seq.text_range(g_range)
      species_to_use.each_with_index do |sp, i|
        sp_text = text[i]
        seq = block.sequences.find { |s| s.source == sp || s.species == sp }
        if seq
          # got alignment text
          sp_text << seq.text.slice(t_range)
        else
          # no alignment for this one here, use the fill char
          sp_text << fill_char * (t_range.end - t_range.begin)
        end
      end
    end
  end
  text
end

#write_fasta(f) ⇒ Object

Write a FASTA representation of the tiled sequences to the given output stream.

Parameters:

  • f (#puts)

    the output stream to write the FASTA data to.



165
166
167
168
169
170
# File 'lib/bio/maf/tiler.rb', line 165

# Write a FASTA representation of the tiled sequences to the given output
# stream: a ">name" header line followed by the sequence text, for each
# output species in turn.
#
# @param f [#puts] the output stream to write the FASTA data to
# @return [nil]
def write_fasta(f)
  names = species_for_output
  sequences = tile
  names.zip(sequences).each do |name, seq|
    f.puts ">#{name}"
    f.puts seq
  end
  nil
end