Class: Bio::MAF::KyotoIndex

Inherits:

Object

Object
Bio::MAF::KyotoIndex

show all

Includes:: KVHelpers

Defined in:: lib/bio/maf/index.rb

Constant Summary collapse

FORMAT_VERSION_KEY =

'bio-maf:index-format-version'

FORMAT_VERSION =

REF_SEQ_KEY =

'bio-maf:reference-sequence'

MAX_SPECIES =

Constants included from KVHelpers

Bio::MAF::KVHelpers::CHROM_BIN_PREFIX_FMT, Bio::MAF::KVHelpers::KEY_FMT, Bio::MAF::KVHelpers::KEY_SCAN_FMT, Bio::MAF::KVHelpers::VAL_FMT, Bio::MAF::KVHelpers::VAL_IDX_OFFSET_FMT, Bio::MAF::KVHelpers::VAL_N_SEQ_FMT, Bio::MAF::KVHelpers::VAL_SPECIES_FMT, Bio::MAF::KVHelpers::VAL_TEXT_SIZE_FMT

Instance Attribute Summary collapse

#db ⇒ Object readonly
Returns the value of attribute db.
#index_sequences ⇒ Object
Returns the value of attribute index_sequences.
#ref_only ⇒ Object readonly
Returns the value of attribute ref_only.
#ref_seq ⇒ Object
Returns the value of attribute ref_seq.
#species ⇒ Object readonly
Returns the value of attribute species.
#species_max_id ⇒ Object readonly
Returns the value of attribute species_max_id.

Class Method Summary collapse

.build(parser, path, ref_only = true) ⇒ KyotoIndex
Build a new index from the MAF file being parsed by parser, and store it in path.
.open(path) ⇒ KyotoIndex
Open an existing index for reading.

Instance Method Summary collapse

#build(parser, ref_only = true) ⇒ Object
#build_block_value(block) ⇒ Object
#close ⇒ Object
Close the underlying Kyoto Cabinet database handle.
#dump(stream = $stdout) ⇒ Object
#entries_for(block) ⇒ Object
#fetch_list(intervals, filter_spec = {}) ⇒ Object
Build a fetch list of alignment blocks to read, given an array of Bio::GenomicInterval objects.
#find(intervals, parser, filter = {}) {|block| ... } ⇒ Enumerable<Block>
Find all alignment blocks in the genomic regions in the list of Bio::GenomicInterval objects, and parse them with the given parser.
#index_blocks(blocks) ⇒ Object
#initialize(path, db_arg = nil) ⇒ KyotoIndex constructor private
KyotoIndex Internals.
#load_index_sequences ⇒ Object
#load_species ⇒ Object
#make_scan_worker(jobs, completed) ⇒ Object
#overlaps?(gi, i_start, i_end) ⇒ Boolean
#reopen ⇒ Object
Reopen the same DB handle read-only.
#scan_bin(cur, chrom_id, bin, bin_intervals, filters) ⇒ Object
#scan_bins(chrom_id, bin_intervals, filters) ⇒ Object
Scan the index for blocks matching the given bins and intervals.
#scan_bins_parallel(chrom_id, bin_intervals, filters) ⇒ Object
#seq_id_for(name) ⇒ Object
#slice(interval, parser, filter = {}) ⇒ Object
#species_id_for_seq(seq) ⇒ Object
#with_profiling ⇒ Object

Methods included from KVHelpers

bin_start_prefix, extract_index_offset, extract_n_sequences, extract_species_vec, extract_text_size, unpack_key

Constructor Details

#initialize(path, db_arg = nil) ⇒ `KyotoIndex`

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

KyotoIndex Internals

# File 'lib/bio/maf/index.rb', line 426

def initialize(path, db_arg=nil)
  @species = {}
  @species_max_id = -1
  @max_sid = -1
  if db_arg || ((path.size > 1) and File.exist?(path))
    mode = KyotoCabinet::DB::OREADER
  else
    mode = KyotoCabinet::DB::OWRITER | KyotoCabinet::DB::OCREATE
  end
  @db = db_arg || KyotoCabinet::DB.new
  @path = path
  unless db_arg || db.open(path.to_s, mode)
    raise "Could not open DB file!"
  end
  if mode == KyotoCabinet::DB::OREADER
    self.ref_seq = db[REF_SEQ_KEY]
    load_index_sequences
    load_species
  end
end

Instance Attribute Details

#db ⇒ `Object` (readonly)

Returns the value of attribute db.



265
266
267

# File 'lib/bio/maf/index.rb', line 265

def db
  @db
end

#index_sequences ⇒ `Object`

Returns the value of attribute index_sequences.



266
267
268

# File 'lib/bio/maf/index.rb', line 266

def index_sequences
  @index_sequences
end

#ref_only ⇒ `Object` (readonly)

Returns the value of attribute ref_only.



265
266
267

# File 'lib/bio/maf/index.rb', line 265

def ref_only
  @ref_only
end

#ref_seq ⇒ `Object`

Returns the value of attribute ref_seq.



266
267
268

# File 'lib/bio/maf/index.rb', line 266

def ref_seq
  @ref_seq
end

#species ⇒ `Object` (readonly)

Returns the value of attribute species.



265
266
267

# File 'lib/bio/maf/index.rb', line 265

def species
  @species
end

#species_max_id ⇒ `Object` (readonly)

Returns the value of attribute species_max_id.



265
266
267

# File 'lib/bio/maf/index.rb', line 265

def species_max_id
  @species_max_id
end

Class Method Details

.build(parser, path, ref_only = true) ⇒ `KyotoIndex`

Build a new index from the MAF file being parsed by parser, and store it in path.

Parameters:

parser (Parser) —
MAF parser for file to index
path (String) —
path to index file to create

Returns:

(KyotoIndex)

# File 'lib/bio/maf/index.rb', line 356

def self.build(parser, path, ref_only=true)
  idx = self.new(path)
  idx.build(parser, ref_only)
  return idx
end

.open(path) ⇒ `KyotoIndex`

Open an existing index for reading.

Parameters:

path (String) —
path to existing Kyoto Cabinet index

Returns:

(KyotoIndex)



347
348
349

# File 'lib/bio/maf/index.rb', line 347

def self.open(path)
  return KyotoIndex.new(path)
end

Instance Method Details

#build(parser, ref_only = true) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 662

def build(parser, ref_only=true)
  first_block = parser.parse_block
  self.ref_seq = first_block.sequences.first.source
  @ref_only = ref_only
  db[REF_SEQ_KEY] = ref_seq
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
  @index_sequences = {}
  index_blocks([first_block])
  n = 0
  parser.each_block.each_slice(1000).each do |blocks|
    index_blocks(blocks)
    n += blocks.size
  end
  LOG.debug { "Created index for #{n} blocks and #{@index_sequences.size} sequences." }
  db.synchronize(true)
end

#build_block_value(block) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 739

def build_block_value(block)
  bits = block.sequences.collect {|s| 1 << species_id_for_seq(s.source) }
  vec = bits.reduce(0, :|)
  return [block.offset,
          block.size,
          block.text_size,
          block.sequences.size,
          vec].pack(VAL_FMT)
end

#close ⇒ `Object`

Close the underlying Kyoto Cabinet database handle.



409
410
411

# File 'lib/bio/maf/index.rb', line 409

def close
  db.close
end

#dump(stream = $stdout) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 452

def dump(stream=$stdout)
  stream.puts "KyotoIndex dump: #{@path}"
  stream.puts
  if db.count == 0
    stream.puts "Empty database!"
    return
  end
  db.cursor_process do |cur|
    stream.puts "== Metadata =="
    cur.jump('')
    while true
      k, v = cur.get(false)
      raise "unexpected end of records!" unless k
      break if k[0] == "\xff"
      stream.puts "#{k}: #{v}"
      unless cur.step
        raise "could not advance cursor!"
      end
    end
    stream.puts "== Index records =="
    while pair = cur.get(true)
      _, chr, bin, s_start, s_end = pair[0].unpack(KEY_FMT)
      offset, len, text_size, n_seq, species_vec = pair[1].unpack(VAL_FMT)
      stream.puts "#{chr} [bin #{bin}] #{s_start}:#{s_end}"
      stream.puts "  offset #{offset}, length #{len}"
      stream.puts "  text size: #{text_size}"
      stream.puts "  sequences in block: #{n_seq}"
      stream.printf("  species vector: %016x\n", species_vec)
    end
  end
end

#entries_for(block) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 749

def entries_for(block)
  unless block.ref_seq.source == @ref_seq
    raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
  end
  h = {}
  val = build_block_value(block)
  to_index = ref_only ? [block.sequences.first] : block.sequences
  to_index.each do |seq|
    seq_id = seq_id_for(seq.source)
    seq_end = seq.start + seq.size
    bin = Bio::Ucsc::UcscBin.bin_from_range(seq.start, seq_end)
    key = [255, seq_id, bin, seq.start, seq_end].pack(KEY_FMT)
    h[key] = val
  end
  return h
end

#fetch_list(intervals, filter_spec = {}) ⇒ `Object`

Build a fetch list of alignment blocks to read, given an array of Bio::GenomicInterval objects

# File 'lib/bio/maf/index.rb', line 506

def fetch_list(intervals, filter_spec={})
  filter_spec ||= {}
  filters = Filters.build(filter_spec, self)
  chrom = intervals.first.chrom
  chrom_id = index_sequences[chrom]
  unless chrom_id
    raise "chromosome #{chrom} not indexed!"
  end
  if intervals.find { |i| i.chrom != chrom }
    raise "all intervals must be for the same chromosome!"
  end
  # for each bin, build a list of the intervals to look for there
  bin_intervals = Hash.new { |h, k| h[k] = [] }
  intervals.each do |i|
    i.bin_all.each do |bin|
      bin_intervals[bin] << (i.zero_start...i.zero_end)
    end
  end
  bin_intervals.values.each do |intervals|
    intervals.sort_by! {|i| i.begin}
  end
  matches = if RUBY_PLATFORM == 'java' && bin_intervals.size > 4
              scan_bins_parallel(chrom_id, bin_intervals, filters)
            else
              scan_bins(chrom_id, bin_intervals, filters)
            end
  matches.sort_by! { |e| e[0] } # sort by offset in file
end

#find(intervals, parser, filter = {}) {|block| ... } ⇒ `Enumerable<Block>`

Find all alignment blocks in the genomic regions in the list of Bio::GenomicInterval objects, and parse them with the given parser.

An optional Hash of filters may be passed in. The following keys are used:

:with_all_species => ["sp1", "sp2", ...]

Only match alignment blocks containing all given species.
:at_least_n_sequences => n

Only match alignment blocks with at least N sequences.
:min_size => n

Only match alignment blocks with text size at least N.
:max_size => n

Only match alignment blocks with text size at most N.

Parameters:

intervals (Enumerable<Bio::GenomicInterval>) —
genomic intervals to parse.
parser (Parser) —
MAF parser for file to fetch blocks from.
filter (Hash) (defaults to: {}) —
Block filter expression.

Yields:

(block) —
each Block matched, in turn

Returns:

(Enumerable<Block>) —
each matching Block, if no block given

# File 'lib/bio/maf/index.rb', line 393

def find(intervals, parser, filter={}, &blk)
  start = Time.now
  fl = fetch_list(intervals, filter)
  LOG.debug { sprintf("Built fetch list of %d items in %.3fs.\n",
                      fl.size,
                      Time.now - start) }
  if ! fl.empty?
    parser.fetch_blocks(fl, &blk)
  else
    if ! block_given?
     []
    end
  end
end

#index_blocks(blocks) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 679

def index_blocks(blocks)
  h = blocks.map { |b| entries_for(b) }.reduce(:merge!)
  db.set_bulk(h, false)
end

#load_index_sequences ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 684

def load_index_sequences
  h = {}
  db.match_prefix("sequence:").each do |key|
    _, name = key.split(':', 2)
    id = db[key].to_i
    h[name] = id
  end
  @index_sequences = h
  @max_sid = @index_sequences.values.max
end

#load_species ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 706

def load_species
  db.match_prefix("species:").each do |key|
    _, name = key.split(':', 2)
    id = db[key].to_i
    @species[name] = id
  end
  @species_max_id = @species.values.sort.last || -1
end

#make_scan_worker(jobs, completed) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 594

def make_scan_worker(jobs, completed)
  Thread.new do
    with_profiling do
      db.cursor_process do |cur|
        while true
          req = jobs.poll
          break unless req
          begin
            result = yield(cur, req)
            completed.put(result)
          rescue Exception => e
            completed.put(e)
            LOG.error "Worker failing: #{e.class}: #{e}"
            LOG.error e
            raise e
          end
        end
      end
    end
  end
end

#overlaps?(gi, i_start, i_end) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/bio/maf/index.rb', line 655

def overlaps?(gi, i_start, i_end)
  g_start = gi.begin

  (i_start <= g_start && g_start < i_end) \
   || gi.include?(i_start)
end

#reopen ⇒ `Object`

Reopen the same DB handle read-only. Only useful for unit tests.



448
449
450

# File 'lib/bio/maf/index.rb', line 448

def reopen
  KyotoIndex.new(@path, @db)
end

#scan_bin(cur, chrom_id, bin, bin_intervals, filters) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 616

def scan_bin(cur, chrom_id, bin, bin_intervals, filters)
  # bin_intervals is sorted by zero_start
  # compute the start and end of all intervals of interest
  spanning_start = bin_intervals.first.begin
  spanning_end = bin_intervals.map {|i| i.end}.max
  # scan from the start of the bin
  cur.jump(bin_start_prefix(chrom_id, bin))
  matches = []
  while pair = cur.get(true)
    c_chr, c_bin, c_start, c_end = pair[0].unpack(KEY_SCAN_FMT)
    if (c_chr != chrom_id) \
      || (c_bin != bin) \
      || c_start >= spanning_end
      # we've hit the next bin, or chromosome, or gone past
      # the spanning interval, so we're done with this bin
      break
    end
    if c_end >= spanning_start # possible overlap
      # any intervals that end before the start of the current
      # block are no longer relevant
      while bin_intervals.first.end < c_start
        bin_intervals.shift
      end
      bin_intervals.each do |i|
        i_start = i.begin
        break if i_start > c_end
        if ((c_start <= i_start && i_start < c_end) \
            || i.include?(c_start)) \
            && filters.match(pair)
          # match
          matches << extract_index_offset(pair)
          break
        end
      end
    end
  end
  matches
end

#scan_bins(chrom_id, bin_intervals, filters) ⇒ `Object`

Scan the index for blocks matching the given bins and intervals.

# File 'lib/bio/maf/index.rb', line 536

def scan_bins(chrom_id, bin_intervals, filters)
  to_fetch = []
  db.cursor_process do |cur|
    bin_intervals.each do |bin, bin_intervals_raw|
      matches = scan_bin(cur, chrom_id, bin, bin_intervals_raw, filters)
      to_fetch.concat(matches)
    end 
  end
  to_fetch
end

#scan_bins_parallel(chrom_id, bin_intervals, filters) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 561

def scan_bins_parallel(chrom_id, bin_intervals, filters)
  start = Time.now
  n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
  threads = []
  n_threads.times do
    threads << make_scan_worker(jobs, completed) do |cur, req|
      bin, intervals = req
      scan_bin(cur, chrom_id, bin, intervals, filters)
    end
  end
  n_completed = 0
  to_fetch = []
  while (n_completed < bin_intervals.size)
    c = completed.poll(5, java.util.concurrent.TimeUnit::SECONDS)
    if c.nil?
      if threads.find { |t| t.alive? }
        next
      else
        raise "No threads alive, completed #{n_completed}/#{bin_intervals.size} jobs!"
      end
    end
    raise "worker failed: #{c}" if c.is_a? Exception
    to_fetch.concat(c)
    n_completed += 1
  end
  threads.each { |t| t.join }
  LOG.debug { sprintf("Matched %d index records with %d threads in %.3f seconds.\n",
                      to_fetch.size, n_threads, Time.now - start) }
  to_fetch
end

#seq_id_for(name) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 695

def seq_id_for(name)
  sid = index_sequences[name]
  if ! sid
    @max_sid += 1
    sid = @max_sid
    db.set("sequence:#{name}", sid.to_s)
    index_sequences[name] = sid
  end
  return sid
end

#slice(interval, parser, filter = {}) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 413

def slice(interval, parser, filter={})
  if block_given?
    find([interval], parser, filter) do |block|
      yield block.slice(interval)
    end
  else
    enum_for(:slice, interval, parser, filter)
  end
end

#species_id_for_seq(seq) ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 715

def species_id_for_seq(seq)
  # NB can have multiple dots
  # example: otoGar1.scaffold_104707.1-93001
  parts = seq.split('.', 2)
  if parts.size == 2
    species_name = parts[0]
    if species.has_key? species_name
      return species[species_name]
    else
      species_id = @species_max_id + 1
      if species_id >= MAX_SPECIES
        raise "cannot index MAF file with more than #{MAX_SPECIES} species"
      end
      species[species_name] = species_id
      db["species:#{species_name}"] = species_id
      @species_max_id = species_id
      return species_id
    end
  else
    # not in species.sequence format, apparently
    return nil
  end
end

#with_profiling ⇒ `Object`

# File 'lib/bio/maf/index.rb', line 547

def with_profiling
  if RUBY_PLATFORM == 'java' && ENV['profile']
    rv = nil
    pdata = JRuby::Profiler.profile do
      rv = yield
    end
    printer = JRuby::Profiler::FlatProfilePrinter.new(pdata)
    printer.printProfile(STDERR)
    return rv
  else
    yield
  end
end

Class: Bio::MAF::KyotoIndex

Constant Summary collapse

Constants included from KVHelpers

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from KVHelpers

Constructor Details

#initialize(path, db_arg = nil) ⇒ KyotoIndex

Instance Attribute Details

#db ⇒ Object (readonly)

#index_sequences ⇒ Object

#ref_only ⇒ Object (readonly)

#ref_seq ⇒ Object

#species ⇒ Object (readonly)

#species_max_id ⇒ Object (readonly)

Class Method Details

.build(parser, path, ref_only = true) ⇒ KyotoIndex

.open(path) ⇒ KyotoIndex

Instance Method Details

#build(parser, ref_only = true) ⇒ Object

#build_block_value(block) ⇒ Object

#close ⇒ Object

#dump(stream = $stdout) ⇒ Object

#entries_for(block) ⇒ Object

#fetch_list(intervals, filter_spec = {}) ⇒ Object

#find(intervals, parser, filter = {}) {|block| ... } ⇒ Enumerable<Block>

#index_blocks(blocks) ⇒ Object

#load_index_sequences ⇒ Object

#load_species ⇒ Object

#make_scan_worker(jobs, completed) ⇒ Object

#overlaps?(gi, i_start, i_end) ⇒ Boolean

#reopen ⇒ Object

#scan_bin(cur, chrom_id, bin, bin_intervals, filters) ⇒ Object

#scan_bins(chrom_id, bin_intervals, filters) ⇒ Object

#scan_bins_parallel(chrom_id, bin_intervals, filters) ⇒ Object

#seq_id_for(name) ⇒ Object

#slice(interval, parser, filter = {}) ⇒ Object

#species_id_for_seq(seq) ⇒ Object

#with_profiling ⇒ Object

#initialize(path, db_arg = nil) ⇒ `KyotoIndex`

#db ⇒ `Object` (readonly)

#index_sequences ⇒ `Object`

#ref_only ⇒ `Object` (readonly)

#ref_seq ⇒ `Object`

#species ⇒ `Object` (readonly)

#species_max_id ⇒ `Object` (readonly)

.build(parser, path, ref_only = true) ⇒ `KyotoIndex`

.open(path) ⇒ `KyotoIndex`

#build(parser, ref_only = true) ⇒ `Object`

#build_block_value(block) ⇒ `Object`

#close ⇒ `Object`

#dump(stream = $stdout) ⇒ `Object`

#entries_for(block) ⇒ `Object`

#fetch_list(intervals, filter_spec = {}) ⇒ `Object`

#find(intervals, parser, filter = {}) {|block| ... } ⇒ `Enumerable<Block>`

#index_blocks(blocks) ⇒ `Object`

#load_index_sequences ⇒ `Object`

#load_species ⇒ `Object`

#make_scan_worker(jobs, completed) ⇒ `Object`

#overlaps?(gi, i_start, i_end) ⇒ `Boolean`

#reopen ⇒ `Object`

#scan_bin(cur, chrom_id, bin, bin_intervals, filters) ⇒ `Object`

#scan_bins(chrom_id, bin_intervals, filters) ⇒ `Object`

#scan_bins_parallel(chrom_id, bin_intervals, filters) ⇒ `Object`

#seq_id_for(name) ⇒ `Object`

#slice(interval, parser, filter = {}) ⇒ `Object`

#species_id_for_seq(seq) ⇒ `Object`

#with_profiling ⇒ `Object`