Module: Bio::Alignment::Output

Included in:: EnumerableExtension

Defined in:: lib/bio/alignment.rb

Overview

module EnumerableExtension

Instance Method Summary collapse

#__output_phylip_common(options = {}) ⇒ Object

common routine for interleaved/non-interleaved phylip format.
#output(format, *arg) ⇒ Object
#output_clustal(options = {}) ⇒ Object
Generates ClustalW-formatted text.

seqs:: sequences (must be an alignment object)

names:: names of the sequences

options:: options
#output_fasta(options = {}) ⇒ Object

Generates fasta format text and returns a string.
#output_molphy(options = {}) ⇒ Object

Generates Molphy alignment format text as a string.
#output_msf(options = {}) ⇒ Object

Generates msf formatted text as a string.
#output_phylip(options = {}) ⇒ Object

generates phylip interleaved alignment format as a string.
#output_phylipnon(options = {}) ⇒ Object

generates Phylip3.2 (old) non-interleaved format as a string.
#to_clustal(*arg) ⇒ Object

to_clustal is deprecated.

Instance Method Details

#__output_phylip_common(options = {}) ⇒ `Object`

common routine for interleaved/non-interleaved phylip format

# File 'lib/bio/alignment.rb', line 1099

# Common routine for interleaved/non-interleaved phylip format.
#
# Recognized keys of +options+ (all optional):
# :replace_space::   when true, whitespace in names is replaced by '_'
# :escape::          unless set to false/nil, the characters : ; , ( )
#                    in names are replaced by '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :avoid_same_name:: unless set to false/nil, names are passed through
#                    __clustal_avoid_same_name(names, 10)
# :width::           residues per output line (default 60); rounded down
#                    to a multiple of 10, with a minimum of 10
# :gap_char::        character substituted for gaps (default '-')
# :case::            a value matching /lower/i or /upper/i forces the case
#
# Returns a three element array <tt>[ aln, aseqs, lines ]</tt>:
# +aln+ is an array holding the header line, +aseqs+ holds, per sequence,
# an array of formatted lines (the first line carries the 10 column name),
# and +lines+ is the number of lines each sequence was folded into.
def __output_phylip_common(options = {})
  len = self.alignment_length
  aln = [ " #{self.number_of_sequences} #{len}\n" ]
  sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn, 10)
  end

  namewidth = 10
  seqwidth  = (options[:width] or 60)
  # Residues are written in groups of 10, so the width is rounded down to
  # a multiple of 10.  A width below 10 would round to 0 (or less) and
  # make the line count below divide by zero, so at least one group is
  # always kept.
  seqwidth = [ seqwidth.div(10) * 10, 10 ].max
  # Every group of 10 residues is prefixed with one space, hence 11
  # characters per group when folding into lines.
  seqregexp = Regexp.new("(.{1,#{seqwidth.div(10) * 11}})")
  gchar = (options[:gap_char] or '-')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  aseqs.collect! do |s|
    snx = sn.shift
    # name column: left-justified and truncated to namewidth characters
    head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth]
    # continuation lines are indented by an empty name column
    head2 = ' ' * namewidth
    # pad short sequences with gap characters up to the alignment length
    s << (gchar * (len - s.length))
    s.gsub!(/(.{1,10})/n, " \\1")
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    head += a.shift
    ret = a.collect { |x| head2 + x }
    ret.unshift(head)
    ret
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  [ aln, aseqs, lines ]
end

#output(format, *arg) ⇒ `Object`

# File 'lib/bio/alignment.rb', line 873

# Dispatches to the output_* method that corresponds to +format+.
#
# +format+ must be one of the symbols :clustal, :fasta, :phylip,
# :phylipnon, :msf or :molphy.  Any additional arguments are passed to
# the selected method unchanged and its return value is returned.
#
# Raises RuntimeError for any other +format+.
def output(format, *arg)
  handler =
    case format
    when :clustal   then :output_clustal
    when :fasta     then :output_fasta
    when :phylip    then :output_phylip
    when :phylipnon then :output_phylipnon
    when :msf       then :output_msf
    when :molphy    then :output_molphy
    end
  raise "Unknown format: #{format.inspect}" unless handler
  __send__(handler, *arg)
end

#output_clustal(options = {}) ⇒ `Object`

Generates ClustalW-formatted text

seqs: sequences (must be an alignment object)
names: names of the sequences
options: options



1045
1046
1047

# File 'lib/bio/alignment.rb', line 1045

# Generates ClustalW-formatted text.
#
# The alignment itself and its sequence names are handed, together with
# +options+, to __clustal_formatter, whose result is returned.
def output_clustal(options = {})
  names = self.sequence_names
  __clustal_formatter(self, names, options)
end

#output_fasta(options = {}) ⇒ `Object`

Generates fasta format text and returns a string.

# File 'lib/bio/alignment.rb', line 1059

# Generates fasta format text and returns a string.
#
# Recognized keys of +options+:
# :width::           residues per line (default 70); a value of 0 or
#                    less writes every sequence on a single line
# :avoid_same_name:: when true, names are taken from
#                    __clustal_avoid_same_name(sequence_names, 30);
#                    otherwise CR, LF and NUL characters in the names are
#                    replaced by spaces
def output_fasta(options={})
  width = (options[:width] or 70)

  names =
    if options[:avoid_same_name]
      __clustal_avoid_same_name(self.sequence_names, 30)
    else
      self.sequence_names.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    end

  # Decide once whether sequences are folded, instead of per branch.
  folding = (width and width > 0)
  fold_re = Regexp.new(".{1,#{width}}") if folding

  entries = self.collect do |s|
    defline = ">#{names.shift}\n"
    if folding
      defline + s.to_s.gsub(fold_re, "\\0\n")
    else
      defline + s.to_s + "\n"
    end
  end
  entries.join('')
end

#output_molphy(options = {}) ⇒ `Object`

Generates Molphy alignment format text as a string

# File 'lib/bio/alignment.rb', line 1151

# Generates Molphy alignment format text as a string.
#
# The result starts with a "<number of sequences> <alignment length>"
# line, followed, for every sequence, by its name on one line and the
# sequence folded into lines of +:width+ characters.
#
# Recognized keys of +options+ (all optional):
# :replace_space::   when true, whitespace in names is replaced by '_'
# :escape::          unless set to false/nil, the characters : ; , ( )
#                    in names are replaced by '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :avoid_same_name:: unless set to false/nil, names are passed through
#                    __clustal_avoid_same_name(names, 30)
# :width::           residues per line (default 60)
# :gap_char::        character substituted for gaps (default '-')
# :case::            a value matching /lower/i or /upper/i forces the case
def output_molphy(options = {})
  total = self.alignment_length
  out = [ "#{self.number_of_sequences} #{total}\n" ]

  # An option counts as switched on when it is absent or has a true value.
  on_by_default = lambda { |key| !options.has_key?(key) or options[key] }

  names = self.sequence_names.collect { |n| n.to_s.gsub(/[\r\n\x00]/, ' ') }
  names.collect! { |n| n.gsub(/\s/, '_') } if options[:replace_space]
  names.collect! { |n| n.gsub(/[\:\;\,\(\)]/, '_') } if on_by_default.call(:escape)
  names.collect! { |n| n.split(/\s/)[0].to_s } if on_by_default.call(:split)
  names = __clustal_avoid_same_name(names, 30) if on_by_default.call(:avoid_same_name)

  width = (options[:width] or 60)
  fold_re = Regexp.new("(.{1,#{width}})")
  gap = (options[:gap_char] or '-')

  rows = []
  self.each_seq { |seq| rows << seq.to_s.gsub(self.gap_regexp, gap) }

  case options[:case].to_s
  when /lower/i then rows.each { |r| r.downcase! }
  when /upper/i then rows.each { |r| r.upcase! }
  end

  rows.each do |r|
    # pad short sequences with gap characters up to the alignment length
    r << (gap * (total - r.length))
    r.gsub!(fold_re, "\\1\n")
    out << (names.shift + "\n" + r)
  end
  out.join('')
end

#output_msf(options = {}) ⇒ `Object`

Generates msf formatted text as a string

# File 'lib/bio/alignment.rb', line 1193

# Generates msf formatted text as a string.
#
# Recognized keys of +options+ (all optional):
# :avoid_same_name:: unless set to false/nil, names are taken from
#                    __clustal_avoid_same_name(sequence_names); otherwise
#                    CR, LF and NUL characters in names become spaces
# :replace_space::   unless set to false/nil, whitespace in names is
#                    replaced by '_'
# :escape::          unless set to false/nil, the characters : ; , ( )
#                    in names are replaced by '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :gap_char::        character written for internal gaps (default '.')
# :padding_char::    character written for leading gaps and for padding
#                    at the end of a sequence (default '~')
# :case::            a value matching /lower/i writes lower case;
#                    anything else writes upper case
# :type::            a value matching /protein/i or /aa/i forces type P,
#                    one matching /na/i forces type N; otherwise the type
#                    is taken from +seqclass+ or, failing that, guessed
#                    from the residues
# :entry_id::        name written in the header line
#                    (default: "<object id>.msf")
# :time::            object responding to +strftime+ used for the header
#                    date (default: Time.now)
def output_msf(options = {})
  len = self.seq_length

  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(self.sequence_names)
  else
    sn = self.sequence_names.collect do |x|
      x.to_s.gsub(/[\r\n\x00]/, ' ')
    end
  end
  if !options.has_key?(:replace_space) or options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end

  seqwidth = 50
  # Name column: as wide as the longest name, capped at 31 characters.
  # An alignment without sequences has no longest name; use 0 then.
  namewidth = [ 31, (sn.collect { |x| x.length }.max or 0) ].min
  sep = ' ' * 2

  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char]  or '.')
  pchar = (options[:padding_char] or '~')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  # Leading gaps are rewritten as padding, trailing gaps are dropped and
  # the sequence is then padded up to the alignment length.
  leading_gaps  = /\A#{Regexp.escape(gchar)}+/
  trailing_gaps = /#{Regexp.escape(gchar)}+\z/
  aseqs.each do |s|
    s.sub!(leading_gaps) { |x| pchar * x.length }
    s.sub!(trailing_gaps, '')
    s << (pchar * (len - s.length))
  end

  # upper case is the default
  if /lower/i =~ options[:case].to_s
    aseqs.each { |s| s.downcase! }
  else
    aseqs.each { |s| s.upcase! }
  end

  case options[:type].to_s
  when /protein/i, /aa/i
    amino = true
  when /na/i
    amino = false
  else
    if seqclass == Bio::Sequence::AA then
      amino = true
    elsif seqclass == Bio::Sequence::NA then
      amino = false
    else
      # If we can't determine the type, we assume protein unless every
      # sequence consists solely of a, c, g and t once the gap and
      # padding characters have been removed.
      filler = Regexp.union(gchar, pchar)
      amino = aseqs.any? { |x| /\A[acgt]+\z/i !~ x.gsub(filler, '') }
    end
  end

  seq_type = (amino ? 'P' : 'N')

  fn = (options[:entry_id] or self.__id__.abs.to_s + '.msf')
  dt = (options[:time] or Time.now).strftime('%B %d, %Y %H:%M')

  # per-sequence checksums; the header carries their sum modulo 10000
  sums = aseqs.collect { |s| GCG::Seq.calc_checksum(s) }
  sum = sums.inject(0) { |total, x| total + x } % 10000
  msf =
    [
     "#{seq_type == 'N' ? 'N' : 'A' }A_MULTIPLE_ALIGNMENT 1.0\n",
     "\n",
     "\n",
     " #{fn}  MSF: #{len}  Type: #{seq_type}  #{dt}  Check: #{sum} ..\n",
     "\n"
    ]

  sn.each do |snx|
    msf << ' Name: ' +
      sprintf('%*s', -namewidth, snx.to_s)[0, namewidth] +
      "  Len: #{len}  Check: #{sums.shift}  Weight: 1.00\n"
  end
  msf << "\n//\n"

  # Fold every sequence into lines of seqwidth residues, each prefixed
  # with the (right-justified) name column.
  aseqs.collect! do |s|
    snx = sn.shift
    head = sprintf("%*s", namewidth, snx.to_s)[0, namewidth] + sep
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    a.collect { |x| head + x }
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  i = 1
  lines.times do
    msf << "\n"
    # coordinate ruler: first and last column number of this block
    n_l = i
    n_r = [ i + seqwidth - 1, len ].min
    if n_l != n_r then
      w = [ n_r - n_l + 1 - n_l.to_s.length - n_r.to_s.length, 1 ].max
      msf << (' ' * namewidth + sep + n_l.to_s +
              ' ' * w + n_r.to_s + "\n")
    else
      msf << (' ' * namewidth + sep + n_l.to_s + "\n")
    end
    aseqs.each { |a| msf << a.shift }
    i += seqwidth
  end
  msf << "\n"
  msf.join('')
end

#output_phylip(options = {}) ⇒ `Object`

generates phylip interleaved alignment format as a string

# File 'lib/bio/alignment.rb', line 1082

# Generates phylip interleaved alignment format as a string.
#
# +options+ is passed on to __output_phylip_common.  The header line is
# followed by one block per output line, each block holding the
# corresponding line of every sequence; blocks are separated by an empty
# line.
def output_phylip(options = {})
  header, rows, nblocks = __output_phylip_common(options)
  blocks = (0...nblocks).collect do
    # take the next pending line of every sequence, in sequence order
    rows.collect { |r| r.shift }.join('')
  end
  header.join('') + blocks.join("\n")
end

#output_phylipnon(options = {}) ⇒ `Object`

generates Phylip3.2 (old) non-interleaved format as a string

# File 'lib/bio/alignment.rb', line 1093

# Generates Phylip3.2 (old) non-interleaved format as a string.
#
# +options+ is passed on to __output_phylip_common.  The header line is
# followed by all lines of the first sequence, then all lines of the
# second sequence, and so on.
def output_phylipnon(options = {})
  header, rows, _lines = __output_phylip_common(options)
  body = rows.flatten.join('')
  header.first + body
end

#to_clustal(*arg) ⇒ `Object`

to_clustal is deprecated. Instead, please use output_clustal.

`alias to_clustal output_clustal`

# File 'lib/bio/alignment.rb', line 1053

# Deprecated: emits a deprecation warning and then forwards all
# arguments to output_clustal, returning its result.
def to_clustal(*arg)
  notice = "to_clustal is deprecated. Please use output_clustal."
  warn(notice)
  output_clustal(*arg)
end

Module: Bio::Alignment::Output

Overview

Instance Method Summary collapse

Instance Method Details

#__output_phylip_common(options = {}) ⇒ Object

#output(format, *arg) ⇒ Object

#output_clustal(options = {}) ⇒ Object

#output_fasta(options = {}) ⇒ Object

#output_molphy(options = {}) ⇒ Object

#output_msf(options = {}) ⇒ Object

#output_phylip(options = {}) ⇒ Object

#output_phylipnon(options = {}) ⇒ Object

#to_clustal(*arg) ⇒ Object