Method: Bio::Alignment::Output#output_msf

Defined in:
lib/bio/alignment.rb

#output_msf(options = {}) ⇒ Object

Generates MSF-formatted text and returns it as a string.



1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
# File 'lib/bio/alignment.rb', line 1193

# Generates MSF formatted text of the alignment and returns it as a String.
#
# ---
# *Arguments*:
# * _options_: (optional) Hash of formatting options. Recognised keys:
#   * <tt>:avoid_same_name</tt> - unless given as false/nil, names are passed
#     through __clustal_avoid_same_name; otherwise CR, LF and NUL characters
#     in each name are replaced by a space.
#   * <tt>:replace_space</tt> - unless given as false/nil, whitespace in names
#     is replaced by "_".
#   * <tt>:escape</tt> - unless given as false/nil, the characters
#     <tt>: ; , ( )</tt> in names are replaced by "_".
#   * <tt>:split</tt> - unless given as false/nil, names are cut at the first
#     whitespace.
#   * <tt>:gap_char</tt> - character written for gaps (default ".").
#   * <tt>:padding_char</tt> - character written for leading/trailing padding
#     (default "~").
#   * <tt>:case</tt> - a value matching /lower/i gives lower case output; any
#     other value (or none) gives upper case output.
#   * <tt>:type</tt> - a value matching /protein/i or /aa/i forces type "P";
#     a value matching /na/i forces type "N". Otherwise the type is taken
#     from seqclass, and as a last resort guessed from the residues.
#   * <tt>:entry_id</tt> - name written in the header line
#     (default: "<object id>.msf").
#   * <tt>:time</tt> - object responding to strftime used for the header
#     time stamp (default: Time.now).
# *Returns*:: String
def output_msf(options = {})
  len = self.seq_length

  # --- sequence names -------------------------------------------------
  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(self.sequence_names)
  else
    sn = self.sequence_names.collect do |x|
      x.to_s.gsub(/[\r\n\x00]/, ' ')
    end
  end
  if !options.has_key?(:replace_space) or options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end

  seqwidth = 50
  # An alignment without sequences has no names; fall back to width 0
  # instead of comparing 31 with nil (which raises ArgumentError).
  longest_name = (sn.collect { |x| x.length }.max or 0)
  namewidth = [31, longest_name].min
  sep = ' ' * 2

  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char]  or '.')
  pchar = (options[:padding_char] or '~')

  # --- sequences ------------------------------------------------------
  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  aseqs.each do |s|
    # Leading gaps become padding characters; trailing gaps are dropped and
    # then the sequence is filled up to the alignment length with padding.
    s.sub!(/\A#{Regexp.escape(gchar)}+/) { |x| pchar * x.length }
    s.sub!(/#{Regexp.escape(gchar)}+\z/, '')
    s << (pchar * (len - s.length))
  end

  # Upper case is the default; only an explicit "lower" switches it.
  if /lower/i =~ options[:case].to_s
    aseqs.each { |s| s.downcase! }
  else
    aseqs.each { |s| s.upcase! }
  end

  # --- sequence type --------------------------------------------------
  case options[:type].to_s
  when /protein/i, /aa/i
    amino = true
  when /na/i
    amino = false
  else
    if seqclass == Bio::Sequence::AA then
      amino = true
    elsif seqclass == Bio::Sequence::NA then
      amino = false
    else
      # Guess from the residues: the alignment is treated as protein when
      # any sequence contains a residue other than a, c, g or t.
      # Gap and padding characters are ignored for this check, because
      # they have already been written into the sequences above.
      filler = Regexp.union(gchar.to_s, pchar.to_s)
      amino = aseqs.any? { |x| /[^acgt]/i =~ x.gsub(filler, '') }
    end
  end

  seq_type = (amino ? 'P' : 'N')

  # --- header ---------------------------------------------------------
  fn = (options[:entry_id] or self.__id__.abs.to_s + '.msf')
  dt = (options[:time] or Time.now).strftime('%B %d, %Y %H:%M')

  sums = aseqs.collect { |s| GCG::Seq.calc_checksum(s) }
  # The overall check value is the sum of per-sequence values modulo 10000.
  sum = sums.inject(0) { |total, x| total + x } % 10000
  msf =
    [
     "#{seq_type == 'N' ? 'N' : 'A' }A_MULTIPLE_ALIGNMENT 1.0\n",
     "\n",
     "\n",
     " #{fn}  MSF: #{len}  Type: #{seq_type}  #{dt}  Check: #{sum} ..\n",
     "\n"
    ]

  # One "Name:" line per sequence; names are left-justified and truncated
  # to namewidth. sums is consumed here (it is not needed afterwards).
  sn.each do |snx|
    msf << ' Name: ' +
      sprintf('%*s', -namewidth, snx.to_s)[0, namewidth] +
      "  Len: #{len}  Check: #{sums.shift}  Weight: 1.00\n"
  end
  msf << "\n//\n"

  # --- alignment body -------------------------------------------------
  # Each sequence is turned into an array of output lines of at most
  # seqwidth residues, each prefixed by the right-justified name.
  aseqs.collect! do |s|
    snx = sn.shift
    head = sprintf("%*s", namewidth, snx.to_s)[0, namewidth] + sep
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    a.collect { |x| head + x }
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  i = 1
  lines.times do
    msf << "\n"
    # Ruler line: first and last column number of this chunk, with the
    # last number positioned near the end of the chunk.
    n_l = i
    n_r = [ i + seqwidth - 1, len ].min
    if n_l != n_r then
      w = [ n_r - n_l + 1 - n_l.to_s.length - n_r.to_s.length, 1 ].max
      msf << (' ' * namewidth + sep + n_l.to_s +
              ' ' * w + n_r.to_s + "\n")
    else
      msf << (' ' * namewidth + sep + n_l.to_s + "\n")
    end
    aseqs.each { |a| msf << a.shift }
    i += seqwidth
  end
  msf << "\n"
  msf.join('')
end