Module: Bio::Alignment::Output

Included in:
EnumerableExtension
Defined in:
lib/bio/alignment.rb

Overview

module EnumerableExtension

Instance Method Summary collapse

Instance Method Details

#__output_phylip_common(options = {}) ⇒ Object

common routine for interleaved/non-interleaved phylip format



1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
# File 'lib/bio/alignment.rb', line 1099

# Common routine for interleaved/non-interleaved phylip format.
#
# Builds the parts shared by both PHYLIP writers: the header line, the
# cleaned-up names (fixed to 10 columns) and every sequence cut into lines
# made of space-separated groups of 10 residues.
#
# Recognized keys of +options+:
# :replace_space::   when true, whitespace in names is replaced by '_'
# :escape::          unless set to false/nil, the characters : ; , ( )
#                    in names are replaced by '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :avoid_same_name:: unless set to false/nil, names are passed through
#                    __clustal_avoid_same_name(names, 10)
# :width::           residues per line (default 60); rounded down to a
#                    multiple of 10, and never less than 10
# :gap_char::        character written for gaps (default '-')
# :case::            a value matching /lower/i or /upper/i changes the
#                    case of the residues
#
# Returns [ aln, aseqs, lines ]:
# aln::   one-element Array holding the header " <nseq> <length>\n"
# aseqs:: one Array of output lines per sequence; the first line carries
#         the name, the following lines are indented by 10 spaces
# lines:: number of output lines per sequence
def __output_phylip_common(options = {})
  len = self.alignment_length
  aln = [ " #{self.number_of_sequences} #{len}\n" ]
  sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn, 10)
  end

  namewidth = 10
  seqwidth  = (options[:width] or 60)
  # Round down to whole groups of 10 residues, but keep at least one
  # group: a width below 10 used to become 0 here and ended in a
  # ZeroDivisionError when the number of lines was computed below.
  seqwidth = [ seqwidth.div(10), 1 ].max * 10
  # Each group of 10 residues is prefixed with one space, hence 11
  # characters per group on an output line.
  seqregexp = Regexp.new("(.{1,#{seqwidth.div(10) * 11}})")
  gchar = (options[:gap_char] or '-')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  aseqs.collect! do |s|
    snx = sn.shift
    # left-justify the name and cut it to exactly namewidth columns
    head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth]
    head2 = ' ' * namewidth
    # pad short sequences with gap characters up to the alignment length
    s << (gchar * (len - s.length))
    s.gsub!(/(.{1,10})/n, " \\1")
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    head += a.shift
    ret = a.collect { |x| head2 + x }
    ret.unshift(head)
    ret
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  [ aln, aseqs, lines ]
end

#output(format, *arg) ⇒ Object



873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
# File 'lib/bio/alignment.rb', line 873

# Writes the alignment in the requested format by delegating to the
# matching output_* method; any further arguments are handed over as-is.
#
# format:: one of :clustal, :fasta, :phylip, :phylipnon, :msf, :molphy
#
# Raises RuntimeError for any other format value.
def output(format, *arg)
  writer =
    case format
    when :clustal   then :output_clustal
    when :fasta     then :output_fasta
    when :phylip    then :output_phylip
    when :phylipnon then :output_phylipnon
    when :msf       then :output_msf
    when :molphy    then :output_molphy
    else
      raise "Unknown format: #{format.inspect}"
    end
  __send__(writer, *arg)
end

#output_clustal(options = {}) ⇒ Object

Generates ClustalW-formatted text

seqs

sequences (must be an alignment object)

names

names of the sequences

options

options



1045
1046
1047
# File 'lib/bio/alignment.rb', line 1045

# Generates ClustalW-formatted text by handing this object, its sequence
# names and +options+ to __clustal_formatter; returns whatever that
# formatter returns.
def output_clustal(options = {})
  names = self.sequence_names
  __clustal_formatter(self, names, options)
end

#output_fasta(options = {}) ⇒ Object

Generates fasta format text and returns a string.



1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
# File 'lib/bio/alignment.rb', line 1059

# Generates fasta format text and returns a string.
#
# Recognized keys of +options+:
# :width::           residues per line (default 70); a value of 0 or less
#                    writes each sequence on a single line
# :avoid_same_name:: when true, names come from
#                    __clustal_avoid_same_name(names, 30); otherwise CR,
#                    LF and NUL characters in names are replaced by spaces
def output_fasta(options={})
  width = options[:width] || 70

  labels =
    if options[:avoid_same_name]
      __clustal_avoid_same_name(self.sequence_names, 30)
    else
      self.sequence_names.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    end

  # nil means "do not wrap"
  chunk = nil
  chunk = Regexp.new(".{1,#{width}}") if width and width > 0

  entries = self.collect do |seq|
    title = ">#{labels.shift}\n"
    residues = seq.to_s
    title + (chunk ? residues.gsub(chunk, "\\0\n") : residues + "\n")
  end
  entries.join('')
end

#output_molphy(options = {}) ⇒ Object

Generates Molphy alignment format text as a string



1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
# File 'lib/bio/alignment.rb', line 1151

# Generates Molphy alignment format text as a string.
#
# The result is a header line "<nseq> <length>", then for every sequence
# its name on one line followed by the residues wrapped at :width columns.
#
# Recognized keys of +options+:
# :replace_space::   when true, whitespace in names is replaced by '_'
# :escape::          unless set to false/nil, : ; , ( ) in names become '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :avoid_same_name:: unless set to false/nil, names are passed through
#                    __clustal_avoid_same_name(names, 30)
# :width::           residues per line (default 60)
# :gap_char::        character written for gaps (default '-')
# :case::            a value matching /lower/i or /upper/i changes the
#                    case of the residues
def output_molphy(options = {})
  total = self.alignment_length
  first_line = "#{self.number_of_sequences} #{total}\n"

  # an option counts as enabled when it is absent or set to a true value
  enabled = lambda { |key| !options.has_key?(key) || options[key] }

  labels = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  labels.collect! { |x| x.gsub(/\s/, '_') } if options[:replace_space]
  labels.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') } if enabled.call(:escape)
  labels.collect! { |x| x.split(/\s/)[0].to_s } if enabled.call(:split)
  labels = __clustal_avoid_same_name(labels, 30) if enabled.call(:avoid_same_name)

  line_width = options[:width] || 60
  chunker = Regexp.new("(.{1,#{line_width}})")
  gap = options[:gap_char] || '-'

  rows = []
  self.each_seq { |s| rows << s.to_s.gsub(self.gap_regexp, gap) }

  case options[:case].to_s
  when /lower/i then rows.each { |s| s.downcase! }
  when /upper/i then rows.each { |s| s.upcase! }
  end

  entries = rows.collect do |row|
    # pad short sequences with gap characters up to the alignment length
    row << (gap * (total - row.length))
    row.gsub!(chunker, "\\1\n")
    labels.shift + "\n" + row
  end

  ([first_line] + entries).join('')
end

#output_msf(options = {}) ⇒ Object

Generates msf formatted text as a string



1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
# File 'lib/bio/alignment.rb', line 1193

# Generates msf formatted text as a string.
#
# Recognized keys of +options+:
# :avoid_same_name:: unless set to false/nil, names come from
#                    __clustal_avoid_same_name; otherwise CR, LF and NUL
#                    characters in names are replaced by spaces
# :replace_space::   unless set to false/nil, whitespace in names becomes '_'
# :escape::          unless set to false/nil, : ; , ( ) in names become '_'
# :split::           unless set to false/nil, only the first
#                    whitespace-delimited word of each name is kept
# :gap_char::        character written for internal gaps (default '.')
# :padding_char::    character written for leading/trailing gaps
#                    (default '~')
# :case::            a value matching /lower/i gives lower case residues;
#                    anything else gives upper case
# :type::            a value matching /protein/i or /aa/i forces type P,
#                    one matching /na/i forces type N; otherwise the type
#                    is taken from seqclass or guessed from the residues
# :entry_id::        name written in the header (default "<object id>.msf")
# :time::            Time written in the header (default Time.now)
def output_msf(options = {})
  len = self.seq_length

  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(self.sequence_names)
  else
    sn = self.sequence_names.collect do |x|
      x.to_s.gsub(/[\r\n\x00]/, ' ')
    end
  end
  if !options.has_key?(:replace_space) or options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end

  seqwidth = 50
  # An alignment without sequences has no longest name; use 0 instead of
  # letting [31, nil].min raise.
  namewidth = [31, (sn.collect { |x| x.length }.max or 0) ].min
  sep = ' ' * 2

  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char]  or '.')
  pchar = (options[:padding_char] or '~')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  aseqs.each do |s|
    # leading gaps become padding characters; trailing gaps are dropped
    # and the row is then padded up to the alignment length
    s.sub!(/\A#{Regexp.escape(gchar)}+/) { |x| pchar * x.length }
    s.sub!(/#{Regexp.escape(gchar)}+\z/, '')
    s << (pchar * (len - s.length))
  end

  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  else #default upcase
    aseqs.each { |s| s.upcase! }
  end

  case options[:type].to_s
  when /protein/i, /aa/i
    amino = true
  when /na/i
    amino = false
  else
    if seqclass == Bio::Sequence::AA then
      amino = true
    elsif seqclass == Bio::Sequence::NA then
      amino = false
    else
      # if we can't determine, we assume as protein.
      # The alignment is taken as nucleotide only when every row that
      # contains residues consists solely of A/C/G/T once gap and
      # padding characters are ignored. (The former test matched only
      # rows that were exactly one character long, so real nucleotide
      # alignments were always reported as protein.)
      filler = Regexp.union(gchar.to_s, pchar.to_s)
      residues = aseqs.collect { |x| x.gsub(filler, '') }
      residues.reject! { |x| x.empty? }
      amino = (residues.empty? or
               !residues.all? { |x| /\A[acgt]+\z/i =~ x })
    end
  end

  seq_type = (amino ? 'P' : 'N')

  fn = (options[:entry_id] or self.__id__.abs.to_s + '.msf')
  dt = (options[:time] or Time.now).strftime('%B %d, %Y %H:%M')

  sums = aseqs.collect { |s| GCG::Seq.calc_checksum(s) }
  #sums = aseqs.collect { |s| 0 }
  # overall checksum: sum of the per-sequence checksums modulo 10000
  sum = 0; sums.each { |x| sum += x }; sum %= 10000
  msf =
    [
     "#{seq_type == 'N' ? 'N' : 'A' }A_MULTIPLE_ALIGNMENT 1.0\n",
     "\n",
     "\n",
     " #{fn}  MSF: #{len}  Type: #{seq_type}  #{dt}  Check: #{sum} ..\n",
     "\n"
    ]

  # one "Name:" line per sequence (name left-justified)
  sn.each do |snx|
    msf << ' Name: ' +
      sprintf('%*s', -namewidth, snx.to_s)[0, namewidth] +
      "  Len: #{len}  Check: #{sums.shift}  Weight: 1.00\n"
  end
  msf << "\n//\n"

  # cut every sequence into lines of seqwidth residues, each prefixed
  # with the right-justified name
  aseqs.collect! do |s|
    snx = sn.shift
    head = sprintf("%*s", namewidth, snx.to_s)[0, namewidth] + sep
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    a.collect { |x| head + x }
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  i = 1
  lines.times do
    msf << "\n"
    # ruler line showing the first and last column of this block
    n_l = i
    n_r = [ i + seqwidth - 1, len ].min
    if n_l != n_r then
      w = [ n_r - n_l + 1 - n_l.to_s.length - n_r.to_s.length, 1 ].max
      msf << (' ' * namewidth + sep + n_l.to_s + 
              ' ' * w + n_r.to_s + "\n")
    else
      msf << (' ' * namewidth + sep + n_l.to_s + "\n")
    end
    aseqs.each { |a| msf << a.shift }
    i += seqwidth
  end
  msf << "\n"
  msf.join('')
end

#output_phylip(options = {}) ⇒ Object

generates phylip interleaved alignment format as a string



1082
1083
1084
1085
1086
1087
1088
1089
1090
# File 'lib/bio/alignment.rb', line 1082

# Generates phylip interleaved alignment format as a string.
#
# The header and the per-sequence lines come from
# __output_phylip_common(options). The output is written block by block:
# the next line of every sequence, then one blank line. The blank line
# after the last block is dropped.
def output_phylip(options = {})
  out, rows, block_count = __output_phylip_common(options)
  block_count.times do
    rows.each { |row| out.push(row.shift) }
    out.push("\n")
  end
  out.pop if out.last == "\n"
  out.join('')
end

#output_phylipnon(options = {}) ⇒ Object

generates Phylip3.2 (old) non-interleaved format as a string



1093
1094
1095
1096
# File 'lib/bio/alignment.rb', line 1093

# Generates Phylip3.2 (old) non-interleaved format as a string: the header
# from __output_phylip_common(options) followed by all lines of the first
# sequence, then all lines of the second, and so on.
def output_phylipnon(options = {})
  header_lines, rows, _lines = __output_phylip_common(options)
  body = rows.join('')
  header_lines.first + body
end

#to_clustal(*arg) ⇒ Object

to_clustal is deprecated. Instead, please use output_clustal.


(Formerly defined as: alias to_clustal output_clustal)



1053
1054
1055
1056
# File 'lib/bio/alignment.rb', line 1053

# Deprecated: emits a deprecation warning, then forwards all arguments to
# output_clustal and returns its result.
def to_clustal(*arg)
  notice = "to_clustal is deprecated. Please use output_clustal."
  warn(notice)
  output_clustal(*arg)
end