Module: MiGA::Common::Format

Included in:
MiGA
Defined in:
lib/miga/common/format.rb

Overview

General formatting functions shared throughout MiGA.

Instance Method Summary collapse

Instance Method Details

#clean_fasta_file(file, min_len = 1) ⇒ Object

Cleans a FastA file in place, removing all sequences shorter than min_len



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/miga/common/format.rb', line 28

def clean_fasta_file(file, min_len = 1)
  tmp_fh = nil
  tmp_path = nil
  begin
    if file =~ /\.gz/
      tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
      File.unlink tmp_path
      tmp_path += '.gz'
      tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
      fh = Zlib::GzipReader.open(file)
    else
      tmp_fh = Tempfile.new('MiGA')
      tmp_path = tmp_fh.path
      fh = File.open(file, 'r')
    end
    next_seq = ['', '']
    fh.each_line do |ln|
      ln.chomp!
      if ln =~ /^>\s*(\S+)(.*)/
        id, df = $1, $2
        if next_seq[1].length >= min_len
          tmp_fh.puts next_seq[0]
          tmp_fh.print next_seq[1].wrap_width(80)
        end
        next_seq = [">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}", '']
      else
        next_seq[1] += ln.gsub(/[^A-Za-z]/, '')
      end
    end
    if next_seq[1].length >= min_len
      tmp_fh.puts next_seq[0]
      tmp_fh.print next_seq[1].wrap_width(80)
    end
    tmp_fh.close
    fh.close
    FileUtils.mv(tmp_path, file)
  ensure
    begin
      tmp_fh.close unless tmp_fh.nil?
      File.unlink(tmp_path) unless tmp_path.nil?
    rescue
    end
  end
end

#seqs_length(file, format, opts = {}) ⇒ Object

Calculates the average and standard deviation of the sequence lengths in a FastA or FastQ file (supports gzipped files). The format must be a Symbol, one of :fasta or :fastq. Additional estimations can be controlled via the opts Hash. Supported options include:

  • :n50: Include the N50 and the median (in bp)

  • :gc: Include the G+C content (in %)

  • :x: Include the undetermined bases content (in %)

  • :skew: Include G-C and A-T sequence skew (in %; forces gc: true). See definition used here in DOI:10.1177/117693430700300006



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/miga/common/format.rb', line 83

def seqs_length(file, format, opts = {})
  opts[:gc] = true if opts[:skew]
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
  l = []
  gc = 0
  xn = 0
  t  = 0
  c  = 0
  i  = 0 # <- Zlib::GzipReader doesn't set `$.`
  fh.each_line do |ln|
    i += 1
    if (format == :fasta and ln =~ /^>/) or
       (format == :fastq and (i % 4) == 1)
      l << 0
    elsif format == :fasta or (i % 4) == 2
      l[l.size - 1] += ln.chomp.size
      gc += ln.scan(/[GCgc]/).count if opts[:gc]
      xn += ln.scan(/[XNxn]/).count if opts[:x]
      if opts[:skew]
        t += ln.scan(/[Tt]/).count
        c += ln.scan(/[Cc]/).count
      end
    end
  end
  fh.close

  o = { n: l.size, tot: l.inject(0, :+), max: l.max }
  return o if o[:tot].zero?
  o[:avg] = o[:tot].to_f / l.size
  o[:var] = l.map { |a| a**2 }.inject(:+).to_f / l.size - o[:avg]**2
  o[:sd]  = Math.sqrt o[:var]
  o[:gc]  = 100.0 * gc / o[:tot] if opts[:gc]
  o[:x]   = 100.0 * xn / o[:tot] if opts[:x]
  if opts[:skew]
    at = o[:tot] - gc
    o[:at_skew] = 100.0 * (2 * t - at) / at
    o[:gc_skew] = 100.0 * (2 * c - gc) / gc
  end

  if opts[:n50]
    l.sort!
    thr = o[:tot] / 2
    pos = 0
    l.each do |a|
      pos += a
      o[:n50] = a
      break if pos >= thr
    end
    o[:med] = o[:n].even? ?
          0.5 * l[o[:n] / 2 - 1, 2].inject(:+) :
          l[(o[:n] - 1) / 2]
  end
  o
end

#tabulate(header, values, tabular = false) ⇒ Object

Tabulates an values, and Array of Arrays, all with the same number of entries as header. Returns an Array of String, one per line.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/miga/common/format.rb', line 11

def tabulate(header, values, tabular = false)
  fields = []
  fields << header.map(&:to_s) unless tabular && header.all?(&:nil?)
  fields << fields.first.map { |h| h.gsub(/\S/, '-') } unless tabular
  fields += values.map { |r| r.map { |cell| cell.nil? ? '?' : cell.to_s } }
  clen = tabular ? Array.new(header.size, 0) :
        fields.map { |r| r.map(&:length) }.transpose.map(&:max)
  fields.map do |r|
    (0..(clen.size - 1)).map do |col_n|
      col_n == 0 ? r[col_n].rjust(clen[col_n]) : r[col_n].ljust(clen[col_n])
    end.join(tabular ? "\t" : '  ')
  end
end