Module: MiGA::Common::Format

Included in:
MiGA
Defined in:
lib/miga/common/format.rb

Overview

General formatting functions shared throughout MiGA.

Instance Method Summary collapse

Instance Method Details

#clean_fasta_file(file) ⇒ Object

Cleans a FastA file in place.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/miga/common/format.rb', line 26

##
# Cleans a FastA file in place.
#
# The file at +file+ is rewritten so that:
# - Each defline keeps its description, but any character in the sequence ID
#   (the first whitespace-delimited word) other than letters, digits, `_`,
#   `|`, or `.` is replaced by `_`.
# - Sequence lines keep only letters, `.`, and `-`, and are re-wrapped with
#   String#wrap_width(80), a helper defined elsewhere in the project.
#
# Files whose name ends in `.gz` are read and written as gzip; any other file
# is treated as plain text. Returns the value of FileUtils.cp.
def clean_fasta_file(file)
  # Only a trailing `.gz` marks a gzipped file; a `.gz` elsewhere in the path
  # (e.g. `reads.gz.fasta`) must not trigger gzip handling
  gzipped = file =~ /\.gz\z/

  # Keep the Tempfile referenced until the end of the method: once it is
  # garbage-collected, its finalizer deletes the path, even if still in use
  tmp = Tempfile.new(gzipped ? 'MiGA.gz' : 'MiGA')
  fh = nil
  tmp_fh = nil
  begin
    if gzipped
      tmp.close
      tmp_fh = Zlib::GzipWriter.open(tmp.path)
      fh = Zlib::GzipReader.open(file)
    else
      tmp_fh = tmp
      fh = File.open(file, 'r')
    end

    buffer = ''
    fh.each_line do |ln|
      ln.chomp!
      if ln =~ /^>\s*(\S+)(.*)/
        id = Regexp.last_match(1)
        df = Regexp.last_match(2)
        # Flush the previous sequence before starting a new entry
        tmp_fh.print buffer.wrap_width(80)
        buffer = ''
        tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}"
      else
        buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
      end
    end
    tmp_fh.print buffer.wrap_width(80)

    # Both handles must be closed (and flushed) before overwriting the input
    tmp_fh.close
    fh.close
    FileUtils.cp(tmp.path, file)
  ensure
    # Best-effort cleanup: a failure to close must not mask the exception
    # that brought us here, but only I/O-related errors are ignored
    [fh, tmp_fh].compact.each do |io|
      begin
        io.close unless io.closed?
      rescue IOError, SystemCallError, Zlib::Error
        nil
      end
    end
    tmp.close! # Closes (if needed) and unlinks the temporary file
  end
end

#seqs_length(file, format, opts = {}) ⇒ Object

Calculates the average and standard deviation of the sequence lengths in a FastA or FastQ file (supports gzipped files). The format must be a Symbol, one of :fasta or :fastq. Additional estimations can be controlled via the opts Hash. Supported options include:

  • :n50: If true, it also returns the N50 and the median (in bp).

  • :gc: If true, it also returns the G+C content (in %).



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/miga/common/format.rb', line 70

##
# Calculates summary statistics of the sequence lengths in a FastA or FastQ
# file. Files whose name ends in `.gz` are read as gzip.
#
# +format+ must be a Symbol, one of +:fasta+ or +:fastq+. FastQ input is
# parsed assuming exactly four lines per record.
#
# Supported +opts+:
# - +:n50+: If true, it also returns the N50 and the median length (in bp).
# - +:gc+: If true, it also returns the G+C content (in %).
#
# Returns a Hash with keys +:n+ (number of sequences), +:tot+ (total length),
# +:avg+, +:var+, and +:sd+ (average, variance, and standard deviation of the
# lengths), plus +:gc+, +:n50+, and +:med+ when requested. For input without
# sequences, +:tot+ is 0, the averages are NaN, and +:n50+ and +:med+ are nil.
def seqs_length(file, format, opts = {})
  # Only a trailing `.gz` marks a gzipped file
  fh = file =~ /\.gz\z/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
  l = []
  gc = 0
  i = 0 # Line counter kept by hand: Zlib::GzipReader doesn't set `$.`
  begin
    fh.each_line do |ln|
      i += 1
      if (format == :fasta && ln =~ /^>/) ||
            (format == :fastq && (i % 4) == 1)
        # Header line: start a new sequence
        l << 0
      elsif format == :fasta || (i % 4) == 2
        # Sequence line: extend the current sequence
        l[-1] += ln.chomp.size
        gc += ln.count('GCgc') if opts[:gc]
      end
    end
  ensure
    fh.close
  end

  # An explicit initial value keeps the sums at 0 (not nil) for empty input
  o = { n: l.size, tot: l.inject(0, :+) }
  o[:avg] = o[:tot].to_f / l.size
  o[:var] = l.map { |a| a**2 }.inject(0, :+).to_f / l.size - o[:avg]**2
  o[:sd]  = Math.sqrt o[:var]
  o[:gc]  = 100.0 * gc / o[:tot] if opts[:gc]
  if opts[:n50]
    l.sort!
    # N50 is the length of the shortest sequence in the smallest set of
    # longest sequences covering at least half of the total, so accumulate
    # from the longest and compare against the exact (non-truncated) half
    half = o[:tot] / 2.0
    covered = 0
    o[:n50] = nil
    l.reverse_each do |a|
      covered += a
      o[:n50] = a
      break if covered >= half
    end
    o[:med] =
      if l.empty?
        nil
      elsif o[:n].even?
        0.5 * l[o[:n] / 2 - 1, 2].inject(:+)
      else
        l[(o[:n] - 1) / 2]
      end
  end
  o
end

#tabulate(header, values, tabular = false) ⇒ Object

Tabulates values, an Array of Arrays, each with the same number of entries as header. Returns an Array of String, one per line.



11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/miga/common/format.rb', line 11

##
# Formats +values+ (an Array of rows, each an Array of cells) as a table
# with the column titles in +header+. Returns an Array of String, one per
# output line, starting with the header line.
#
# Cells are converted with +to_s+, and nil cells are shown as `?`.
#
# If +tabular+ is false (default), a line underlining the header with `-` is
# added, columns are padded to a common width (first column right-justified,
# all others left-justified), and cells are separated by two spaces. If
# +tabular+ is true, cells are not padded and are separated by a tab.
def tabulate(header, values, tabular = false)
  titles = header.map(&:to_s)
  rows = [titles]
  rows.push(titles.map { |title| title.gsub(/\S/, '-') }) unless tabular
  values.each do |record|
    rows.push(record.map { |cell| cell.nil? ? '?' : cell.to_s })
  end

  # Zero widths make rjust/ljust no-ops in tabular mode
  widths =
    if tabular
      Array.new(header.size, 0)
    else
      rows.map { |row| row.map(&:length) }.transpose.map(&:max)
    end
  separator = tabular ? "\t" : '  '

  rows.map do |row|
    cells = widths.each_with_index.map do |width, idx|
      idx.zero? ? row[idx].rjust(width) : row[idx].ljust(width)
    end
    cells.join(separator)
  end
end