Top Level Namespace

Defined Under Namespace

Modules: Enumerable, FastqAssessment, MiseqRunStats

Instance Method Summary collapse

Methods included from MiseqRunStats

#parse_assembly_run_stats, #parse_resequencing_run_stats

Methods included from FastqAssessment

#generate_quality_stats_for_read, #percentage_compared_to_raw

Instance Method Details

#calculate_read_length(filename) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/trim_and_correct.rb', line 70

def calculate_read_length(filename)
  read_length = nil
  File.open(filename) do |f|
    f.each do |line|
     line.chomp!
     if line =~ /^[GATCgatc]/
       read_length = line.size
       break
     end
    end
  end
  return read_length - 1
end

#delete_file_with_dependencies(file, *dependencies) ⇒ Object



63
64
65
66
67
68
# File 'lib/trim_and_correct.rb', line 63

def delete_file_with_dependencies(file, *dependencies)
  dependencies << file
  if dependencies.all?{|dependency| File.exists?(dependency)} && File.exists?(file)
    File.delete(file)
  end
end

#extract_file_prefixes_and_sample_name(sample_map_file, directory) ⇒ Object



4
5
6
7
8
9
10
11
# File 'lib/fastq-factory.rb', line 4

def extract_file_prefixes_and_sample_name(sample_map_file, directory)
  sample_map = Hash.new
  File.read("#{directory}/#{sample_map_file}").split("\n").each do |sample_map_line|
    file_prefix, sample_name = sample_map_line.split("\t")
    sample_map[file_prefix] = sample_name
  end
  return sample_map
end

#file_exists?(directory, *filenames) ⇒ Boolean

Returns:

  • (Boolean)


13
14
15
16
17
18
19
# File 'lib/fastq-factory.rb', line 13

def file_exists?(directory, *filenames)
  at_least_one_file_found = false
  filenames.each do |filename|
    at_least_one_file_found = true  if File.exists?("#{directory}/#{filename}")
  end
  abort("You specified a file(s): #{filenames.join(", ")}. At least one of these must exist! Please check your sample map file") unless at_least_one_file_found
end

#find_executable(executable_name, directory = nil) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/fastq-factory.rb', line 21

def find_executable(executable_name, directory = nil)
  if directory.nil?
    if which(executable_name)
      return which(executable_name)
    elsif File.executable?("/usr/local/bin/#{executable_name}")
      return "/usr/local/bin/#{executable_name}"
    elsif File.executable?("/usr/local/#{executable_name}/#{executable_name}")
      return "/usr/local/#{executable_name}/#{executable_name}"
    else
      return nil
    end
  else
    if File.executable?("#{directory}/#{executable_name}")
      return "#{directory}/#{executable_name}"
    else
      return nil
    end
  end
end

#generate_quality_metrics(sample_map, directory, forward_reads_suffix, reverse_reads_suffix, quality_scale, quality_cutoff) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/generate_quality_metrics.rb', line 6

def generate_quality_metrics(sample_map, directory, forward_reads_suffix, reverse_reads_suffix, quality_scale, quality_cutoff)
  if File.exists?("#{directory}/ResequencingRunStatistics.xml")
    puts "Assessing quality from Miseq run stats file"
    run_stats = parse_resequencing_run_stats("#{directory}/ResequencingRunStatistics.xml", sample_map.values)
  elsif File.exists?("#{directory}/AssemblyRunStatistics.xml")
    puts "Assessing quality from Miseq run stats file"
    run_stats = parse_assembly_run_stats("#{directory}/AssemblyRunStatistics.xml", sample_map.values)
  else
    run_stats = ResequencingRunStats.new
    run_stats.sample_stats = Hash.new
    sample_map.values.each do |sample_name|
      run_stats.sample_stats[sample_name] = ResequencingSampleStats.new
    end
  end

  forward_reads_trimmed_suffix = forward_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed\2')
  reverse_reads_trimmed_suffix = reverse_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed\2')

  forward_reads_trimmed_corrected_suffix = forward_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')
  reverse_reads_trimmed_corrected_suffix = reverse_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')

  sample_map.each do |read_file_prefix, sample_name|
    puts "Assesing quality for #{sample_name}"
    run_stats.sample_stats[sample_name].fastq_stats = Hash.new
    run_stats.sample_stats[sample_name].fastq_stats["forward"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_suffix}",quality_scale, quality_cutoff)
    run_stats.sample_stats[sample_name].fastq_stats["reverse"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_suffix}",quality_scale, quality_cutoff)
    if File.exists?("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}")
      run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}",quality_scale, quality_cutoff)
      run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}", "#{directory}/#{read_file_prefix}#{forward_reads_suffix}")
      run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_corrected_suffix}",quality_scale, quality_cutoff)
      run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_corrected_suffix}", "#{directory}/#{read_file_prefix}#{reverse_reads_suffix}")
    else
      run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_suffix}",quality_scale, quality_cutoff)
      run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_suffix}", "#{directory}/#{read_file_prefix}#{forward_reads_suffix}")
      run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_suffix}",quality_scale, quality_cutoff)
      run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_suffix}", "#{directory}/#{read_file_prefix}#{reverse_reads_suffix}")
    end

  end
  # print out data
  output_file = File.open("#{directory}/summary_stats.txt", "w")
  # print headers
  output_file.puts "run name\tnumber of bases(Gb)\tnumber of clusters\tsample name\tdirection\tnumber of clusters\tnumber of forward reads aligned\tnumber of reverse reads aligned\tcoverage\tnumber of snps\tnumber of contigs\tmean contig size\tn50\tnumber of bases\tmean quality\tread base where qual falls below 30\tpercent reduction compared to raw"
  output_file.puts "#{directory.match(/.*\/(.+?)$/).captures.first}\t#{run_stats.number_of_bases}\t#{run_stats.number_of_clusters}"
  run_stats.sample_stats.keys.sort.each do |sample_name|
    sample_stats = run_stats.sample_stats[sample_name]
    if sample_stats.class == Struct::ResequencingSampleStats
      output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t#{sample_stats.number_of_forward_reads_aligned}\t#{sample_stats.number_of_reverse_reads_aligned}\t#{sample_stats.coverage}\t#{sample_stats.number_of_snps}"
    elsif sample_stats.class == Struct::AssemblySampleStats
      output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t\t\t\t\t#{sample_stats.number_of_contigs}\t#{sample_stats.mean_contig_size}\t#{sample_stats.n50}\t#{sample_stats.number_of_bases}"
    end
    ["forward", "reverse", "forward-trim_corrected", "reverse-trim_corrected"].each do |direction|
      fastq_stats = run_stats.sample_stats[sample_name].fastq_stats[direction]
      output_file.puts "\t\t\t\t#{direction}\t\t\t\t\t\t\t\t\t\t#{fastq_stats.mean_quality}\t#{fastq_stats.position_where_quality_lt_20}\t#{fastq_stats.percentage_compared_to_raw}"
    end
  end
  output_file.close
end

#trim_and_correct_fastqs(sample_map, directory, forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, quality_scale, fastq_quality_trimmer_path, quake_path, trim_point_fraction, trim_quality_cutoff) ⇒ Object



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/trim_and_correct.rb', line 1

def trim_and_correct_fastqs(sample_map, directory, forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, quality_scale, fastq_quality_trimmer_path, quake_path,trim_point_fraction, trim_quality_cutoff)
  Dir.chdir(directory)
  # trimming
  sample_map.each do |sample_file_prefix, sample_name|
    next if File.exists?("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}") || File.exists?("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
    puts "Trimming files for #{sample_name}"
    #determine read length
    read_length = calculate_read_length("#{directory}/#{sample_file_prefix}#{forward_reads_suffix}.#{forward_reads_file_extension}")
    trim_point = (trim_point_fraction * read_length).to_i
    unless File.exists?("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
      `#{fastq_quality_trimmer_path} -i #{directory}/#{sample_file_prefix}#{forward_reads_suffix}.#{forward_reads_file_extension} -o #{directory}/#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension} -t #{trim_quality_cutoff} -l #{trim_point} -Q #{quality_scale} -v`
      `#{fastq_quality_trimmer_path} -i #{directory}/#{sample_file_prefix}#{reverse_reads_suffix}.#{reverse_reads_file_extension} -o #{directory}/#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension} -t #{trim_quality_cutoff} -l #{trim_point} -Q #{quality_scale} -v`
    end
    `perl /tmp/fastq-remove-orphans.pl -1 #{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension} -2 #{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}`
  end

  #  quake correction
  # write file for quake
  sample_map.each do |sample_file_prefix, sample_name|
    next if File.exists?("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}") || File.exists?(("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}"))
    puts "Error correcting files for #{sample_name}"
    output_file = File.open("quake_file_list.txt","w")
    output_file.puts "paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension} paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}"
    output_file.close
    # run quake
    `#{quake_path} -f quake_file_list.txt -k 15 -q #{quality_scale}`
  end
  sample_map.each do |sample_file_prefix, sample_name|
    next if File.exists?("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
    # remove orphans
    `perl /tmp/fastq-remove-orphans.pl -1 paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension} -2 paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}`
  end

  # cleanup and rename files

  sample_map.each do |sample_file_prefix, sample_name|
    delete_file_with_dependencies("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}", "paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
    delete_file_with_dependencies("#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}", "paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
    delete_file_with_dependencies("orphaned_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
    delete_file_with_dependencies("orphaned_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
    if File.exists?("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
      delete_file_with_dependencies("error_model.paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.txt")
      delete_file_with_dependencies("error_model.paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.txt")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.stats.txt")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor_single.#{forward_reads_file_extension}")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.stats.txt")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor_single.#{forward_reads_file_extension}")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
      delete_file_with_dependencies("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
      delete_file_with_dependencies("orphaned_paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
      delete_file_with_dependencies("orphaned_paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
      File.rename("paired_paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}", "#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}") if File.exists?("paired_paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
      File.rename("paired_paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}", "#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}") if File.exists?("paired_paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
    else
      File.rename("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}", "#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}") if File.exists?("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
      File.rename("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}", "#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}") if File.exists?("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
    end
  end
end

#which(cmd) ⇒ Object

meethod to return path to command if it is in the path (works in windows)

Parameters:

  • String

    cmd the name of the command



43
44
45
46
47
48
49
50
51
52
# File 'lib/fastq-factory.rb', line 43

def which(cmd)
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
  ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
    exts.each { |ext|
      exe = "#{path}/#{cmd}#{ext}"
      return exe if File.executable? exe
    }
  end
  return nil
end

#write_out_fastq_trim_scriptObject



54
55
56
# File 'lib/fastq-factory.rb', line 54

def write_out_fastq_trim_script
  system("cp #{File.dirname(__FILE__)}/fastq-remove-orphans.pl /tmp/")
end