Class: Bio::FastQC::Parser
- Inherits:
-
Object
- Object
- Bio::FastQC::Parser
- Defined in:
- lib/bio/fastqc/parser.rb
Instance Method Summary
- #adapter_content ⇒ Object
-
#basic_statistics ⇒ Object
Basic Statistics module.
-
#encoding ⇒ Object
quality encoding method for input file type.
-
#fastqc_version ⇒ Object
software version of FastQC.
-
#file_type ⇒ Object
input file type.
-
#filename ⇒ Object
input filename for FastQC program.
-
#filtered_sequences ⇒ Object
number of sequence reads filtered out.
-
#get_module_matrix(module_name, num_of_header_rows) ⇒ Object
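Other modules: rows of the named module with the first num_of_header_rows rows dropped, or nil when the module is absent.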
-
#initialize(fastqc_data_txt) ⇒ Parser
constructor
A new instance of Parser.
- #kmer_content ⇒ Object
- #max_length ⇒ Object
- #mean_sequence_length ⇒ Object
- #median_sequence_length ⇒ Object
-
#min_length ⇒ Object
Custom modules: minimum sequence length, i.e. the lower bound of the sequence length value.
- #overall_mean_quality_score ⇒ Object
- #overall_median_quality_score ⇒ Object
- #overall_n_content ⇒ Object
- #overall_quality_score(mean_or_median) ⇒ Object
- #overrepresented_sequences ⇒ Object
- #parse ⇒ Object
- #parse_modules ⇒ Object
- #per_base_n_content ⇒ Object
- #per_base_quality_column(mean_or_median) ⇒ Object
- #per_base_sequence_content ⇒ Object
- #per_base_sequence_quality ⇒ Object
- #per_sequence_gc_content ⇒ Object
- #per_sequence_quality_scores ⇒ Object
- #per_tile_sequence_quality ⇒ Object
-
#percent_gc ⇒ Object
overall percentage of GC content.
- #sequence_duplication_levels ⇒ Object
-
#sequence_length ⇒ Object
sequence length, stored as a string because it can be a range.
- #sequence_length_distribution ⇒ Object
-
#sequences_flagged_as_poor_quality ⇒ Object
number of sequence reads flagged as poor quality.
- #summary ⇒ Object
- #total_duplicate_percentage ⇒ Object
-
#total_sequences ⇒ Object
total number of sequence reads.
Constructor Details
#initialize(fastqc_data_txt) ⇒ Parser
Returns a new instance of Parser.
6 7 8 9 10 |
# File 'lib/bio/fastqc/parser.rb', line 6
#
# Builds a parser around raw FastQC data text.
#
# @param fastqc_data_txt [String] text to parse (presumably the contents of
#   a fastqc_data.txt file); it is split into modules by #parse_modules
def initialize(fastqc_data_txt)
  @data = fastqc_data_txt
  # Order matters: #basic_statistics reads @module_results.
  @module_results = parse_modules
  @basic_statistics = basic_statistics
end
Instance Method Details
#adapter_content ⇒ Object
111 112 113 |
# File 'lib/bio/fastqc/parser.rb', line 111
#
# Rows of the "Adapter Content" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def adapter_content
  module_name = "Adapter Content"
  get_module_matrix(module_name, 1)
end
#basic_statistics ⇒ Object
Basic Statistics module
22 23 24 |
# File 'lib/bio/fastqc/parser.rb', line 22
#
# Basic Statistics module as a Hash.
#
# Every row of the first parsed module becomes one entry: column 0 is the
# key and column 1 the value. Rows are paired one at a time instead of
# flattening the whole matrix, so a row with a missing value (one column)
# or with extra columns can neither shift the keys of the following rows
# nor raise "odd number of arguments for Hash".
#
# @return [Hash{String => String, nil}] empty when no module was parsed
def basic_statistics
  rows = @module_results[0] || []
  rows.each_with_object({}) do |row, stats|
    next if row.empty? # blank line inside the module

    key, value = row
    stats[key] = value
  end
end
#encoding ⇒ Object
quality encoding method for input file type
38 39 40 |
# File 'lib/bio/fastqc/parser.rb', line 38
#
# Quality encoding method for the input file type.
#
# @return [String, nil] the "Encoding" entry of the basic statistics
def encoding
  @basic_statistics["Encoding"]
end
#fastqc_version ⇒ Object
software version of FastQC
26 27 28 |
# File 'lib/bio/fastqc/parser.rb', line 26
#
# Software version of FastQC.
#
# @return [String, nil] the "##FastQC" entry of the basic statistics
def fastqc_version
  @basic_statistics["##FastQC"]
end
#file_type ⇒ Object
input file type
34 35 36 |
# File 'lib/bio/fastqc/parser.rb', line 34
#
# Input file type.
#
# @return [String, nil] the "File type" entry of the basic statistics
def file_type
  @basic_statistics["File type"]
end
#filename ⇒ Object
input filename for FastQC program
30 31 32 |
# File 'lib/bio/fastqc/parser.rb', line 30
#
# Input filename for the FastQC program.
#
# @return [String, nil] the "Filename" entry of the basic statistics
def filename
  @basic_statistics["Filename"]
end
#filtered_sequences ⇒ Object
number of sequence reads filtered out
50 51 52 |
# File 'lib/bio/fastqc/parser.rb', line 50
#
# Number of sequence reads filtered out.
#
# @return [Integer] the "Filtered Sequences" entry converted with #to_i
#   (0 when the entry is missing)
def filtered_sequences
  raw_count = @basic_statistics["Filtered Sequences"]
  raw_count.to_i
end
#get_module_matrix(module_name, num_of_header_rows) ⇒ Object
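Other modules: rows of the named module with the first num_of_header_rows rows dropped, or nil when the module is absent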
66 67 68 69 |
# File 'lib/bio/fastqc/parser.rb', line 66
#
# Looks up a parsed module by name and strips its leading rows.
#
# A module matches when the first cell of its first row equals
# ">>#{module_name}". Modules without any row (for example a blank chunk
# after the last end-of-module marker) are skipped instead of raising
# NoMethodError.
#
# @param module_name [String] module title without the leading ">>"
# @param num_of_header_rows [Integer] number of leading rows to drop
# @return [Array<Array<String>>, nil] remaining rows, or nil when no module
#   matches
def get_module_matrix(module_name, num_of_header_rows)
  header = ">>#{module_name}"
  mod = @module_results.find do |rows|
    first_row = rows[0]
    first_row && first_row[0] == header
  end
  mod.drop(num_of_header_rows) if mod
end
#kmer_content ⇒ Object
115 116 117 |
# File 'lib/bio/fastqc/parser.rb', line 115
#
# Rows of the "Kmer Content" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def kmer_content
  module_name = "Kmer Content"
  get_module_matrix(module_name, 1)
end
#max_length ⇒ Object
127 128 129 |
# File 'lib/bio/fastqc/parser.rb', line 127
#
# Maximum sequence length: the sequence length value with a leading
# "<digits>-" prefix removed, converted with #to_i.
#
# @return [Integer]
def max_length
  upper_bound = sequence_length.sub(/^\d+-/,"")
  upper_bound.to_i
end
#mean_sequence_length ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/bio/fastqc/parser.rb', line 163
#
# Mean sequence length estimated from the sequence length distribution.
#
# Each bin contributes its midpoint ("40-44" counts as 42.0, "40" as 40.0)
# weighted by its read count. A distribution with a single bin now uses the
# same midpoint rule as the multi-bin case; previously a lone range bin such
# as "100-104" was reported as its lower bound (100.0).
#
# @return [Float, nil] nil when the distribution is absent or has no data rows
def mean_sequence_length
  distribution = sequence_length_distribution
  return nil unless distribution

  dist = distribution.drop(1) # drop column header
  return nil if dist.empty?

  # Midpoint of a length label; a plain number is its own midpoint.
  midpoint = lambda do |label|
    (label.sub(/-\d+$/, "").to_f + label.sub(/^\d+-/, "").to_f) / 2
  end

  # A single bin needs no weighting (and must not divide by its count).
  return midpoint.call(dist[0][0]) if dist.size == 1

  weighted = dist.map { |label, count| midpoint.call(label) * count.to_f }
  counts = dist.map { |_label, count| count.to_f }
  weighted.reduce(:+) / counts.reduce(:+)
end
#median_sequence_length ⇒ Object
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# File 'lib/bio/fastqc/parser.rb', line 177
#
# Median sequence length estimated from the sequence length distribution.
#
# Walks the bins in order until half of the total read count is covered and
# returns the midpoint of that bin ("40-44" counts as 42.0). A distribution
# with a single bin now uses the same midpoint rule; previously a lone range
# bin such as "100-104" was reported as its lower bound (100.0).
#
# @return [Float, Integer, nil] nil when the distribution is absent or has no
#   data rows; Integer 0 only when no bin is reached (kept from the original)
def median_sequence_length
  distribution = sequence_length_distribution
  return nil unless distribution

  dist = distribution.drop(1) # drop column header
  return nil if dist.empty?

  # Midpoint of a length label; a plain number is its own midpoint.
  midpoint = lambda do |label|
    (label.sub(/-\d+$/, "").to_f + label.sub(/^\d+-/, "").to_f) / 2
  end

  return midpoint.call(dist[0][0]) if dist.size == 1

  remaining = dist.map { |_label, count| count.to_f }.reduce(:+) / 2 # position of median
  median = 0
  dist.each do |label, count|
    reads = count.to_f # count of reads in this length range
    if remaining > reads
      remaining -= reads
    else
      median = midpoint.call(label)
      break
    end
  end
  median
end
#min_length ⇒ Object
Custom modules: minimum sequence length, i.e. the lower bound of the sequence length value
123 124 125 |
# File 'lib/bio/fastqc/parser.rb', line 123
#
# Minimum sequence length: the sequence length value with a trailing
# "-<digits>" suffix removed, converted with #to_i.
#
# @return [Integer]
def min_length
  lower_bound = sequence_length.sub(/-\d+$/,"")
  lower_bound.to_i
end
#overall_mean_quality_score ⇒ Object
149 150 151 |
# File 'lib/bio/fastqc/parser.rb', line 149
#
# Overall quality score computed from the :mean per-base column.
#
# @return [Float] result of #overall_quality_score for :mean
def overall_mean_quality_score
  statistic = :mean
  overall_quality_score(statistic)
end
#overall_median_quality_score ⇒ Object
153 154 155 |
# File 'lib/bio/fastqc/parser.rb', line 153
#
# Overall quality score computed from the :median per-base column.
#
# @return [Float] result of #overall_quality_score for :median
def overall_median_quality_score
  statistic = :median
  overall_quality_score(statistic)
end
#overall_n_content ⇒ Object
157 158 159 160 161 |
# File 'lib/bio/fastqc/parser.rb', line 157
#
# Unweighted mean of the second column of the per-base N content rows.
#
# #per_base_n_content still carries the column-header row, so it is dropped
# before averaging (as #overall_quality_score does for its own module).
# Without this the header text was converted to 0.0 and counted as an extra
# position, pulling the mean down.
#
# @return [Float, nil] nil when the module is absent or has no data rows
def overall_n_content
  per_base = per_base_n_content
  return nil unless per_base

  values = per_base.drop(1).map { |row| row[1].to_f } # drop column header
  return nil if values.empty?

  values.reduce(:+) / values.size
end
#overall_quality_score(mean_or_median) ⇒ Object
140 141 142 143 144 145 146 147 |
# File 'lib/bio/fastqc/parser.rb', line 140
#
# Overall quality score across all per-base rows.
#
# Each per-base score q is turned into 10**(q / -10), those values are
# averaged, and the average is converted back with -10 * log10.
#
# @param mean_or_median [Symbol] column selector passed to
#   #per_base_quality_column (:mean or :median)
# @return [Float]
def overall_quality_score(mean_or_median)
  rows = per_base_sequence_quality.drop(1) # drop header
  column = per_base_quality_column(mean_or_median)
  converted = rows.map { |row| (10**(row[column].to_f / -10)).to_f }
  average = converted.inject(:+) / converted.length
  -10 * Math.log10(average)
end
#overrepresented_sequences ⇒ Object
107 108 109 |
# File 'lib/bio/fastqc/parser.rb', line 107
#
# Rows of the "Overrepresented sequences" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def overrepresented_sequences
  module_name = "Overrepresented sequences"
  get_module_matrix(module_name, 1)
end
#parse ⇒ Object
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/bio/fastqc/parser.rb', line 202
#
# Collects the result of every reader method into one Hash.
#
# Keys are the method names as symbols, inserted (and evaluated) in the
# order listed below; each value is whatever the method of that name returns.
#
# @return [Hash{Symbol => Object}]
def parse
  fields = [
    :fastqc_version,
    :filename,
    :file_type,
    :encoding,
    :total_sequences,
    :sequences_flagged_as_poor_quality,
    :filtered_sequences,
    :sequence_length,
    :percent_gc,
    :per_base_sequence_quality,
    :per_tile_sequence_quality,
    :per_sequence_quality_scores,
    :per_base_sequence_content,
    :per_sequence_gc_content,
    :per_base_n_content,
    :sequence_length_distribution,
    :total_duplicate_percentage,
    :sequence_duplication_levels,
    :overrepresented_sequences,
    :adapter_content,
    :kmer_content,
    :min_length,
    :max_length,
    :overall_mean_quality_score,
    :overall_median_quality_score,
    :overall_n_content,
    :mean_sequence_length,
    :median_sequence_length
  ]
  fields.each_with_object({}) do |field, report|
    report[field] = send(field)
  end
end
#parse_modules ⇒ Object
12 13 14 15 16 |
# File 'lib/bio/fastqc/parser.rb', line 12
#
# Splits the raw data text into modules, rows and cells.
#
# Modules are separated by ">>END_MODULE" markers, rows by line breaks and
# cells by tabs. Both LF and CRLF line endings are accepted, and a final
# marker that is not followed by a line break still terminates its module;
# with a literal ">>END_MODULE\n" separator, CRLF input was never split and a
# trailing marker stayed behind as a bogus row.
#
# @return [Array<Array<Array<String>>>] modules -> rows -> cells
def parse_modules
  @data.split(/>>END_MODULE(?:\r?\n|\z)/).map do |mod|
    mod.split(/\r?\n/).map { |line| line.split("\t") }
  end
end
#per_base_n_content ⇒ Object
91 92 93 |
# File 'lib/bio/fastqc/parser.rb', line 91
#
# Rows of the "Per base N content" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_base_n_content
  module_name = "Per base N content"
  get_module_matrix(module_name, 1)
end
#per_base_quality_column(mean_or_median) ⇒ Object
131 132 133 134 135 136 137 138 |
# File 'lib/bio/fastqc/parser.rb', line 131
#
# Column index used for the requested per-base quality statistic.
#
# @param mean_or_median [Symbol] :mean or :median
# @return [Integer, nil] 1 for :mean, 2 for :median, nil for anything else
def per_base_quality_column(mean_or_median)
  columns = { mean: 1, median: 2 }
  columns[mean_or_median]
end
#per_base_sequence_content ⇒ Object
83 84 85 |
# File 'lib/bio/fastqc/parser.rb', line 83
#
# Rows of the "Per base sequence content" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_base_sequence_content
  module_name = "Per base sequence content"
  get_module_matrix(module_name, 1)
end
#per_base_sequence_quality ⇒ Object
71 72 73 |
# File 'lib/bio/fastqc/parser.rb', line 71
#
# Rows of the "Per base sequence quality" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_base_sequence_quality
  module_name = "Per base sequence quality"
  get_module_matrix(module_name, 1)
end
#per_sequence_gc_content ⇒ Object
87 88 89 |
# File 'lib/bio/fastqc/parser.rb', line 87
#
# Rows of the "Per sequence GC content" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_sequence_gc_content
  module_name = "Per sequence GC content"
  get_module_matrix(module_name, 1)
end
#per_sequence_quality_scores ⇒ Object
79 80 81 |
# File 'lib/bio/fastqc/parser.rb', line 79
#
# Rows of the "Per sequence quality scores" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_sequence_quality_scores
  module_name = "Per sequence quality scores"
  get_module_matrix(module_name, 1)
end
#per_tile_sequence_quality ⇒ Object
75 76 77 |
# File 'lib/bio/fastqc/parser.rb', line 75
#
# Rows of the "Per tile sequence quality" module with the first row dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def per_tile_sequence_quality
  module_name = "Per tile sequence quality"
  get_module_matrix(module_name, 1)
end
#percent_gc ⇒ Object
overall percentage of GC content
58 59 60 |
# File 'lib/bio/fastqc/parser.rb', line 58
#
# Overall percentage of GC content.
#
# @return [Float] the "%GC" entry converted with #to_f (0.0 when missing)
def percent_gc
  raw_percentage = @basic_statistics["%GC"]
  raw_percentage.to_f
end
#sequence_duplication_levels ⇒ Object
103 104 105 |
# File 'lib/bio/fastqc/parser.rb', line 103
#
# Rows of the "Sequence Duplication Levels" module with the first two rows
# dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def sequence_duplication_levels
  module_name = "Sequence Duplication Levels"
  get_module_matrix(module_name, 2)
end
#sequence_length ⇒ Object
sequence length, stored as a string because it can be a range
54 55 56 |
# File 'lib/bio/fastqc/parser.rb', line 54
#
# Sequence length, kept as a string because the value can be a range.
#
# @return [String, nil] the "Sequence length" entry of the basic statistics
def sequence_length
  @basic_statistics["Sequence length"]
end
#sequence_length_distribution ⇒ Object
95 96 97 |
# File 'lib/bio/fastqc/parser.rb', line 95
#
# Rows of the "Sequence Length Distribution" module with the first row
# dropped.
#
# @return [Array<Array<String>>, nil] nil when the module is absent
def sequence_length_distribution
  module_name = "Sequence Length Distribution"
  get_module_matrix(module_name, 1)
end
#sequences_flagged_as_poor_quality ⇒ Object
number of sequence reads flagged as poor quality
46 47 48 |
# File 'lib/bio/fastqc/parser.rb', line 46
#
# Number of sequence reads flagged as poor quality.
#
# @return [Integer] the "Sequences flagged as poor quality" entry converted
#   with #to_i (0 when the entry is missing)
def sequences_flagged_as_poor_quality
  raw_count = @basic_statistics["Sequences flagged as poor quality"]
  raw_count.to_i
end
#summary ⇒ Object
198 199 200 |
# File 'lib/bio/fastqc/parser.rb', line 198
#
# Alias-style wrapper: returns exactly what #parse returns.
#
# @return [Hash] the Hash built by #parse
def summary
  report = parse
  report
end
#total_duplicate_percentage ⇒ Object
99 100 101 |
# File 'lib/bio/fastqc/parser.rb', line 99
#
# Second cell of the first row that follows the module title row of the
# "Sequence Duplication Levels" module, converted with #to_f.
#
# Returns nil when the module (or that row) is missing, matching the other
# module readers; previously a missing module raised NoMethodError here and
# therefore aborted #parse as a whole.
#
# @return [Float, nil]
def total_duplicate_percentage
  rows = get_module_matrix("Sequence Duplication Levels", 1)
  first_row = rows && rows[0]
  first_row[1].to_f if first_row
end
#total_sequences ⇒ Object
total number of sequence reads
42 43 44 |
# File 'lib/bio/fastqc/parser.rb', line 42
#
# Total number of sequence reads.
#
# @return [Integer] the "Total Sequences" entry converted with #to_i
#   (0 when the entry is missing)
def total_sequences
  raw_count = @basic_statistics["Total Sequences"]
  raw_count.to_i
end