Class: Bio::FastQC::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/bio/fastqc/parser.rb

Instance Method Summary collapse

Constructor Details

#initialize(fastqc_data_txt) ⇒ Parser

Returns a new instance of Parser.



6
7
8
9
10
# File 'lib/bio/fastqc/parser.rb', line 6

# Builds a parser around the raw text of a FastQC data report.
#
# fastqc_data_txt is stored as-is in @data; parse_modules calls #split on it,
# so it is expected to be a String (or to behave like one).
def initialize(fastqc_data_txt)
  @data = fastqc_data_txt
  # Order matters: parse_modules reads @data, and basic_statistics reads
  # @module_results, so these assignments cannot be rearranged.
  @module_results = parse_modules
  @basic_statistics = basic_statistics
end

Instance Method Details

#adapter_content ⇒ Object



111
112
113
# File 'lib/bio/fastqc/parser.rb', line 111

# Rows of the "Adapter Content" module with the module title row removed,
# as returned by get_module_matrix.
def adapter_content
  module_name = "Adapter Content"
  get_module_matrix(module_name, 1)
end

#basic_statistics ⇒ Object

Basic Statistics module



22
23
24
# File 'lib/bio/fastqc/parser.rb', line 22

# Basic Statistics module.
#
# Converts the rows of the first parsed module into a Hash that maps the
# first field of each row to its second field.
#
# Rows are paired up individually rather than by flattening the whole module,
# so a row with a missing value maps to nil instead of raising ArgumentError,
# and a row with extra fields cannot shift the keys of the rows after it.
# Rows without any field are skipped. Returns an empty Hash when no module
# has been parsed.
def basic_statistics
  rows = (@module_results && @module_results[0]) || []
  rows.each_with_object({}) do |row, stats|
    next if row.empty?
    stats[row[0]] = row[1]
  end
end

#encoding ⇒ Object

quality encoding method for input file type



38
39
40
# File 'lib/bio/fastqc/parser.rb', line 38

# Quality encoding method for the input file type, read from the
# "Encoding" entry of the Basic Statistics hash.
def encoding
  key = "Encoding"
  @basic_statistics[key]
end

#fastqc_version ⇒ Object

software version of FastQC



26
27
28
# File 'lib/bio/fastqc/parser.rb', line 26

# Software version of FastQC, read from the "##FastQC" entry of the
# Basic Statistics hash.
def fastqc_version
  key = "##FastQC"
  @basic_statistics[key]
end

#file_type ⇒ Object

input file type



34
35
36
# File 'lib/bio/fastqc/parser.rb', line 34

# Input file type, read from the "File type" entry of the Basic Statistics
# hash.
def file_type
  key = "File type"
  @basic_statistics[key]
end

#filename ⇒ Object

input filename for FastQC program



30
31
32
# File 'lib/bio/fastqc/parser.rb', line 30

# Input filename for the FastQC program, read from the "Filename" entry of
# the Basic Statistics hash.
def filename
  key = "Filename"
  @basic_statistics[key]
end

#filtered_sequences ⇒ Object

number of sequence reads filtered out



50
51
52
# File 'lib/bio/fastqc/parser.rb', line 50

# Number of sequence reads filtered out, as an Integer.
# A missing "Filtered Sequences" entry yields 0 because nil.to_i is 0.
def filtered_sequences
  raw_count = @basic_statistics["Filtered Sequences"]
  raw_count.to_i
end

#get_module_matrix(module_name, num_of_header_rows) ⇒ Object

Other modules



66
67
68
69
# File 'lib/bio/fastqc/parser.rb', line 66

# Other modules
#
# Looks up a parsed module by its title and returns its rows without the
# leading header rows.
#
# module_name        - module title as it appears after ">>" in the report
# num_of_header_rows - number of leading rows to drop from the module
#
# Returns the remaining rows, or nil when no module carries that title.
def get_module_matrix(module_name, num_of_header_rows)
  title = ">>#{module_name}"
  # A module without rows (for instance one produced by a blank line after
  # the last ">>END_MODULE") has no title row; skip it instead of indexing
  # into nil.
  mod = @module_results.find { |m| m[0] && m[0][0] == title }
  mod.drop(num_of_header_rows) if mod
end

#kmer_content ⇒ Object



115
116
117
# File 'lib/bio/fastqc/parser.rb', line 115

# Rows of the "Kmer Content" module with the module title row removed,
# as returned by get_module_matrix.
def kmer_content
  module_name = "Kmer Content"
  get_module_matrix(module_name, 1)
end

#max_length ⇒ Object



127
128
129
# File 'lib/bio/fastqc/parser.rb', line 127

# Upper bound of the reported sequence length as an Integer.
# For a range such as "35-76" the leading "35-" is stripped first; a single
# value such as "101" is converted as-is.
def max_length
  length_text = sequence_length
  upper_bound = length_text.sub(/^\d+-/, "")
  upper_bound.to_i
end

#mean_sequence_length ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/bio/fastqc/parser.rb', line 163

# Mean read length estimated from the sequence length distribution.
#
# Each data row is [length, count]. A length is either a single value ("101")
# or a binned range ("30-39"); a range is represented by the midpoint of its
# two bounds. A distribution with a single row now uses the same midpoint
# rule, so a lone "30-39" bin yields 34.5 instead of its lower bound.
#
# Returns a Float, or nil when the distribution is missing or has no data
# rows.
def mean_sequence_length
  rows = sequence_length_distribution
  return nil unless rows # module absent from the report
  dist = rows.drop(1) # drop column header
  return nil if dist.empty?

  # For a single value both substitutions leave the text unchanged, so the
  # "midpoint" is the value itself.
  midpoint = lambda do |l|
    (l.sub(/-\d+$/, "").to_f + l.sub(/^\d+-/, "").to_f) / 2
  end
  return midpoint.call(dist[0][0]) if dist.size == 1

  counts = dist.map { |l_c| l_c[1].to_f }
  weighted = dist.zip(counts).map { |l_c, c| midpoint.call(l_c[0]) * c }
  weighted.reduce(:+) / counts.reduce(:+)
end

#median_sequence_length ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/bio/fastqc/parser.rb', line 177

# Median read length estimated from the sequence length distribution.
#
# Each data row is [length, count]. A length is either a single value ("101")
# or a binned range ("30-39"); a range is represented by the midpoint of its
# two bounds. A distribution with a single row now uses the same midpoint
# rule, so a lone "30-39" bin yields 34.5 instead of its lower bound.
#
# Returns a Float, or nil when the distribution is missing or has no data
# rows.
def median_sequence_length
  rows = sequence_length_distribution
  return nil unless rows # module absent from the report
  dist = rows.drop(1) # drop column header
  return nil if dist.empty?

  # For a single value both substitutions leave the text unchanged, so the
  # "midpoint" is the value itself.
  midpoint = lambda do |l|
    (l.sub(/-\d+$/, "").to_f + l.sub(/^\d+-/, "").to_f) / 2
  end
  return midpoint.call(dist[0][0]) if dist.size == 1

  k = dist.map { |l_c| l_c[1].to_f }.reduce(:+) / 2 # position of median
  dist.each do |l_c|
    c = l_c[1].to_f # count of reads in this length range
    # The median falls in the first bin whose count covers the remaining
    # position; otherwise consume the bin and keep walking.
    return midpoint.call(l_c[0]) unless k > c
    k -= c
  end
  0 # no bin covered the median position
end

#min_length ⇒ Object

Custom modules



123
124
125
# File 'lib/bio/fastqc/parser.rb', line 123

# Lower bound of the reported sequence length as an Integer.
# For a range such as "35-76" the trailing "-76" is stripped first; a single
# value such as "101" is converted as-is.
def min_length
  length_text = sequence_length
  lower_bound = length_text.sub(/-\d+$/, "")
  lower_bound.to_i
end

#overall_mean_quality_score ⇒ Object



149
150
151
# File 'lib/bio/fastqc/parser.rb', line 149

# Overall quality score computed by overall_quality_score from the
# per-base mean column.
def overall_mean_quality_score
  statistic = :mean
  overall_quality_score(statistic)
end

#overall_median_quality_score ⇒ Object



153
154
155
# File 'lib/bio/fastqc/parser.rb', line 153

# Overall quality score computed by overall_quality_score from the
# per-base median column.
def overall_median_quality_score
  statistic = :median
  overall_quality_score(statistic)
end

#overall_n_content ⇒ Object



157
158
159
160
161
# File 'lib/bio/fastqc/parser.rb', line 157

# Unweighted average of the second column of the per-base N content rows.
#
# per_base_n_content still starts with the column-header row, so that row is
# dropped here; otherwise its label converts to 0.0 and is counted as an
# extra position, pulling the average down.
#
# Returns a Float, or nil when the module is missing or has no data rows.
def overall_n_content
  rows = per_base_n_content
  return nil unless rows # module absent from the report
  v = rows.drop(1).map { |c| c[1].to_f } # drop column header
  return nil if v.empty?
  v.reduce(:+) / v.size
end

#overall_quality_score(mean_or_median) ⇒ Object



140
141
142
143
144
145
146
147
# File 'lib/bio/fastqc/parser.rb', line 140

# Collapses the per-base quality rows into one overall score.
#
# Each per-base value q in the selected column is converted to 10**(q / -10),
# those values are averaged, and the average is converted back with
# -10 * log10.
#
# mean_or_median - passed to per_base_quality_column to pick the column.
def overall_quality_score(mean_or_median)
  rows = per_base_sequence_quality.drop(1) # drop header
  col = per_base_quality_column(mean_or_median)
  error_rates = rows.map { |fields| (10**(fields[col].to_f / -10)).to_f }
  mean_error_rate = error_rates.reduce(:+) / error_rates.size
  -10 * Math.log10(mean_error_rate)
end

#overrepresented_sequences ⇒ Object



107
108
109
# File 'lib/bio/fastqc/parser.rb', line 107

# Rows of the "Overrepresented sequences" module with the module title row
# removed, as returned by get_module_matrix.
def overrepresented_sequences
  module_name = "Overrepresented sequences"
  get_module_matrix(module_name, 1)
end

#parse ⇒ Object



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/bio/fastqc/parser.rb', line 202

# Collects every reader of this parser into one Hash.
#
# Each key is the symbol naming a reader method and each value is what that
# method returns. Readers are invoked in the order listed, which is also the
# key order of the resulting Hash.
def parse
  readers = [
    :fastqc_version,
    :filename,
    :file_type,
    :encoding,
    :total_sequences,
    :sequences_flagged_as_poor_quality,
    :filtered_sequences,
    :sequence_length,
    :percent_gc,
    :per_base_sequence_quality,
    :per_tile_sequence_quality,
    :per_sequence_quality_scores,
    :per_base_sequence_content,
    :per_sequence_gc_content,
    :per_base_n_content,
    :sequence_length_distribution,
    :total_duplicate_percentage,
    :sequence_duplication_levels,
    :overrepresented_sequences,
    :adapter_content,
    :kmer_content,
    :min_length,
    :max_length,
    :overall_mean_quality_score,
    :overall_median_quality_score,
    :overall_n_content,
    :mean_sequence_length,
    :median_sequence_length
  ]
  readers.each_with_object({}) do |reader, report|
    report[reader] = send(reader)
  end
end

#parse_modules ⇒ Object



12
13
14
15
16
# File 'lib/bio/fastqc/parser.rb', line 12

# Splits the raw report text into modules, rows and fields.
#
# The text is cut at every ">>END_MODULE" line, each module is cut into
# lines, and each line is cut at tab characters, giving an Array of modules,
# each an Array of rows, each an Array of String fields.
#
# Reports with Windows line endings are normalised first; without that the
# ">>END_MODULE\n" delimiter never matches and every row keeps a trailing
# "\r" in its last field. Text without "\r\n" is processed untouched.
def parse_modules
  text = @data.include?("\r\n") ? @data.gsub("\r\n", "\n") : @data
  text.split(">>END_MODULE\n").map do |mod|
    mod.split("\n").map { |line| line.split("\t") }
  end
end

#per_base_n_content ⇒ Object



91
92
93
# File 'lib/bio/fastqc/parser.rb', line 91

# Rows of the "Per base N content" module with the module title row removed,
# as returned by get_module_matrix.
def per_base_n_content
  module_name = "Per base N content"
  get_module_matrix(module_name, 1)
end

#per_base_quality_column(mean_or_median) ⇒ Object



131
132
133
134
135
136
137
138
# File 'lib/bio/fastqc/parser.rb', line 131

# Maps a statistic name to a column index: :mean gives 1, :median gives 2.
# Any other argument yields nil.
def per_base_quality_column(mean_or_median)
  column_for = { :mean => 1, :median => 2 }
  column_for[mean_or_median]
end

#per_base_sequence_content ⇒ Object



83
84
85
# File 'lib/bio/fastqc/parser.rb', line 83

# Rows of the "Per base sequence content" module with the module title row
# removed, as returned by get_module_matrix.
def per_base_sequence_content
  module_name = "Per base sequence content"
  get_module_matrix(module_name, 1)
end

#per_base_sequence_quality ⇒ Object



71
72
73
# File 'lib/bio/fastqc/parser.rb', line 71

# Rows of the "Per base sequence quality" module with the module title row
# removed, as returned by get_module_matrix.
def per_base_sequence_quality
  module_name = "Per base sequence quality"
  get_module_matrix(module_name, 1)
end

#per_sequence_gc_content ⇒ Object



87
88
89
# File 'lib/bio/fastqc/parser.rb', line 87

# Rows of the "Per sequence GC content" module with the module title row
# removed, as returned by get_module_matrix.
def per_sequence_gc_content
  module_name = "Per sequence GC content"
  get_module_matrix(module_name, 1)
end

#per_sequence_quality_scores ⇒ Object



79
80
81
# File 'lib/bio/fastqc/parser.rb', line 79

# Rows of the "Per sequence quality scores" module with the module title row
# removed, as returned by get_module_matrix.
def per_sequence_quality_scores
  module_name = "Per sequence quality scores"
  get_module_matrix(module_name, 1)
end

#per_tile_sequence_quality ⇒ Object



75
76
77
# File 'lib/bio/fastqc/parser.rb', line 75

# Rows of the "Per tile sequence quality" module with the module title row
# removed, as returned by get_module_matrix.
def per_tile_sequence_quality
  module_name = "Per tile sequence quality"
  get_module_matrix(module_name, 1)
end

#percent_gc ⇒ Object

overall percentage of GC content



58
59
60
# File 'lib/bio/fastqc/parser.rb', line 58

# Overall percentage of GC content, as a Float.
# A missing "%GC" entry yields 0.0 because nil.to_f is 0.0.
def percent_gc
  raw_percentage = @basic_statistics["%GC"]
  raw_percentage.to_f
end

#sequence_duplication_levels ⇒ Object



103
104
105
# File 'lib/bio/fastqc/parser.rb', line 103

# Rows of the "Sequence Duplication Levels" module with its first two rows
# removed, as returned by get_module_matrix.
def sequence_duplication_levels
  module_name = "Sequence Duplication Levels"
  leading_rows = 2
  get_module_matrix(module_name, leading_rows)
end

#sequence_length ⇒ Object

store as string: can be range



54
55
56
# File 'lib/bio/fastqc/parser.rb', line 54

# Sequence length from the "Sequence length" entry of the Basic Statistics
# hash. Kept as a String because the value can be a range.
def sequence_length
  key = "Sequence length"
  @basic_statistics[key]
end

#sequence_length_distribution ⇒ Object



95
96
97
# File 'lib/bio/fastqc/parser.rb', line 95

# Rows of the "Sequence Length Distribution" module with the module title
# row removed, as returned by get_module_matrix.
def sequence_length_distribution
  module_name = "Sequence Length Distribution"
  get_module_matrix(module_name, 1)
end

#sequences_flagged_as_poor_quality ⇒ Object

number of sequence reads flagged as poor quality



46
47
48
# File 'lib/bio/fastqc/parser.rb', line 46

# Number of sequence reads flagged as poor quality, as an Integer.
# A missing entry yields 0 because nil.to_i is 0.
def sequences_flagged_as_poor_quality
  raw_count = @basic_statistics["Sequences flagged as poor quality"]
  raw_count.to_i
end

#summary ⇒ Object



198
199
200
# File 'lib/bio/fastqc/parser.rb', line 198

# Same result as parse, under a second name.
def summary
  report = parse
  report
end

#total_duplicate_percentage ⇒ Object



99
100
101
# File 'lib/bio/fastqc/parser.rb', line 99

# Second field of the first row after the title row of the
# "Sequence Duplication Levels" module, converted to a Float.
#
# get_module_matrix returns nil for a module that is not in the report, so
# that case (and a module with no row after its title) yields nil here
# instead of raising NoMethodError.
def total_duplicate_percentage
  rows = get_module_matrix("Sequence Duplication Levels", 1)
  first_row = rows && rows[0]
  return nil unless first_row
  first_row[1].to_f
end

#total_sequences ⇒ Object

total number of sequence reads



42
43
44
# File 'lib/bio/fastqc/parser.rb', line 42

# Total number of sequence reads, as an Integer.
# A missing "Total Sequences" entry yields 0 because nil.to_i is 0.
def total_sequences
  raw_count = @basic_statistics["Total Sequences"]
  raw_count.to_i
end