Class: Transrate::Contig

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Enumerable
Defined in:
lib/transrate/contig.rb,
ext/transrate/transrate.c

Overview

A contig in a transcriptome assembly.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(seq, name: nil) ⇒ Contig

Returns a new instance of Contig.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/transrate/contig.rb', line 22

# Wrap a sequence record as a contig and reset every metric to its
# starting value.
#
# The record is modified in place: null bytes are removed from its
# nucleotide string, a trailing semicolon is removed from its entry ID (when
# it has one) and its raw +data+ is cleared.
#
# @param seq [#seq, #data=] sequence record; presumably a parsed FASTA
#   entry -- TODO confirm the concrete type against callers
# @param name [String, nil] used as the contig name only when +seq+ does not
#   respond to +entry_id+
def initialize(seq, name: nil)
  # null bytes can end up in the nucleotide sequence; remove them
  seq.seq.gsub!("\0", "")
  identifiable = seq.respond_to?(:entry_id)
  # BLAST strips a trailing semicolon, so the stored ID is trimmed to match
  seq.entry_id.gsub!(/;$/, '') if identifiable

  @seq = seq
  @seq.data = nil # the raw fasta string does not need to be kept
  @name = identifiable ? seq.entry_id : name

  # reference-based metrics
  @hits = []
  @reference_coverage = 0
  @has_crb = false

  # read-based metrics
  @in_bridges = 0
  @p_good = 0
  @p_seq_true = 0
  # `length` is not defined in this view; presumably delegated to @seq
  @uncovered_bases = length
  @p_uncovered_bases = 1
  @p_not_segmented = 1
  @good = 0
  @coverage = 0

  @score = -1 # sentinel meaning "not calculated yet" (checked by #score)
  @classification = :unknown
end

Instance Attribute Details

#classificationObject

Returns the value of attribute classification.



17
18
19
# File 'lib/transrate/contig.rb', line 17

# Reader for @classification, which #initialize sets to :unknown and
# #classify overwrites with :good or :bad.
def classification
  @classification
end

#coverageObject

Returns the value of attribute coverage.



14
15
16
# File 'lib/transrate/contig.rb', line 14

# Reader for @coverage (set to 0 by #initialize; reported by #read_metrics).
def coverage
  @coverage
end

#eff_countObject

read-based metrics



13
14
15
# File 'lib/transrate/contig.rb', line 13

# Reader for @eff_count, one of the read-based metrics reported by
# #read_metrics. #initialize does not assign it, so it is nil until set.
def eff_count
  @eff_count
end

#eff_lengthObject

read-based metrics



13
14
15
# File 'lib/transrate/contig.rb', line 13

# Reader for @eff_length, one of the read-based metrics reported by
# #read_metrics. #initialize does not assign it, so it is nil until set.
def eff_length
  @eff_length
end

#goodObject

Returns the value of attribute good.



17
18
19
# File 'lib/transrate/contig.rb', line 17

# Reader for @good (set to 0 by #initialize).
def good
  @good
end

#has_crbObject

reference-based metrics



19
20
21
# File 'lib/transrate/contig.rb', line 19

# Reader for @has_crb, a reference-based flag that #initialize sets to false.
# #comparative_metrics reports real values only when it is truthy.
def has_crb
  @has_crb
end

#hitsObject

Returns the value of attribute hits.



20
21
22
# File 'lib/transrate/contig.rb', line 20

# Reader for @hits, which #initialize sets to an empty Array.
# #comparative_metrics calls +target+ on each element.
def hits
  @hits
end

#in_bridgesObject

Returns the value of attribute in_bridges.



16
17
18
# File 'lib/transrate/contig.rb', line 16

# Reader for @in_bridges (set to 0 by #initialize; reported by #read_metrics).
def in_bridges
  @in_bridges
end

#low_uniqueness_basesObject

Returns the value of attribute low_uniqueness_bases.



16
17
18
# File 'lib/transrate/contig.rb', line 16

# Reader for @low_uniqueness_bases. #initialize does not assign it, so it is
# nil until set.
def low_uniqueness_bases
  @low_uniqueness_bases
end

#nameObject

Returns the value of attribute name.



11
12
13
# File 'lib/transrate/contig.rb', line 11

# Reader for @name: the record's entry_id when the record passed to
# #initialize responds to it, otherwise the +name:+ keyword given there.
def name
  @name
end

#p_goodObject

Returns the value of attribute p_good.



17
18
19
# File 'lib/transrate/contig.rb', line 17

# Reader for @p_good (set to 0 by #initialize); one of the four factors
# multiplied together in #score.
def p_good
  @p_good
end

#p_not_segmentedObject

Returns the value of attribute p_not_segmented.



17
18
19
# File 'lib/transrate/contig.rb', line 17

# Reader for @p_not_segmented (set to 1 by #initialize); one of the four
# factors multiplied together in #score.
def p_not_segmented
  @p_not_segmented
end

#p_seq_trueObject

Returns the value of attribute p_seq_true.



15
16
17
# File 'lib/transrate/contig.rb', line 15

# Reader for @p_seq_true (set to 0 by #initialize); one of the four factors
# multiplied together in #score.
def p_seq_true
  @p_seq_true
end

#p_uncovered_basesObject

Returns the value of attribute p_uncovered_bases.



14
15
16
# File 'lib/transrate/contig.rb', line 14

# Reader for @p_uncovered_bases (set to 1 by #initialize).
# #p_bases_covered is computed as 1 minus this value.
def p_uncovered_bases
  @p_uncovered_bases
end

#reference_coverageObject

reference-based metrics



19
20
21
# File 'lib/transrate/contig.rb', line 19

# Reader for @reference_coverage, a reference-based metric that #initialize
# sets to 0; reported by #comparative_metrics.
def reference_coverage
  @reference_coverage
end

#seqObject

Returns the value of attribute seq.



11
12
13
# File 'lib/transrate/contig.rb', line 11

# Reader for @seq, the sequence record that was passed to #initialize.
def seq
  @seq
end

#tpmObject

read-based metrics



13
14
15
# File 'lib/transrate/contig.rb', line 13

# Reader for @tpm, one of the read-based metrics reported by #read_metrics.
# #initialize does not assign it, so it is nil until set.
def tpm
  @tpm
end

#uncovered_basesObject

Returns the value of attribute uncovered_bases.



14
15
16
# File 'lib/transrate/contig.rb', line 14

# Reader for @uncovered_bases, which #initialize sets to the contig length.
def uncovered_bases
  @uncovered_bases
end

Instance Method Details

#at_skewObject

AT skew



198
199
200
# File 'lib/transrate/contig.rb', line 198

# AT skew: (A - T) / (A + T), computed from the per-base counts.
#
# @return [Float] NaN when the contig contains neither A nor T
def at_skew
  adenines = bases_a
  thymines = bases_t
  (adenines - thymines).fdiv(adenines + thymines)
end

#base_compositionObject

Base composition of the contig

On the first call, when the instance variable @base_composition is nil, the C method is called to count the bases and dibases in the sequence; the counts are then read out of the C array and stored in a hash. On later calls the stored hash is returned without recounting.



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/transrate/contig.rb', line 98

# Base composition of the contig, memoized in @base_composition.
#
# The first call runs the C +composition+ method on the sequence string and
# then copies the per-base and per-dibase counts out of the C side (via
# +base_count+ / +dibase_count+, addressed by index) into two hashes. Later
# calls return the stored hash. @dibase_composition is filled as a side
# effect.
#
# @return [Hash{Symbol => Object}] counts keyed by :a, :c, :g, :t, :n
def base_composition
  return @base_composition if @base_composition

  composition(@seq.seq) # C method
  letters = %w[a c g t n]
  @base_composition = {}
  @dibase_composition = {}
  # index i matches the position of the base in `letters`
  letters.each_with_index do |letter, idx|
    @base_composition[letter.to_sym] = base_count(idx)
  end
  # dibase index runs first-letter-major: aa, ac, ag, at, an, ca, ...
  letters.product(letters).each_with_index do |(first, second), idx|
    @dibase_composition[:"#{first}#{second}"] = dibase_count(idx)
  end
  @base_composition
end

#base_countObject



17
# File 'ext/transrate/transrate.c', line 17

/* Forward declaration of the C function behind Contig#base_count.
 * The Ruby side calls it as base_count(i) with an index into the base
 * alphabet; the two VALUE parameters are presumably self and that index --
 * confirm against the definition in transrate.c. */
VALUE method_base_count(VALUE,VALUE);

#bases_aObject

Number of bases that are A



156
157
158
# File 'lib/transrate/contig.rb', line 156

# Number of bases that are A: the :a entry of the #base_composition hash.
def bases_a
  base_composition[:a]
end

#bases_cObject

Number of bases that are C



136
137
138
# File 'lib/transrate/contig.rb', line 136

# Number of bases that are C: the :c entry of the #base_composition hash.
def bases_c
  base_composition[:c]
end

#bases_gObject

Number of bases that are G



146
147
148
# File 'lib/transrate/contig.rb', line 146

# Number of bases that are G: the :g entry of the #base_composition hash.
def bases_g
  base_composition[:g]
end

#bases_gcObject

Number of bases that are G or C



184
185
186
# File 'lib/transrate/contig.rb', line 184

# Number of bases that are G or C (sum of #bases_g and #bases_c).
def bases_gc
  bases_g + bases_c
end

#bases_nObject



175
176
177
# File 'lib/transrate/contig.rb', line 175

# Number of bases that are N: the :n entry of the #base_composition hash.
def bases_n
  base_composition[:n]
end

#bases_tObject

Number of bases that are T



166
167
168
# File 'lib/transrate/contig.rb', line 166

# Number of bases that are T: the :t entry of the #base_composition hash.
def bases_t
  base_composition[:t]
end

#basic_metricsObject

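Get the basic metrics for this contig (length, GC proportion, GC and AT skew, CpG count and ratio, ORF length, and linguistic complexity for k = 6)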



52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/transrate/contig.rb', line 52

# Basic metrics for this contig, gathered into one hash.
#
# Each value comes from the method of the same name, except
# :linguistic_complexity_6, which is linguistic_complexity(6).
#
# @return [Hash{Symbol => Object}] keys, in order: :length, :prop_gc,
#   :gc_skew, :at_skew, :cpg_count, :cpg_ratio, :orf_length,
#   :linguistic_complexity_6
def basic_metrics
  # returned directly; the previous version assigned it to an unused local
  {
    length: length,
    prop_gc: prop_gc,
    gc_skew: gc_skew,
    at_skew: at_skew,
    cpg_count: cpg_count,
    cpg_ratio: cpg_ratio,
    orf_length: orf_length,
    linguistic_complexity_6: linguistic_complexity(6)
  }
end

#classify(cutoff) ⇒ Object

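Classify the contig into one of the following classes (note: the implementation shown below currently assigns only good or bad, comparing the score against the supplied cutoff rather than a fixed 0.5):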

  • good (score >= 0.5)

  • fragmented (in_bridges > 0) and no other problems

  • chimeric (p_not_segmented < 0.25) and no other problems

  • bad (score < 0.5 and not in any other category)



251
252
253
254
255
256
257
258
# File 'lib/transrate/contig.rb', line 251

# Classify the contig by comparing its score with +cutoff+ and store the
# result in @classification.
#
# Only two outcomes are produced here: :good when score >= cutoff,
# :bad otherwise.
#
# @param cutoff [Numeric] minimum score for a contig to count as good
# @return [Symbol] :good or :bad
def classify(cutoff)
  @classification = score >= cutoff ? :good : :bad
end

#comparative_metricsObject



80
81
82
83
84
85
86
87
88
89
90
# File 'lib/transrate/contig.rb', line 80

# Reference-based metrics for this contig.
#
# When @has_crb is truthy the real values are reported, with the +target+ of
# every hit joined by ";". Otherwise :has_crb is false and the other two
# entries are the placeholder string "NA".
#
# @return [Hash{Symbol => Object}] keys :has_crb, :reference_coverage, :hits
def comparative_metrics
  # returned directly; the previous version assigned it to an unused local
  unless @has_crb
    return { has_crb: false, reference_coverage: "NA", hits: "NA" }
  end

  {
    has_crb: has_crb,
    reference_coverage: reference_coverage,
    hits: hits.map(&:target).join(";")
  }
end

#compositionObject

contig



16
# File 'ext/transrate/transrate.c', line 16

/* Forward declaration of the C function behind Contig#composition.
 * The Ruby side calls it as composition(@seq.seq) before reading counts back
 * with base_count / dibase_count; the two VALUE parameters are presumably
 * self and the sequence string -- confirm against the definition. */
VALUE method_composition(VALUE, VALUE);

#cpg_countObject

CpG count



203
204
205
# File 'lib/transrate/contig.rb', line 203

# CpG count, taken from the dibase composition.
#
# NOTE(review): this adds the :gc count to the :cg count -- confirm that
# including GC dinucleotides is intended.
#
# @return [Integer] sum of the :cg and :gc dibase counts
def cpg_count
  pairs = dibase_composition
  pairs[:cg] + pairs[:gc]
end

#cpg_ratioObject

observed-to-expected CpG (C-phosphate-G) ratio



208
209
210
211
212
213
# File 'lib/transrate/contig.rb', line 208

# Observed-to-expected CpG (C-phosphate-G) ratio.
#
# Computed as (cg + gc) / (C * G) * (length - N), where cg and gc are dibase
# counts and C, G, N are base counts.
#
# @return [Float] NaN or Infinity when the contig has no C or no G
def cpg_ratio
  pairs = dibase_composition
  observed = pairs[:cg] + pairs[:gc]
  called_bases = length - bases_n # bases that are not N
  observed / (bases_c * bases_g).to_f * called_bases
end

#dibase_compositionObject

Dibase composition of the contig



127
128
129
130
131
132
133
# File 'lib/transrate/contig.rb', line 127

# Dibase composition of the contig.
#
# @dibase_composition is filled in as a side effect of #base_composition, so
# that method is run first whenever the hash is not there yet.
#
# @return [Hash{Symbol => Object}] counts keyed by two-letter symbols
def dibase_composition
  base_composition unless @dibase_composition
  @dibase_composition
end

#dibase_countObject



18
# File 'ext/transrate/transrate.c', line 18

/* Forward declaration of the C function behind Contig#dibase_count.
 * The Ruby side calls it as dibase_count(i) with an index into the list of
 * two-base combinations; the two VALUE parameters are presumably self and
 * that index -- confirm against the definition in transrate.c. */
VALUE method_dibase_count(VALUE,VALUE);

#each(&block) ⇒ Object



47
48
49
# File 'lib/transrate/contig.rb', line 47

# Iterate over the characters of the contig's sequence string. This is the
# +each+ that the Enumerable mixin builds on.
#
# @yieldparam base [String] one character of the sequence
# @return whatever String#each_char returns (an Enumerator when no block is
#   given)
def each(&block)
  @seq.seq.each_char(&block)
end

#gc_skewObject

GC skew



193
194
195
# File 'lib/transrate/contig.rb', line 193

# GC skew: (G - C) / (G + C), computed from the per-base counts.
#
# @return [Float] NaN when the contig contains neither G nor C
def gc_skew
  guanines = bases_g
  cytosines = bases_c
  (guanines - cytosines).fdiv(guanines + cytosines)
end

#kmer_countObject



19
# File 'ext/transrate/transrate.c', line 19

/* Forward declaration of the C function behind Contig#kmer_count.
 * The Ruby side calls it as kmer_count(k, @seq.seq) from
 * linguistic_complexity; the three VALUE parameters are presumably self, k
 * and the sequence string -- confirm against the definition. */
VALUE method_kmer_count(VALUE,VALUE,VALUE);

#linguistic_complexity(k) ⇒ Object



222
223
224
# File 'lib/transrate/contig.rb', line 222

# Linguistic complexity for k-mers of size +k+: the result of the C
# +kmer_count+ method divided by 4**k, the number of possible k-mers over a
# four-letter alphabet.
#
# @param k [Integer] k-mer size
# @return [Float]
def linguistic_complexity(k)
  observed = kmer_count(k, @seq.seq) # call to C
  possible = 4**k
  observed / possible.to_f
end

#longest_orfObject



20
# File 'ext/transrate/transrate.c', line 20

/* Forward declaration of the C function behind Contig#longest_orf.
 * The Ruby side calls it as longest_orf(@seq.seq) from orf_length; the two
 * VALUE parameters are presumably self and the sequence string -- confirm
 * against the definition in transrate.c. */
VALUE method_longest_orf(VALUE, VALUE);

#orf_lengthObject

Find the longest orf in the contig



216
217
218
219
220
# File 'lib/transrate/contig.rb', line 216

# Length of the longest ORF in the contig, as reported by the C
# +longest_orf+ method. The result is memoized in @orf_length.
#
# @return the value returned by +longest_orf+ for the sequence string
def orf_length
  @orf_length ||= longest_orf(@seq.seq) # call to C
end

#p_bases_coveredObject



226
227
228
# File 'lib/transrate/contig.rb', line 226

# Proportion of bases that are covered: the complement of
# #p_uncovered_bases. One of the four factors multiplied in #score.
def p_bases_covered
  1 - p_uncovered_bases
end

#prop_aObject

Proportion of bases that are A



161
162
163
# File 'lib/transrate/contig.rb', line 161

# Proportion of bases that are A.
#
# @return [Float] bases_a divided by length
def prop_a
  bases_a.fdiv(length)
end

#prop_cObject

Proportion of bases that are C



141
142
143
# File 'lib/transrate/contig.rb', line 141

# Proportion of bases that are C.
#
# @return [Float] bases_c divided by length
def prop_c
  bases_c.fdiv(length)
end

#prop_gObject

Proportion of bases that are G



151
152
153
# File 'lib/transrate/contig.rb', line 151

# Proportion of bases that are G.
#
# @return [Float] bases_g divided by length
def prop_g
  bases_g.fdiv(length)
end

#prop_gcObject



188
189
190
# File 'lib/transrate/contig.rb', line 188

# Proportion of bases that are G or C (sum of #prop_g and #prop_c).
def prop_gc
  prop_g + prop_c
end

#prop_nObject



179
180
181
# File 'lib/transrate/contig.rb', line 179

# Proportion of bases that are N.
#
# @return [Float] bases_n divided by length
def prop_n
  bases_n.fdiv(length)
end

#prop_tObject

Proportion of bases that are T



171
172
173
# File 'lib/transrate/contig.rb', line 171

# Proportion of bases that are T.
#
# @return [Float] bases_t divided by length
def prop_t
  bases_t.fdiv(length)
end

#read_metricsObject



65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/transrate/contig.rb', line 65

# Read-based metrics for this contig, gathered into one hash. Every value
# comes from the reader or method of the same name.
#
# @return [Hash{Symbol => Object}] keys, in order: :in_bridges, :p_good,
#   :p_bases_covered, :p_seq_true, :score, :p_not_segmented, :eff_length,
#   :eff_count, :tpm, :coverage
def read_metrics
  {
    in_bridges: in_bridges,
    p_good: p_good,
    p_bases_covered: p_bases_covered,
    p_seq_true: p_seq_true,
    score: score,
    p_not_segmented: p_not_segmented,
    eff_length: eff_length,
    eff_count: eff_count,
    tpm: tpm,
    coverage: coverage
  }
end

#scoreObject

Contig score (product of all score components)



236
237
238
239
240
241
242
243
244
# File 'lib/transrate/contig.rb', line 236

# Contig score: the product of the four score components, memoized in
# @score.
#
# @score == -1 is the "not calculated yet" sentinel; any other stored value
# is returned as is. Each component is floored at 0.01 before multiplying and
# the product is floored at 0.01 again.
#
# @return [Float] the stored or freshly computed score
def score
  return @score unless @score == -1

  components = [
    p_bases_covered, # proportion of bases covered
    p_not_segmented, # prob contig has 0 changepoints
    p_good,          # proportion of reads that mapped good
    p_seq_true       # scaled 1 - mean per-base edit distance
  ]
  product = components.map { |part| [part, 0.01].max.to_f }.reduce(:*)
  @score = [product, 0.01].max
end

#to_fastaObject



260
261
262
# File 'lib/transrate/contig.rb', line 260

# FASTA representation of the contig: calls +to_fasta+ on the sequence
# string, passing @name. The formatting itself is done by that method on the
# sequence object, which is not defined in this file.
def to_fasta
  @seq.seq.to_fasta(@name)
end