7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
|
# File 'lib/macroape/cli/scan_collection.rb', line 7
def self.main(argv)
doc = <<-EOS.strip_doc
Command-line format:
#{run_tool_cmd} <pat-file> <collection> [options]
Options:
[-p <P-value>]
[-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results
[--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set
[--silent] - hide current progress information during scan (printed to stderr by default)
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
Output format:
<name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]
Attention! Name can contain whitespace characters.
Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.
Example:
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml
#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03
EOS
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
$stderr.puts doc
exit
end
data_model = argv.delete('--pcm') ? :pcm : :pwm
filename = argv.shift
collection_file = argv.shift
raise 'No input. You should specify input file with matrix' unless filename
raise 'No input. You should specify input file with collection' unless collection_file
raise "Collection file #{collection_file} doesn't exist" unless File.exist?(collection_file)
pvalue = 0.0005
cutoff = 0.05 collection = YAML.load_file(collection_file)
collection_background = collection.background query_background = collection_background
rough_discretization = collection.rough_discretization
precise_discretization = collection.precise_discretization
max_hash_size = 10000000
max_pair_hash_size = 10000
pvalue_boundary = :upper
silent = false
precision_mode = :rough
until argv.empty?
case argv.shift
when '-b'
query_background = Bioinform::Background.from_string(argv.shift)
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background.symmetric?
when '-p'
pvalue = argv.shift.to_f
when '--max-hash-size'
max_hash_size = argv.shift.to_i
when '--max-2d-hash-size'
max_pair_hash_size = argv.shift.to_i
when '-c'
cutoff = argv.shift.to_f
when '--all'
cutoff = 0.0
when '--silent'
silent = true
when '--boundary'
pvalue_boundary = argv.shift.to_sym
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
when '--precise'
precision_mode = :precise
begin
Float(argv.first)
minimal_similarity = argv.shift.to_f
rescue
minimal_similarity = 0.05
end
end
end
raise "Thresholds for pvalue #{pvalue} aren't presented in collection (#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue" unless collection.pvalues.include? pvalue
raise "Error! File #{filename} doesn't exist" unless File.exist?(filename)
query_input = File.read(filename)
query_input = Bioinform::MatrixParser.new.parse!(query_input)
case data_model
when :pcm
query_pcm = Bioinform::MotifModel::PCM.new(query_input[:matrix]).named(query_input[:name])
query_pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: query_background).convert(query_pcm)
when :pwm
query_pwm = Bioinform::MotifModel::PWM.new(query_input[:matrix]).named(query_input[:name])
end
query_pwm_rough = query_pwm.discreted(rough_discretization)
query_pwm_rough_counting = PWMCounting.new(query_pwm_rough, background: query_background, max_hash_size: max_hash_size)
query_pwm_precise = query_pwm.discreted(precise_discretization)
query_pwm_precise_counting = PWMCounting.new(query_pwm_precise, background: query_background, max_hash_size: max_hash_size)
if pvalue_boundary == :lower
query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.threshold_and_real_pvalue(pvalue)
query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.threshold_and_real_pvalue(pvalue)
else
query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.weak_threshold_and_real_pvalue(pvalue)
query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.weak_threshold_and_real_pvalue(pvalue)
end
if query_precise_real_pvalue == 0
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the precise discretization level of #{precise_discretization}. It's impossible to scan collection for this motif"
return
end
if query_rough_real_pvalue == 0
query_pwm_rough_counting, query_threshold_rough = query_pwm_precise_counting, query_threshold_precise
$stderr.puts "Query motif #{query_pwm.name} gives 0 recognized words for a given P-value of #{pvalue} with the rough discretization level of #{rough_discretization}. Forcing precise discretization level of #{precise_discretization}"
end
similarities = {}
precision_file_mode = {}
collection.motifs.each_with_index do |motif_info, index|
motif = motif_info.model
$stderr.puts "Testing motif #{motif.name} (#{index+1} of #{collection.size}, #{index*100/collection.size}% complete)" unless silent
if motif_info.rough[pvalue]
collection_pwm_rough = motif.discreted(rough_discretization)
collection_pwm_rough_counting = Macroape::PWMCounting.new(collection_pwm_rough, background: collection_background, max_hash_size: max_hash_size)
collection_threshold_rough = motif_info.rough[pvalue] * rough_discretization
info = Macroape::PWMCompare.new(query_pwm_rough_counting, collection_pwm_rough_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_rough, collection_threshold_rough)
info[:precision_mode] = :rough
end
if !motif_info.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)
collection_pwm_precise = motif.discreted(precise_discretization)
collection_pwm_precise_counting = Macroape::PWMCounting.new(collection_pwm_precise, background: collection_background, max_hash_size: max_hash_size)
collection_threshold_precise = motif_info.precise[pvalue] * precise_discretization
info = Macroape::PWMCompare.new(query_pwm_precise_counting, collection_pwm_precise_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_precise, collection_threshold_precise)
info[:precision_mode] = :precise
end
info[:name] = motif.name
similarities[motif.name] = info
end
$stderr.puts "100% complete" unless silent
similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}
puts Helper.scan_collection_infos_string( similarities_to_output,
{cutoff: cutoff,
precision_mode: precision_mode,
rough_discretization: rough_discretization,
precise_discretization: precise_discretization,
minimal_similarity: minimal_similarity,
pvalue: pvalue,
pvalue_boundary: pvalue_boundary,
collection_background: collection_background,
query_background: query_background} )
rescue => err
$stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
end
|