7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
# File 'lib/macroape/cli/eval_similarity.rb', line 7
def self.main(argv)
doc = <<-EOS.strip_doc
Command-line format:
#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]
Options:
[-p <P-value>]
[-d <discretization level>]
[--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.
[--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value
[-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25
[--first-threshold <threshold for the first matrix>]
[--second-threshold <threshold for the second matrix>]
Examples:
#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3
EOS
if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}
$stderr.puts doc
exit
end
pvalue = 0.0005
discretization = 10.0
first_background = Bioinform::Background::Wordwise
second_background = Bioinform::Background::Wordwise
max_hash_size = 10000000
max_pair_hash_size = 10000
pvalue_boundary = :upper
data_model = argv.delete('--pcm') ? :pcm : :pwm
first_file = argv.shift
second_file = argv.shift
raise 'You should specify two input files' unless first_file and second_file
until argv.empty?
case argv.shift
when '-p'
pvalue = argv.shift.to_f
when '-d'
discretization = argv.shift.to_f
when '--max-hash-size'
max_hash_size = argv.shift.to_i
when '--max-2d-hash-size'
max_pair_hash_size = argv.shift.to_i
when '-b'
second_background = first_background = Bioinform::Background.from_string(argv.shift)
when '-b1'
first_background = Bioinform::Background.from_string(argv.shift)
when '-b2'
second_background = Bioinform::Background.from_string(argv.shift)
when '--boundary'
pvalue_boundary = argv.shift.to_sym
raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper
when '--first-threshold'
predefined_threshold_first = argv.shift.to_f
when '--second-threshold'
predefined_threshold_second = argv.shift.to_f
end
end
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background.symmetric?
raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background.symmetric?
raise "Error! File #{first_file} don't exist" unless File.exist?(first_file)
input_first = File.read(first_file)
input_first = Bioinform::MatrixParser.new.parse!(input_first)
raise "Error! File #{second_file} don't exist" unless File.exist?(second_file)
input_second = File.read(second_file)
input_second = Bioinform::MatrixParser.new.parse!(input_second)
case data_model
when :pcm
pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])
pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: first_background).convert(pcm_first)
pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])
pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: second_background).convert(pcm_second)
when :pwm
pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])
pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])
end
pwm_first = pwm_first.discreted(discretization)
pwm_second = pwm_second.discreted(discretization)
counting_first = PWMCounting.new(pwm_first, background: first_background, max_hash_size: max_hash_size)
counting_second = PWMCounting.new(pwm_second, background: second_background, max_hash_size: max_hash_size)
cmp = Macroape::PWMCompare.new(counting_first, counting_second).tap{|x| x.max_pair_hash_size = max_pair_hash_size }
if predefined_threshold_first
threshold_first = predefined_threshold_first * discretization
else
if pvalue_boundary == :lower
threshold_first = counting_first.threshold(pvalue)
else
threshold_first = counting_first.weak_threshold(pvalue)
end
end
if predefined_threshold_second
threshold_second = predefined_threshold_second * discretization
else
if pvalue_boundary == :lower
threshold_second = counting_second.threshold(pvalue)
else
threshold_second = counting_second.weak_threshold(pvalue)
end
end
info = cmp.jaccard(threshold_first, threshold_second)
info.merge!(predefined_threshold_first: predefined_threshold_first,
predefined_threshold_second: predefined_threshold_second,
threshold_first: threshold_first.to_f / discretization,
threshold_second: threshold_second.to_f / discretization,
discretization: discretization,
first_background: first_background,
second_background: second_background,
requested_pvalue: pvalue,
pvalue_boundary: pvalue_boundary)
puts Helper.similarity_info_string(info)
rescue => err
$stderr.puts "\n#{err}\n#{err.backtrace.first(5).join("\n")}\n\nUse --help option for help\n\n#{doc}"
end
|