7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
# File 'lib/macroape/cli/eval_similarity.rb', line 7
def self.main(argv)
doc = " Command-line format:\n \#{run_tool_cmd} <1st matrix pat-file> <2nd matrix pat-file> [options]\n\n Options:\n [-p <P-value>]\n [-d <discretization level>]\n [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.\n [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value\n [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25\n [--first-threshold <threshold for the first matrix>]\n [--second-threshold <threshold for the second matrix>]\n\n Examples:\n \#{run_tool_cmd} motifs/KLF4_f2.pat motifs/SP1_f1.pat -p 0.0005 -d 100 -b 0.3,0.2,0.2,0.3\n EOS\n\n if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}\n $stderr.puts doc\n exit\n end\n\n pvalue = 0.0005\n discretization = 10.0\n\n first_background = Bioinform::Background::Wordwise\n second_background = Bioinform::Background::Wordwise\n\n max_hash_size = 10000000\n max_pair_hash_size = 10000\n pvalue_boundary = :upper\n\n data_model = argv.delete('--pcm') ? :pcm : :pwm\n first_file = argv.shift\n second_file = argv.shift\n raise 'You should specify two input files' unless first_file and second_file\n\n until argv.empty?\n case argv.shift\n when '-p'\n pvalue = argv.shift.to_f\n when '-d'\n discretization = argv.shift.to_f\n when '--max-hash-size'\n max_hash_size = argv.shift.to_i\n when '--max-2d-hash-size'\n max_pair_hash_size = argv.shift.to_i\n when '-b'\n second_background = first_background = Bioinform::Background.from_string(argv.shift)\n when '-b1'\n first_background = Bioinform::Background.from_string(argv.shift)\n when '-b2'\n second_background = Bioinform::Background.from_string(argv.shift)\n when '--boundary'\n pvalue_boundary = argv.shift.to_sym\n raise 'boundary should be either lower or upper' unless pvalue_boundary == :lower || pvalue_boundary == :upper\n when '--first-threshold'\n predefined_threshold_first = argv.shift.to_f\n when '--second-threshold'\n predefined_threshold_second = argv.shift.to_f\n end\n end\n raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless first_background.symmetric?\n raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless second_background.symmetric?\n\n raise \"Error! File \#{first_file} don't exist\" unless File.exist?(first_file)\n input_first = File.read(first_file)\n input_first = Bioinform::MatrixParser.new.parse!(input_first)\n\n raise \"Error! File \#{second_file} don't exist\" unless File.exist?(second_file)\n input_second = File.read(second_file)\n input_second = Bioinform::MatrixParser.new.parse!(input_second)\n\n case data_model\n when :pcm\n pcm_first = Bioinform::MotifModel::PCM.new(input_first[:matrix]).named(input_first[:name])\n pwm_first = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: first_background).convert(pcm_first)\n pcm_second = Bioinform::MotifModel::PCM.new(input_second[:matrix]).named(input_second[:name])\n pwm_second = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: second_background).convert(pcm_second)\n when :pwm\n pwm_first = Bioinform::MotifModel::PWM.new(input_first[:matrix]).named(input_first[:name])\n pwm_second = Bioinform::MotifModel::PWM.new(input_second[:matrix]).named(input_second[:name])\n end\n\n pwm_first = pwm_first.discreted(discretization)\n pwm_second = pwm_second.discreted(discretization)\n\n counting_first = PWMCounting.new(pwm_first, background: first_background, max_hash_size: max_hash_size)\n counting_second = PWMCounting.new(pwm_second, background: second_background, max_hash_size: max_hash_size)\n\n cmp = Macroape::PWMCompare.new(counting_first, counting_second).tap{|x| x.max_pair_hash_size = max_pair_hash_size }\n\n if predefined_threshold_first\n threshold_first = predefined_threshold_first * discretization\n else\n if pvalue_boundary == :lower\n threshold_first = counting_first.threshold(pvalue)\n else\n threshold_first = counting_first.weak_threshold(pvalue)\n end\n end\n\n if predefined_threshold_second\n threshold_second = predefined_threshold_second * discretization\n else\n if pvalue_boundary == :lower\n threshold_second = counting_second.threshold(pvalue)\n else\n threshold_second = counting_second.weak_threshold(pvalue)\n end\n end\n\n info = cmp.jaccard(threshold_first, threshold_second)\n info.merge!(predefined_threshold_first: predefined_threshold_first,\n predefined_threshold_second: predefined_threshold_second,\n threshold_first: threshold_first.to_f / discretization,\n threshold_second: threshold_second.to_f / discretization,\n discretization: discretization,\n first_background: first_background,\n second_background: second_background,\n requested_pvalue: pvalue,\n pvalue_boundary: pvalue_boundary)\n puts Helper.similarity_info_string(info)\n\nrescue => err\n $stderr.puts \"\\n\#{err}\\n\#{err.backtrace.first(5).join(\"\\n\")}\\n\\nUse --help option for help\\n\\n\#{doc}\"\nend\n".strip_doc
|