Module: Macroape::CLI::ScanCollection

Defined in:
lib/macroape/cli/scan_collection.rb

Class Method Summary collapse

Class Method Details

.main(argv) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/macroape/cli/scan_collection.rb', line 7

def self.main(argv)
  doc = "    Command-line format:\n    \#{run_tool_cmd} <pat-file> <collection> [options]\n\n    Options:\n      [-p <P-value>]\n      [-c <similarity cutoff>] minimal similarity to be included in output, '-c 0.05' by default, [--all] to print all results\n      [--precise [<level>]] minimal similarity to check on the second pass in precise mode, off by default, '--precise 0.01' if level is not set\n      [--silent] - hide current progress information during scan (printed to stderr by default)\n      [--pcm] - treat the input file as Position Count Matrix. PCM-to-PWM transformation to be done internally.\n      [--boundary lower|upper] Upper boundary (default) means that the obtained P-value is greater than or equal to the requested P-value\n      [-b <background probabilities] ACGT - 4 numbers, comma-delimited(spaces not allowed), sum should be equal to 1, like 0.25,0.24,0.26,0.25\n\n    Output format:\n     <name> <jaccard index> <shift> <overlap> <orientation> ['*' in case that result was calculated on the second pass (in precise mode), '.' otherwise]\n        Attention! Name can contain whitespace characters.\n        Attention! The shift and orientation are reported for the collection matrix relative to the query matrix.\n\n    Example:\n      \#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml\n      \#{run_tool_cmd} motifs/KLF4_f2.pat hocomoco_ad_uniform.yaml -p 0.0005 --precise 0.03\n  EOS\n\n  if argv.empty? || ['-h', '--h', '-help', '--help'].any?{|help_option| argv.include?(help_option)}\n    $stderr.puts doc\n    exit\n  end\n\n  data_model = argv.delete('--pcm') ? :pcm : :pwm\n  filename = argv.shift\n  collection_file = argv.shift\n  raise 'No input. You should specify input file with matrix' unless filename\n  raise 'No input. You should specify input file with collection' unless collection_file\n  raise \"Collection file \#{collection_file} doesn't exist\" unless File.exist?(collection_file)\n\n  pvalue = 0.0005\n  cutoff = 0.05 # minimal similarity to output\n  collection = YAML.load_file(collection_file)\n  collection_background = collection.background #(collection.background == [1,1,1,1]) ? Bioinform::Background::Wordwise : Bioinform::Frequencies.new(collection.background)\n  query_background = collection_background\n\n  rough_discretization = collection.rough_discretization\n  precise_discretization = collection.precise_discretization\n  max_hash_size = 10000000\n  max_pair_hash_size = 10000\n  pvalue_boundary = :upper\n\n  silent = false\n  precision_mode = :rough\n  until argv.empty?\n    case argv.shift\n      when '-b'\n        query_background = Bioinform::Background.from_string(argv.shift)\n        raise 'background should be symmetric: p(A)=p(T) and p(G) = p(C)' unless query_background.symmetric?\n      when '-p'\n        pvalue = argv.shift.to_f\n      when '--max-hash-size'\n        max_hash_size = argv.shift.to_i\n      when '--max-2d-hash-size'\n        max_pair_hash_size = argv.shift.to_i\n      when '-c'\n        cutoff = argv.shift.to_f\n      when '--all'\n        cutoff = 0.0\n      when '--silent'\n        silent = true\n      when '--boundary'\n        pvalue_boundary = argv.shift.to_sym\n        raise 'boundary should be either lower or upper'  unless  pvalue_boundary == :lower || pvalue_boundary == :upper\n      when '--precise'\n        precision_mode = :precise\n        begin\n          Float(argv.first)\n          minimal_similarity = argv.shift.to_f\n        rescue\n          minimal_similarity = 0.05\n        end\n    end\n  end\n\n  raise \"Thresholds for pvalue \#{pvalue} aren't presented in collection (\#{collection.pvalues.join(', ')}). Use one of listed pvalues or recalculate the collection with needed pvalue\" unless collection.pvalues.include? pvalue\n\n  raise \"Error! File \#{filename} doesn't exist\"  unless File.exist?(filename)\n  query_input = File.read(filename)\n\n  query_input = Bioinform::MatrixParser.new.parse!(query_input)\n  case data_model\n  when :pcm\n    query_pcm = Bioinform::MotifModel::PCM.new(query_input[:matrix]).named(query_input[:name])\n    query_pwm = Bioinform::ConversionAlgorithms::PCM2PWMConverter.new(pseudocount: :log, background: query_background).convert(query_pcm)\n  when :pwm\n    query_pwm = Bioinform::MotifModel::PWM.new(query_input[:matrix]).named(query_input[:name])\n  end\n\n  query_pwm_rough = query_pwm.discreted(rough_discretization)\n  query_pwm_rough_counting = PWMCounting.new(query_pwm_rough, background: query_background, max_hash_size: max_hash_size)\n  query_pwm_precise = query_pwm.discreted(precise_discretization)\n  query_pwm_precise_counting = PWMCounting.new(query_pwm_precise, background: query_background, max_hash_size: max_hash_size)\n\n  if pvalue_boundary == :lower\n    query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.threshold_and_real_pvalue(pvalue)\n    query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.threshold_and_real_pvalue(pvalue)\n  else\n    query_threshold_rough, query_rough_real_pvalue = query_pwm_rough_counting.weak_threshold_and_real_pvalue(pvalue)\n    query_threshold_precise, query_precise_real_pvalue = query_pwm_precise_counting.weak_threshold_and_real_pvalue(pvalue)\n  end\n\n  if query_precise_real_pvalue == 0\n    $stderr.puts \"Query motif \#{query_pwm.name} gives 0 recognized words for a given P-value of \#{pvalue} with the precise discretization level of \#{precise_discretization}. It's impossible to scan collection for this motif\"\n    return\n  end\n\n  if query_rough_real_pvalue == 0\n    query_pwm_rough_counting, query_threshold_rough = query_pwm_precise_counting, query_threshold_precise\n    $stderr.puts \"Query motif \#{query_pwm.name} gives 0 recognized words for a given P-value of \#{pvalue} with the rough discretization level of \#{rough_discretization}. Forcing precise discretization level of \#{precise_discretization}\"\n  end\n\n  similarities = {}\n  precision_file_mode = {}\n\n  collection.motifs.each_with_index do |motif_info, index|\n    motif = motif_info.model\n    $stderr.puts \"Testing motif \#{motif.name} (\#{index+1} of \#{collection.size}, \#{index*100/collection.size}% complete)\"  unless silent\n\n    if motif_info.rough[pvalue]\n      collection_pwm_rough = motif.discreted(rough_discretization)\n      collection_pwm_rough_counting = Macroape::PWMCounting.new(collection_pwm_rough, background: collection_background, max_hash_size: max_hash_size)\n\n      collection_threshold_rough = motif_info.rough[pvalue] * rough_discretization\n      info = Macroape::PWMCompare.new(query_pwm_rough_counting, collection_pwm_rough_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_rough, collection_threshold_rough)\n      info[:precision_mode] = :rough\n    end\n    if !motif_info.rough[pvalue] || (precision_mode == :precise) && (info[:similarity] >= minimal_similarity)\n      collection_pwm_precise = motif.discreted(precise_discretization)\n      collection_pwm_precise_counting = Macroape::PWMCounting.new(collection_pwm_precise, background: collection_background, max_hash_size: max_hash_size)\n\n      collection_threshold_precise = motif_info.precise[pvalue] * precise_discretization\n      info = Macroape::PWMCompare.new(query_pwm_precise_counting, collection_pwm_precise_counting).tap{|x| x.max_pair_hash_size = max_pair_hash_size }.jaccard(query_threshold_precise, collection_threshold_precise)\n      info[:precision_mode] = :precise\n    end\n    info[:name] = motif.name\n    similarities[motif.name] = info\n  end\n\n  $stderr.puts \"100% complete\"  unless silent\n\n  similarities_to_output = similarities.sort_by{|name, info| info[:similarity] }.reverse.select{|name,info| info[:similarity] >= cutoff }.map{|name,info|info}\n  puts Helper.scan_collection_infos_string( similarities_to_output,\n                                            {cutoff: cutoff,\n                                            precision_mode: precision_mode,\n                                            rough_discretization: rough_discretization,\n                                            precise_discretization: precise_discretization,\n                                            minimal_similarity: minimal_similarity,\n                                            pvalue: pvalue,\n                                            pvalue_boundary: pvalue_boundary,\n                                            collection_background: collection_background,\n                                            query_background: query_background} )\nrescue => err\n  $stderr.puts \"\\n\#{err}\\n\#{err.backtrace.first(5).join(\"\\n\")}\\n\\nUse --help option for help\\n\\n\#{doc}\"\nend\n".strip_doc