Module: Wp2txt
- Included in:
- Article, Runner
- Defined in:
- lib/wp2txt/progressbar.rb,
lib/wp2txt.rb,
lib/wp2txt/utils.rb,
lib/wp2txt/mw_api.rb,
lib/wp2txt/article.rb,
lib/wp2txt/version.rb
Overview
Ruby/ProgressBar - a text progress bar library
Copyright © 2001-2005 Satoru Takabayashi <[email protected]>
All rights reserved.
This is free software with ABSOLUTELY NO WARRANTY.
You can redistribute it and/or modify it under the terms of Ruby’s license.
Defined Under Namespace
Classes: Article, CmdProgbar, NewProgressBar, ProgressBar, ReversedProgressBar, Runner
Constant Summary
collapse
- VERSION =
"0.7.7"
Instance Method Summary
collapse
-
#batch_file_mod(dir_path, &block) ⇒ Object
modify files under a directory (recursive).
-
#chrref_to_utf!(num_str) ⇒ Object
-
#collect_files(str, regex = nil) ⇒ Object
collect filenames recursively.
-
#convert_characters!(text, has_retried = false) ⇒ Object
-
#correct_inline_template!(str) ⇒ Object
-
#correct_separator(input) ⇒ Object
take care of difference of separators among environments.
-
#decimal_format(i) ⇒ Object
-
#escape_nowiki!(str) ⇒ Object
methods used from format_wiki ####################.
-
#expand_template(uri, template, page) ⇒ Object
-
#file_mod(file_path, backup = false, &block) ⇒ Object
modify a file using block/yield mechanism.
-
#format_article!(text) ⇒ Object
-
#format_ref!(page) ⇒ Object
-
#format_wiki!(text, has_retried = false) ⇒ Object
-
#make_reference!(str) ⇒ Object
-
#mndash!(str) ⇒ Object
-
#parse_wikitext(uri, wikitext, page) ⇒ Object
-
#post_request(uri_string, data = {}) ⇒ Object
-
#process_external_links!(str) ⇒ Object
-
#process_interwiki_links!(str) ⇒ Object
-
#process_nested_structure(scanner, left, right, recur_count, &block) ⇒ Object
parser for nested structure ####################.
-
#remove_directive!(str) ⇒ Object
-
#remove_emphasis!(str) ⇒ Object
-
#remove_hr!(page) ⇒ Object
-
#remove_inbetween!(str, tagset = ['<', '>']) ⇒ Object
-
#remove_table!(str) ⇒ Object
-
#remove_tag!(str) ⇒ Object
-
#remove_templates!(str) ⇒ Object
methods used from format_article ####################.
-
#rename(files) ⇒ Object
-
#sec_to_str(int) ⇒ Object
convert int of seconds to string in the format 00:00:00.
-
#special_chr!(str) ⇒ Object
-
#unescape_nowiki!(str) ⇒ Object
Instance Method Details
#batch_file_mod(dir_path, &block) ⇒ Object
modify files under a directory (recursive)
370
371
372
373
374
375
376
377
378
|
# File 'lib/wp2txt/utils.rb', line 370
# Yields every regular file found at or beneath +dir_path+ to the block.
#
# When +dir_path+ is a directory, each entry returned by collect_files is
# yielded if FileTest.file? accepts it. Otherwise +dir_path+ itself is
# yielded, provided it is a regular file; anything else yields nothing.
def batch_file_mod(dir_path, &block)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each { |entry| yield entry if FileTest.file?(entry) }
end
|
#chrref_to_utf!(num_str) ⇒ Object
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
|
# File 'lib/wp2txt/utils.rb', line 276
# Replaces, in place, every match of $chrref_to_utf_regex in +num_str+ with
# the character it denotes.
#
# For each match, capture 1 equal to 'x' marks the digits in capture 2 as
# hexadecimal; otherwise they are read as decimal.
#
# Returns true on success, or nil if any StandardError is raised while
# converting (for example a number that is not a valid UTF-8 code point).
def chrref_to_utf!(num_str)
  num_str.gsub!($chrref_to_utf_regex) do
    marker = Regexp.last_match(1)
    digits = Regexp.last_match(2)
    code_point = marker == 'x' ? digits.to_i(16) : digits.to_i
    # Integer#chr with an explicit encoding handles the whole Unicode range,
    # including code points above U+FFFF that do not fit one 16-bit unit.
    code_point.chr(Encoding::UTF_8)
  end
  true
rescue StandardError
  nil
end
|
#collect_files(str, regex = nil) ⇒ Object
collect filenames recursively
344
345
346
347
348
349
350
351
|
# File 'lib/wp2txt/utils.rb', line 344
# Walks +str+ recursively with Find.find and returns the sorted list of
# every visited path that matches +regex+. With no +regex+ given, the empty
# pattern is used, which matches every path.
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern =~ path }.sort
end
|
#convert_characters!(text, has_retried = false) ⇒ Object
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
# File 'lib/wp2txt/utils.rb', line 90
# Runs chrref_to_utf! and then special_chr! on +text+.
#
# If that raises and +has_retried+ is false, +text+ is re-encoded to UTF-16
# and back to UTF-8 and the conversion is attempted once more. If the retry
# also raises, a message is printed, +text+ is written to "error_log.txt"
# and the process exits.
def convert_characters!(text, has_retried = false)
  text << ""
  chrref_to_utf!(text)
  special_chr!(text)
rescue
  unless has_retried
    text.encode!("UTF-16")
    text.encode!("UTF-8")
    return convert_characters!(text, true)
  end

  puts "invalid byte sequence detected"
  puts "******************************"
  File.open("error_log.txt", "w") { |f| f.write text }
  exit
end
|
#correct_inline_template!(str) ⇒ Object
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
|
# File 'lib/wp2txt/utils.rb', line 316
# Rewrites, in place, every match of $remove_inline_regex in +str+ based on
# the text in capture 1:
#
# * if that text matches $onset_bar_regex it is kept as is;
# * otherwise it is split on "|"; when the first field matches
#   $type_code_regex the last field is kept;
# * otherwise, with $leave_template set, the fields (each chomped) are
#   re-joined with "|" inside single braces; without it the match is dropped.
def correct_inline_template!(str)
  str.gsub!($remove_inline_regex) do
    key = Regexp.last_match(1)
    next key if $onset_bar_regex =~ key

    fields = key.split("|")
    if $type_code_regex === fields.first
      fields[-1]
    elsif $leave_template
      "{" + fields.map { |field| field.chomp }.join("|") + "}"
    else
      ""
    end
  end
end
|
#correct_separator(input) ⇒ Object
take care of difference of separators among environments
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
|
# File 'lib/wp2txt/utils.rb', line 381
# Normalises path separators in +input+.
#
# A String has "/" turned into "\" when RUBY_PLATFORM contains "win32", and
# "\" turned into "/" otherwise. An Array is converted element by element
# (recursively). Any other kind of input yields nil.
def correct_separator(input)
  case input
  when String
    if RUBY_PLATFORM.index("win32")
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
|
429
430
431
432
|
# File 'lib/wp2txt/utils.rb', line 429
# Formats +i+ (anything whose to_s is a number) with "," between groups of
# three integer digits, e.g. 1234567 => "1,234,567".
#
# A leading minus sign and anything after the integer digits (such as a
# fractional part) are copied through untouched, so only the integer digits
# are grouped.
def decimal_format(i)
  sign, digits, rest = i.to_s.match(/\A(-?)(\d*)(.*)\z/m).captures
  # Group from the right: reverse, cut into runs of up to three, restore.
  grouped = digits.reverse.scan(/\d{1,3}/).join(",").reverse
  "#{sign}#{grouped}#{rest}"
end
|
#escape_nowiki!(str) ⇒ Object
methods used from format_wiki ####################
184
185
186
187
188
189
190
191
192
193
194
195
196
|
# File 'lib/wp2txt/utils.rb', line 184
# Replaces, in place, every match of $escape_nowiki_regex in +str+ with a
# "<nowiki-ID>" placeholder and remembers the text of capture 1 in @nowikis
# under that ID (the captured string's object_id).
#
# @nowikis is emptied (or created) at the start of every call.
def escape_nowiki!(str)
  (@nowikis ||= {}).clear
  str.gsub!($escape_nowiki_regex) do
    captured = Regexp.last_match(1)
    token = captured.object_id
    @nowikis[token] = captured
    "<nowiki-#{token}>"
  end
end
|
#expand_template(uri, template, page) ⇒ Object
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
# File 'lib/wp2txt/mw_api.rb', line 22
# Sends +template+ to the MediaWiki API at +uri+ with action
# "expandtemplates" and returns the text found under
# ["expandtemplates"]["*"] of the JSON reply after post-processing.
#
# uri      - API endpoint, handed to post_request unchanged.
# template - wikitext sent as the "text" parameter.
# page     - page title sent as the "title" parameter.
#
# If reading or post-processing the reply raises a StandardError, "ERROR!"
# and the exception are printed and the process exits.
def expand_template(uri, template, page)
  # post_request concatenates "key=value" pairs without escaping them, so the
  # values are form-encoded here. URI.escape (removed in Ruby 3.0) is avoided;
  # it also left "&", "=" and "+" unescaped, which corrupts a form body.
  data = {
    "action" => "expandtemplates",
    "format" => "json",
    "text"   => URI.encode_www_form_component(template),
    "title"  => URI.encode_www_form_component(page)
  }
  hash = JSON.parse(post_request(uri, data))
  begin
    result = hash["expandtemplates"]["*"]
    # NOTE(review): only special_chr! / chrref_to_utf! (with a bang) are
    # visible alongside this method - confirm the non-bang helpers exist.
    result = special_chr(result)
    # NOTE(review): as written, both gsub calls replace a string with itself;
    # verify against the original source what the replacements should be.
    return chrref_to_utf(result).gsub("{{", "{{").gsub("}}", "}}")
  rescue => e
    puts "ERROR!"
    p e
    exit
  end
end
|
#file_mod(file_path, backup = false, &block) ⇒ Object
modify a file using block/yield mechanism
354
355
356
357
358
359
360
361
362
363
364
365
366
367
|
# File 'lib/wp2txt/utils.rb', line 354
# Rewrites the file at +file_path+ with whatever the block returns.
#
# The block receives the file's current content; a nil result means
# "keep the content unchanged". The new content is first written to a
# temporary file next to the target, the original is renamed to
# "<file_path>.bak", and the temporary file then takes its place.
#
# file_path - path of the file to modify.
# backup    - when true, the ".bak" copy of the original is kept.
def file_mod(file_path, backup = false, &block)
  original = File.open(file_path, "r") { |fr| fr.read }
  modified = yield(original)
  modified = original if modified.nil?

  # Keep the scratch file beside the target: a fixed name in the current
  # directory could overwrite an unrelated file and may live on another
  # filesystem, where File.rename cannot move it.
  tmp_path = "#{file_path}.tmp.#{Process.pid}"
  backup_path = file_path + ".bak"
  begin
    File.open(tmp_path, "w") { |tf| tf.write(modified) }
    File.rename(file_path, backup_path)
    File.rename(tmp_path, file_path)
  ensure
    # Only still present if one of the steps above failed.
    File.unlink(tmp_path) if File.exist?(tmp_path)
  end
  File.unlink(backup_path) unless backup
end
|
#format_article!(text) ⇒ Object
121
122
123
124
125
126
127
128
129
130
131
132
133
|
# File 'lib/wp2txt/utils.rb', line 121
# Applies the article clean-up steps to +text+ in a fixed order.
#
# The first eight steps always run. The two template steps are skipped when
# $leave_template is set, and table removal is skipped when $leave_table is
# set.
def format_article!(text)
  %i[
    remove_directive! remove_emphasis! mndash! make_reference!
    format_ref! remove_hr! remove_tag! convert_characters!
  ].each { |step| send(step, text) }

  unless $leave_template
    correct_inline_template!(text)
    remove_templates!(text)
  end
  remove_table!(text) unless $leave_table
end
|
310
311
312
313
314
|
# File 'lib/wp2txt/utils.rb', line 310
# Step invoked by format_article!; the body is empty, so +page+ is left
# untouched and nil is returned.
def format_ref!(page)
end
|
112
113
114
115
116
117
118
119
|
# File 'lib/wp2txt/utils.rb', line 112
# Processes wiki links in +text+: nowiki sections are escaped first, then
# interwiki and external links are processed, and finally the nowiki
# sections are restored. +has_retried+ is accepted but not used here.
def format_wiki!(text, has_retried = false)
  escape_nowiki!(text)
  %i[process_interwiki_links! process_external_links!].each do |pass|
    send(pass, text)
  end
  unescape_nowiki!(text)
end
|
#make_reference!(str) ⇒ Object
303
304
305
306
307
308
|
# File 'lib/wp2txt/utils.rb', line 303
# Rewrites reference markup in +str+ in place with four substitutions, in
# order: matches of $make_reference_regex_a become a newline, matches of
# ..._b are removed, matches of ..._c become "[ref]" and matches of ..._d
# become "[/ref]". Returns the result of the last substitution.
def make_reference!(str)
  [
    [$make_reference_regex_a, "\n"],
    [$make_reference_regex_b, ""],
    [$make_reference_regex_c, "[ref]"]
  ].each { |pattern, replacement| str.gsub!(pattern, replacement) }
  str.gsub!($make_reference_regex_d, "[/ref]")
end
|
#mndash!(str) ⇒ Object
295
296
297
|
# File 'lib/wp2txt/utils.rb', line 295
# Replaces, in place, every match of $mndash_regex in +str+ with an en dash.
# Returns +str+, or nil when nothing matched.
def mndash!(str)
  str.gsub!($mndash_regex) { "–" }
end
|
#parse_wikitext(uri, wikitext, page) ⇒ Object
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
# File 'lib/wp2txt/mw_api.rb', line 43
# Sends +wikitext+ to the MediaWiki API at +uri+ with action "parse" and
# returns the text found under ["parse"]["text"]["*"] of the JSON reply
# after post-processing.
#
# uri      - API endpoint, handed to post_request unchanged.
# wikitext - wikitext sent as the "text" parameter.
# page     - page title sent as the "title" parameter.
#
# If reading or post-processing the reply raises a StandardError, "ERROR!"
# and the exception are printed and the process exits.
def parse_wikitext(uri, wikitext, page)
  # post_request concatenates "key=value" pairs without escaping them, so the
  # values are form-encoded here. URI.escape (removed in Ruby 3.0) is avoided;
  # it also left "&", "=" and "+" unescaped, which corrupts a form body.
  data = {
    "action" => "parse",
    "format" => "json",
    "text"   => URI.encode_www_form_component(wikitext),
    "title"  => URI.encode_www_form_component(page)
  }
  hash = JSON.parse(post_request(uri, data))
  begin
    result = hash["parse"]["text"]["*"]
    # NOTE(review): only special_chr! / chrref_to_utf! (with a bang) are
    # visible alongside this method - confirm the non-bang helpers exist.
    result = special_chr(result)
    # NOTE(review): as written, both gsub calls replace a string with itself;
    # verify against the original source what the replacements should be.
    return chrref_to_utf(result).gsub("[[", "[[").gsub("]]", "]]")
  rescue => e
    puts "ERROR!"
    p e
    exit
  end
end
|
#post_request(uri_string, data = {}) ⇒ Object
14
15
16
17
18
19
20
|
# File 'lib/wp2txt/mw_api.rb', line 14
# POSTs +data+ to +uri_string+ and returns the response body.
#
# uri_string - target URL; an empty path is replaced by "/".
# data       - hash sent as the request body in "key=value&key=value" form.
#              Keys and values are inserted as given, without escaping.
def post_request(uri_string, data={})
  body = data.map { |k, v| "#{k}=#{v}" }.join("&")
  uri = URI.parse(uri_string)
  uri.path = "/" if uri.path.empty?
  # Honour the port and scheme given in the URL instead of always talking
  # plain HTTP to the default port.
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true if uri.scheme == "https"
  http.post(uri.path, body).body
end
|
#process_external_links!(str) ⇒ Object
220
221
222
223
224
225
226
227
228
229
230
231
232
|
# File 'lib/wp2txt/utils.rb', line 220
# Replaces single-bracket spans in +str+ (in place) via
# process_nested_structure. The content of each span is split once on
# whitespace and its last part is kept; empty content becomes "".
def process_external_links!(str)
  replaced = process_nested_structure(StringScanner.new(str), "[", "]", $limit_recur) do |contents|
    contents.split(" ", 2).last || ""
  end
  str.replace(replaced)
end
|
#process_interwiki_links!(str) ⇒ Object
205
206
207
208
209
210
211
212
213
214
215
216
217
218
|
# File 'lib/wp2txt/utils.rb', line 205
# Replaces double-bracket spans in +str+ (in place) via
# process_nested_structure. The content of each span is split on "|": a
# single part is kept as is, otherwise the first part is dropped and the
# remaining parts are re-joined with "|".
def process_interwiki_links!(str)
  replaced = process_nested_structure(StringScanner.new(str), "[[", "]]", $limit_recur) do |contents|
    head, *tail = contents.split("|")
    tail.empty? ? (head || "") : tail.join("|")
  end
  str.replace(replaced)
end
|
#process_nested_structure(scanner, left, right, recur_count, &block) ⇒ Object
parser for nested structure ####################
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
|
# File 'lib/wp2txt/utils.rb', line 137
# Resolves spans delimited by +left+ and +right+ in the scanner's string,
# innermost first, replacing each span (delimiters included) with the value
# the block returns for the text between the delimiters.
#
# One span is replaced per pass; the method then calls itself on the new
# string until a pass changes nothing or +recur_count+ passes have been
# used up.
#
# scanner     - StringScanner positioned on the text to process.
# left, right - opening and closing delimiters. The five bracket pairs
#               listed below use the corresponding global regexes; any
#               other pair gets a regex built from the delimiters.
# recur_count - maximum number of further passes.
#
# Returns the processed string. If anything raises, the scanner's current
# string is returned instead.
def process_nested_structure(scanner, left, right, recur_count, &block)
  buffer = ""
  begin
    regex = case [left, right]
            when ["[", "]"]   then $single_square_bracket_regex
            when ["[[", "]]"] then $double_square_bracket_regex
            when ["{", "}"]   then $single_curly_bracket_regex
            when ["{{", "}}"] then $double_curly_bracket_regex
            when ["{|", "|}"] then $curly_square_bracket_regex
            else
              # Double quotes are required here so the escaped delimiters are
              # actually interpolated into the pattern.
              Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})", Regexp::MULTILINE)
            end

    has_left = false
    while (str = scanner.scan_until(regex))
      case scanner[1]
      when left
        buffer << str
        has_left = true
      when right
        if has_left
          # buffer ends with the most recent opening delimiter; str ends with
          # the closing one. Strip each by its own length.
          buffer = buffer[0...-(left.size)]
          contents = block.call(str[0...-(right.size)])
          buffer << contents
          break
        else
          buffer << str
        end
      end
    end
    buffer << scanner.rest

    recur_count = recur_count - 1
    if recur_count < 0 || buffer == scanner.string
      return buffer
    else
      scanner.string = buffer
      return process_nested_structure(scanner, left, right, recur_count, &block) || ""
    end
  rescue
    return scanner.string
  end
end
|
#remove_directive!(str) ⇒ Object
266
267
268
|
# File 'lib/wp2txt/utils.rb', line 266
# Deletes, in place, every match of $remove_directives_regex from +str+.
# Returns +str+, or nil when nothing matched.
def remove_directive!(str)
  str.gsub!($remove_directives_regex) { "" }
end
|
#remove_emphasis!(str) ⇒ Object
270
271
272
273
274
|
# File 'lib/wp2txt/utils.rb', line 270
# Replaces, in place, every match of $remove_emphasis_regex in +str+ with
# the text of the match's second capture group.
def remove_emphasis!(str)
  str.gsub!($remove_emphasis_regex) { Regexp.last_match(2) }
end
|
#remove_hr!(page) ⇒ Object
299
300
301
|
# File 'lib/wp2txt/utils.rb', line 299
# Deletes, in place, every match of $remove_hr_regex from +page+.
# Returns +page+, or nil when nothing matched.
def remove_hr!(page)
  page.gsub!($remove_hr_regex) { "" }
end
|
#remove_inbetween!(str, tagset = ['<', '>']) ⇒ Object
256
257
258
259
260
|
# File 'lib/wp2txt/utils.rb', line 256
# Deletes, in place, every span of +str+ that starts with tagset[0], ends
# with tagset[1] and contains none of the tagset's characters in between.
# With the default tagset this removes "<...>" spans.
# Returns +str+, or nil when nothing matched.
def remove_inbetween!(str, tagset = ['<', '>'])
  opener = tagset[0]
  closer = tagset[1]
  excluded = Regexp.quote(tagset.uniq.join(""))
  pattern = Regexp.new("#{Regexp.escape(opener)}[^#{excluded}]*#{Regexp.escape(closer)}")
  str.gsub!(pattern, "")
end
|
#remove_table!(str) ⇒ Object
244
245
246
247
248
249
250
|
# File 'lib/wp2txt/utils.rb', line 244
# Removes "{| ... |}" spans from +str+ in place by running
# process_nested_structure with a block that replaces each span with "".
def remove_table!(str)
  stripped = process_nested_structure(StringScanner.new(str), "{|", "|}", $limit_recur) { |_contents| "" }
  str.replace(stripped)
end
|
#remove_tag!(str) ⇒ Object
262
263
264
|
# File 'lib/wp2txt/utils.rb', line 262
# Deletes, in place, every match of $remove_tag_regex from +str+.
# Returns +str+, or nil when nothing matched.
def remove_tag!(str)
  str.gsub!($remove_tag_regex) { "" }
end
|
#remove_templates!(str) ⇒ Object
methods used from format_article ####################
236
237
238
239
240
241
242
|
# File 'lib/wp2txt/utils.rb', line 236
# Removes "{{ ... }}" spans from +str+ in place by running
# process_nested_structure with a block that replaces each span with "".
def remove_templates!(str)
  stripped = process_nested_structure(StringScanner.new(str), "{{", "}}", $limit_recur) { |_contents| "" }
  str.replace(stripped)
end
|
#rename(files) ⇒ Object
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
|
# File 'lib/wp2txt/utils.rb', line 399
# Renames every path in +files+ by appending ".txt". Where a name ends in
# "-<digits>", the number is first zero-padded to the width of the longest
# such digit run found in +files+.
def rename(files)
  suffix = /\-(\d+)\z/
  maxwidth = files.inject(0) do |widest, name|
    width = name.slice(suffix, 1).to_s.length
    widest < width ? width : widest
  end

  files.each do |name|
    padded = name.sub(suffix) { "-" + sprintf("%0#{maxwidth}d", Regexp.last_match(1).to_i) }
    File.rename(name, padded + ".txt")
  end
end
|
#sec_to_str(int) ⇒ Object
convert int of seconds to string in the format 00:00:00
417
418
419
420
421
422
423
424
425
426
427
|
# File 'lib/wp2txt/utils.rb', line 417
# Converts a number of seconds into a string of the form "HH:MM:SS".
#
# int - seconds as a number; fractional seconds are rounded down. A nil or
#       false value yields the placeholder "--:--:--".
def sec_to_str(int)
  return "--:--:--" unless int

  # Work on whole seconds so that a Float input cannot leave a fractional
  # hour behind and throw the minutes off.
  hours, remainder = int.floor.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  sprintf("%02d:%02d:%02d", hours, minutes, seconds)
end
|
#special_chr!(str) ⇒ Object
252
253
254
|
# File 'lib/wp2txt/utils.rb', line 252
# Replaces the content of +str+ with the result of $html_decoder.decode(str).
# Returns +str+.
def special_chr!(str)
  decoded = $html_decoder.decode(str)
  str.replace(decoded)
end
|
#unescape_nowiki!(str) ⇒ Object
198
199
200
201
202
203
|
# File 'lib/wp2txt/utils.rb', line 198
# Replaces, in place, every match of $unescape_nowiki_regex in +str+ with
# the text stored in @nowikis under the integer value of capture 1.
def unescape_nowiki!(str)
  str.gsub!($unescape_nowiki_regex) { @nowikis[Regexp.last_match(1).to_i] }
end
|