Class: Wp2txt::Runner
Constant Summary
Constants included from Wp2txt
Instance Method Summary collapse
- #extract ⇒ Object
- #extract_and_convert(&block) ⇒ Object
-
#extract_text(&block) ⇒ Object
call this method to do the job.
- #file_size(file) ⇒ Object
-
#fill_buffer ⇒ Object
read text data from the input file (bz2-compressed or plain XML) in chunks of 10 megabytes (10485760 bytes).
- #get_newline ⇒ Object
- #get_page ⇒ Object
-
#initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10) ⇒ Runner
constructor
A new instance of Runner.
-
#notify_parent(last = false) ⇒ Object
control the display of command line progressbar (or gui which is not available for now).
-
#prepare ⇒ Object
check the size of input file (bz2 or plain xml) when uncompressed.
Methods included from Wp2txt
#batch_file_mod, #chrref_to_utf!, #collect_files, #convert_characters!, #correct_inline_template!, #correct_separator, #decimal_format, #escape_nowiki!, #expand_template, #file_mod, #format_article!, #format_ref!, #format_wiki!, #make_reference!, #mndash!, #parse_wikitext, #post_request, #process_external_links!, #process_interwiki_links!, #process_nested_structure, #remove_directive!, #remove_emphasis!, #remove_hr!, #remove_inbetween!, #remove_table!, #remove_tag!, #remove_templates!, #rename, #sec_to_str, #special_chr!, #unescape_nowiki!
Constructor Details
#initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10) ⇒ Runner
Returns a new instance of Runner.
32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/wp2txt.rb', line 32

# Stores the settings for one run; no file is opened here (see #prepare).
#
# @param parent [Object] object that receives progress callbacks; kept in @parent
# @param input_file [String] path of the dump file to process
# @param output_dir [String] directory the output files are written to
# @param tfile_size [Numeric] size threshold of one output file, in megabytes
# @param convert [Boolean] true to convert articles, false to only split the XML
# @param strip_tmarker [Boolean] passed on to Article.new during conversion
# @param limit_recur [Integer] max number of recursive calls; published as $limit_recur
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
  # input / output locations
  @input_file = input_file
  @output_dir = output_dir
  @fp         = nil # handle of the output file currently being written

  # processing options
  @tfile_size    = tfile_size
  @convert       = convert
  @strip_tmarker = strip_tmarker

  # progress listener
  @parent = parent

  # max number of recursive calls (global variable)
  $limit_recur = limit_recur
end
Instance Method Details
#extract ⇒ Object
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 |
# File 'lib/wp2txt.rb', line 325

# Copies the input unchanged into a sequence of output files.
#
# Lines are collected in memory; once more than @tfile_size megabytes have
# been collected, the collection is written out at the next line containing
# "</page" and a new output file is started. After the input is exhausted the
# remainder is written, the parent is notified, and the output files are
# handed to #rename.
#
# @return [Object] whatever @parent.msg returns
def extract
  pending = ""
  while (line = get_newline)
    @count = (@count || 0) + 1
    @size_read = (@size_read || 0) + line.bytesize
    @total_size += line.bytesize
    pending << line
    # the collected size only grows until a rotation, so re-testing it each
    # round is the same as remembering that the limit was crossed
    over_limit = @total_size > (@tfile_size * 1024 * 1024)
    notify_parent

    # never close the file until the end of the page, even when over the limit
    next unless over_limit && line =~ /<\/page/

    @fp.puts(pending)
    @fp.close
    pending = ""
    @total_size = 0
    @file_index += 1
    next_name = File.join(@output_dir, @outfile_base + @file_index.to_s)
    @outfiles << next_name
    @fp = File.open(next_name, "w")
  end

  @fp.puts(pending) unless pending.empty?
  notify_parent(true)
  @parent.after
  @fp.close
  rename(@outfiles)
  @parent.msg("Processing finished", 1)
end
#extract_and_convert(&block) ⇒ Object
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 |
# File 'lib/wp2txt.rb', line 235

# Converts every page of the input and writes the results to output files.
#
# Each page returned by #get_page is wrapped in a <mediawiki> root element
# and parsed with Nokogiri. Pages whose title contains ":" are skipped, as
# are pages that have no <text> or <title> element. The page text (with XML
# comments removed) is wrapped in an Article and handed to the block; the
# string returned by the block is appended to the pending output. When the
# pending output exceeds @tfile_size megabytes it is written out and a new
# output file is started.
#
# (Oga- and Ox-based parsing were tried here earlier; Nokogiri is the parser
# in use.)
#
# @yieldparam article [Article] built from (text, title, @strip_tmarker)
# @yieldreturn [String] converted text to append to the output
# @raise [ArgumentError] if an article has to be converted but no block was given
# @return [Object] whatever @parent.msg returns
def extract_and_convert(&block)
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
  output_text = ""

  while (page = get_page)
    xml = xmlns + page + "</mediawiki>"
    input = Nokogiri::XML(xml, nil, 'UTF-8')

    text_node = input.xpath("//xmlns:text").first
    next unless text_node # page without a <text> element: nothing to convert

    title_node = text_node.parent.parent.at_css "title"
    next unless title_node

    title = title_node.content
    next if /\:/ =~ title

    text = text_node.content

    # remove all comment texts and insert, in place of each comment, as many
    # newline characters as the comment contained ("\n" * 0 is "")
    text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) { |comment| "\n" * comment.count("\n") }

    @count = (@count || 0) + 1
    raise ArgumentError, "extract_and_convert needs a block to convert articles" unless block

    article = Article.new(text, title, @strip_tmarker)
    # append in place: += would copy the whole pending output for every article
    output_text << block.call(article)
    @total_size = output_text.bytesize

    # keep collecting until the data exceeds the size of one output file
    next unless @total_size > (@tfile_size * 1024 * 1024)

    # close the present file, then open a new one
    @fp.puts(output_text)
    output_text = ""
    @total_size = 0
    @fp.close
    @file_index += 1
    outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
    @outfiles << outfilename
    @fp = File.open(outfilename, "w")
  end

  @fp.puts(output_text) if output_text != ""
  notify_parent(true)
  @parent.after
  @fp.close
  rename(@outfiles)
  @parent.msg("Processing finished", 1)
end
#extract_text(&block) ⇒ Object
call this method to do the job
221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/wp2txt.rb', line 221

# Call this method to do the job: runs #prepare, then either converts the
# articles or just splits the original XML, depending on @convert.
#
# @yield forwarded unchanged to #extract_and_convert
# @return [Object] the result of #extract_and_convert or #extract
def extract_text(&block)
  prepare

  # output the original xml only split to files of the specified size
  return extract unless @convert

  # passing a nil block along is the same as calling without a block
  extract_and_convert(&block)
end
#file_size(file) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/wp2txt.rb', line 46

# Measures how many bytes can be read from +file+ by reading it to the end.
#
# The result is counted in bytes (not characters) so that it is comparable
# with @size_read, which is accumulated with String#bytesize.
#
# @param file [#read] reader positioned at the start of the data; it is
#   consumed by this call
# @return [Integer] number of bytes read before the end of the data
def file_size(file)
  chunk_bytes = 10_485_760 # read 10 MiB at a time
  size = 0
  loop do
    chunk = begin
      file.read(chunk_bytes)
    rescue StandardError
      # a failing read is treated as the end of the data (best effort)
      nil
    end
    break if chunk.nil? || chunk.empty?

    size += chunk.bytesize
  end
  size
end
#fill_buffer ⇒ Object
read text data from the input file (bz2-compressed or plain XML) in chunks of 10 megabytes (10485760 bytes)
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/wp2txt.rb', line 149

# Reads further data from @file_pointer, 10485760 bytes (10 MiB) at a time,
# and appends it to @buffer split into lines.
#
# @buffer holds complete lines (each ending in "\n") followed by one last
# element that is the still-incomplete line ("" when the data read so far
# ended exactly on a newline). Reading continues until @buffer holds at least
# one complete line.
#
# @return [true, nil] true when @buffer gained at least one complete line;
#   nil when nothing more can be read (end of data or a failing read)
def fill_buffer
  @buffer = [""] if @buffer.nil? || @buffer.empty?
  loop do
    chunk = begin
      @file_pointer.read(10_485_760)
    rescue StandardError
      # a failing read is treated as the end of the data
      nil
    end
    return nil if chunk.nil? || chunk.empty?

    # split after every "\n", keeping the terminator on each piece
    pieces = chunk.lines("\n")
    # the first piece continues the incomplete line left by the previous read
    @buffer.last << pieces.shift
    @buffer.concat(pieces)
    # keep the invariant that the last element is the incomplete line
    @buffer << "" if @buffer.last.end_with?("\n")
    break if @buffer.size > 1
  end
  true
end
#get_newline ⇒ Object
182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/wp2txt.rb', line 182

# Returns the next line of the input, refilling @buffer through #fill_buffer
# when only the incomplete trailing element is left.
#
# When the input is exhausted, a final line that has no terminating newline
# is still returned once (it used to be dropped); after that nil is returned.
#
# @return [String, nil] the next line, or nil when there is no more input
def get_newline
  @buffer ||= [""]
  if @buffer.size == 1 && !fill_buffer
    # nothing more to read: hand out what is left of the last line, if any
    tail = @buffer.shift
    return tail.empty? ? nil : tail
  end
  # Array#shift gives nil on an empty buffer, i.e. after the input has ended
  @buffer.shift
end
#get_page ⇒ Object
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/wp2txt.rb', line 195

# Collects the lines between the next "<page>" line and its "</page>" line.
#
# The two delimiter lines themselves are not included. Every line read is
# added to @size_read (in bytes) and reported through #notify_parent. A page
# with nothing between its delimiters is skipped, so that false is returned
# only when the input has run out.
#
# @return [String, false] the page content tagged as UTF-8, or false when no
#   further page content is available
def get_page
  inside_page = false
  page = ""
  while (line = get_newline)
    notify_parent
    @size_read = (@size_read || 0) + line.bytesize
    if /<page>/ =~ line
      inside_page = true
    elsif /<\/page>/ =~ line
      inside_page = false
      # an empty page must not be mistaken for the end of the input
      break unless page.empty?
    elsif inside_page
      page << line
    end
  end
  return false if page.empty?

  begin
    page.force_encoding("utf-8")
  rescue StandardError
    page
  end
end
#notify_parent(last = false) ⇒ Object
control the display of command line progressbar (or gui which is not available for now)
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/wp2txt.rb', line 71

# Controls the display of the command line progress bar (or gui, which is not
# available for now) by sending progress figures to @parent.prg_update.
#
# The time since the previous update is truncated to whole seconds before it
# is compared and accumulated, so an update is sent once at least one full
# second has passed, or whenever +last+ is true.
#
# @param last [Boolean] true for the final update; the remaining time is then
#   reported as "00:00:00"
# @return [Object, nil] the result of @parent.prg_update, or nil when no
#   update was sent
def notify_parent(last = false)
  @last_time ||= Time.now.to_f
  @elapsed_sum ||= 0
  time_now = Time.now.to_f
  elapsed_from_last = (time_now - @last_time).to_i
  return unless elapsed_from_last > 0.3 || last

  @last_time = time_now
  @elapsed_sum += elapsed_from_last
  bytes_read = @size_read.to_i   # nil (nothing read yet) counts as 0
  total_bytes = @infile_size.to_i

  # progress in hundredths of a percent; an empty input would otherwise
  # divide 0.0 by 0.0 and fail when converting NaN to an integer
  gvalue = if total_bytes > 0
             (bytes_read.to_f / total_bytes.to_f * 100 * 100).to_i
           else
             last ? 100 * 100 : 0
           end
  elt_str = sec_to_str(@elapsed_sum)

  eta_str = if last
              "00:00:00"
            else
              # whole bytes per second; 0 when less than one byte per second
              # has been read, in which case no estimate can be given
              bytes_persec = @elapsed_sum > 0 ? bytes_read / @elapsed_sum : 0
              remaining = [total_bytes - bytes_read, 0].max
              bytes_persec > 0 ? sec_to_str(remaining / bytes_persec) : "--:--:--"
            end
  @parent.prg_update(gvalue, elt_str, eta_str)
end
#prepare ⇒ Object
check the size of input file (bz2 or plain xml) when uncompressed
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# File 'lib/wp2txt.rb', line 95

# Checks the size of the input file (bz2 or plain xml) when uncompressed,
# opens the input for reading and opens the first output file.
#
# Sets @infile_size, @outfile_base, @total_size, @file_index, @outfiles, @fp
# (first output file) and @file_pointer (input reader), and announces the run
# to @parent through #before and #data_set.
#
# For a ".bz2" input the whole stream is decompressed once just to measure
# it, then reopened, because the stream cannot be rewound.
#
# @return [true]
def prepare
  # if output_dir is not specified, output in the same directory
  # as the input file
  @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

  # if input file is bz2 compressed, use bz2-ruby if available,
  # use command line bzip2 program otherwise.
  if /\.bz2\z/ =~ @input_file
    open_stream =
      if NO_BZ2
        # array form: the path is passed as one argument and never goes
        # through a shell, so spaces and metacharacters in it are harmless
        command = RUBY_PLATFORM.index("win32") ? ["bunzip2.exe", "-c"] : ["bzip2", "-c", "-d"]
        -> { IO.popen(command + [@input_file]) }
      else
        -> { Bzip2::Reader.new File.open(@input_file, "r:UTF-8") }
      end

    file = open_stream.call
    @parent.msg("Preparing ... This may take several minutes or more ", 0)
    @infile_size = file_size(file)
    @parent.msg("... Done.", 1)
    file.close
    # reopen since rewind method is unavailable
    file = open_stream.call
  else
    # meaning that it is a text file
    @infile_size = File.stat(@input_file).size
    # File.open, unlike Kernel#open, never treats a leading "|" as a command
    file = File.open(@input_file)
  end

  # create basename of output file
  @outfile_base = File.basename(@input_file, ".*") + "-"
  @total_size = 0
  @file_index = 1
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
  @outfiles = [outfilename]
  @fp = File.open(outfilename, "w")
  @parent.before
  @parent.data_set(@input_file, 100 * 100)
  @file_pointer = file
  true
end