Module: EbookTools
Instance Method Summary collapse
- #allow_extract_struct?(file) ⇒ Boolean
- #batch_convert(source, destination, options = {}) ⇒ Object
-
#batch_extract_from_dir(source, destination, options = {}) ⇒ Object
batch_extract_from_dir: batch-extracts book structure from a directory. Parameters:
source: source directory; destination: output directory; options: optional parameters.
- #convert(filename, epub_file, options = {}) ⇒ Object
- #extract_book_struct_to_file(source, destination, options = {}) ⇒ Object
- #extract_text_from_file(filename, format) ⇒ Object
-
#html2epub(filename, epub_file, options = {}) ⇒ Object
html2epub 将HTML格式转换成EPUB格式.
-
#pdf2epub(filename, epub_file, options = {}) ⇒ Object
pdf2epub 将PDF格式转换成EPUB格式.
-
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text.
-
#text_paras_repair(source_file, target_file, options = {}) ⇒ Object
text_paras_repair 对文本文件格式中的中断段落进行修复.
-
#txt2epub(filename, epub_file, options = {}) ⇒ Object
txt2epub 将文本格式转换成EPUB格式.
Instance Method Details
#allow_extract_struct?(file) ⇒ Boolean
198 199 200 201 |
# File 'lib/ebook_tools.rb', line 198
# Reports whether book structure can be extracted from +file+.
#
# +file+ path or file name; only its extension is inspected.
#
# Returns true when the extension is .txt, .html or .epub (compared
# case-insensitively), false otherwise.
def allow_extract_struct?(file)
  case File.extname(file).downcase
  when '.txt', '.html', '.epub' then true
  else false
  end
end
#batch_convert(source, destination, options = {}) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# File 'lib/ebook_tools.rb', line 102
# Converts every file found under +source+ to EPUB and sorts the inputs into
# result directories below +destination+.
#
# +source+      directory scanned via Utils.scan_file_from_dir.
# +destination+ output root; the method writes to its sub-directories
#               epub/ (results), backup/ (successfully handled inputs),
#               scan/ (PDFs reported by PDF.scan_pdf?) and unknown/ (inputs
#               whose conversion raised).
# +options+     optional hash; only :format is read and it is forwarded to
#               Utils.scan_file_from_dir.
#
# Progress is appended to batch.log, success.log, error.log, scan.log and
# unknown.log in the current working directory. Returns nil.
def batch_convert(source, destination, options = {})
  log = File.open('batch.log', 'a')
  success_log = File.open('success.log', 'a')
  error_log = File.open('error.log', 'a')
  scan_log = File.open('scan.log', 'a')
  unknown_log = File.open('unknown.log', 'a')

  source_path = File.absolute_path(source)
  output_root = File.absolute_path(destination)
  dest_path = File.join(output_root, 'epub')
  scan_path = File.join(output_root, 'scan')
  unknown_path = File.join(output_root, 'unknown')
  backup_path = File.join(output_root, 'backup')

  files = Utils.scan_file_from_dir(source_path, :format => options[:format])
  total_count = files.count
  scan_count = 0
  success_count = 0
  error_count = 0
  unknown_count = 0

  puts "count: #{total_count} file "
  log.puts "****batch convert****** : #{Time.now}"
  log.puts "#{source_path} => #{dest_path} "
  log.puts "count: #{total_count} file "
  [success_log, error_log, scan_log, unknown_log].each do |io|
    io.puts "****batch convert****** : #{Time.now}"
    io.puts "#{source_path} => #{dest_path} "
  end

  # Moves +file+ below +root+, keeping its path relative to the source
  # directory. `sub` strips only the leading source prefix; `gsub` would also
  # eat a nested directory whose path repeats the source path.
  relocate = lambda do |file, root|
    target = File.join(root, file.sub(source_path, ''))
    FileUtils.mkdir_p(File.dirname(target))
    FileUtils.mv(file, target, :force => true)
  end

  files.each do |file|
    relative = file.sub(source_path, '')
    dest_file = File.join(File.dirname(File.join(dest_path, relative)),
                          "#{File.basename(file, File.extname(file))}.epub")
    keywords = Utils.extract_keywords_from_path(File.dirname(file).sub(source_path, ''))
    puts "start convert #{file}"
    extname = File.extname(file).gsub('.', '')
    method_name = "#{extname}2epub"

    unless extname == 'epub' || EbookTools.respond_to?(method_name)
      # No converter for this extension: leave the file where it is, but
      # record it instead of skipping it silently.
      unknown_count += 1
      unknown_log.puts "unknown: #{file} has no converter."
      next
    end

    begin
      if extname == 'epub'
        # Already an EPUB: copy it into the (possibly nested) output directory.
        FileUtils.mkdir_p(File.dirname(dest_file))
        FileUtils.cp(file, dest_file)
      elsif extname == 'pdf' && PDF.scan_pdf?(file)
        # The scan probe is PDF-specific, so it is only run on .pdf inputs.
        relocate.call(file, scan_path)
        scan_count += 1
        scan_log.puts "warning: #{file} is scan pdf."
        next
      else
        FileUtils.mkdir_p(File.dirname(dest_file))
        EbookTools.send(method_name, file, dest_file, { :keywords => keywords })
      end
      relocate.call(file, backup_path)
      success_count += 1
      success_log.puts "success: #{file} conversion successfully!"
    rescue StandardError => e
      # StandardError (not Exception) so Ctrl-C / exit still stop the batch.
      relocate.call(file, unknown_path) if File.exist?(file)
      error_count += 1
      error_log.puts "error: #{file} #{e.class}: #{e.message}\n#{Array(e.backtrace).join("\n")}"
    end
  end

  success_log.puts "count: #{success_count} Time: #{Time.now} \n"
  scan_log.puts "count: #{scan_count} Time: #{Time.now} \n"
  error_log.puts "count: #{error_count} Time: #{Time.now} \n"
  unknown_log.puts "unknown: #{unknown_count} Time: #{Time.now} \n"
  log.puts "success: #{success_count} scan: #{scan_count} error: #{error_count} Time: #{Time.now} \n"
ensure
  # A log may be nil here if an earlier File.open raised.
  [success_log, error_log, scan_log, unknown_log, log].compact.each(&:close)
end
#batch_extract_from_dir(source, destination, options = {}) ⇒ Object
batch_extract_from_dir
Batch-extracts book structure from a directory.
parameters:
+source+ source directory
+destination+ output directory
+options+ optional parameter.
:format 指定需要提取结构的文件后缀名,例如要从所有txt文件中提取,通过:format=>'.txt'指定
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
# File 'lib/ebook_tools.rb', line 235
# Extracts the book structure of every file found under +source+ and writes
# one "<basename>.xml" per input below +destination+, mirroring the input's
# sub-directory.
#
# +source+      directory handed to Utils.scan_file_from_dir.
# +destination+ output directory.
# +options+     optional hash; :format is forwarded to
#               Utils.scan_file_from_dir. The hash is not modified.
#
# Files rejected by allow_extract_struct? and files whose extraction raises
# are reported on stdout and skipped. Returns the list of scanned files.
def batch_extract_from_dir(source, destination, options = {})
  format = options[:format]
  files = Utils.scan_file_from_dir(source, { :format => format })
  files.each do |file|
    extname = File.extname(file)
    basename = File.basename(file, extname)
    # `sub` removes only the leading source prefix; `gsub` would also strip a
    # nested directory whose name repeats +source+.
    relative = file.sub(source, '')
    dest_file = File.join(File.dirname(File.join(destination, relative)), "#{basename}.xml")

    unless allow_extract_struct?(file)
      puts "error: #{file}不是允许的文件格式: txt,html,epub"
      next
    end

    puts "start extract #{file} ..."
    begin
      if extract_book_struct_to_file(file, dest_file)
        puts "success: extract book struct successfully!"
      else
        puts "警告: 没有检测到书结构信息."
      end
    rescue StandardError => e
      # StandardError (not Exception) so Interrupt/SystemExit still propagate.
      puts "error: #{file} #{e.class}: #{e.message}\n#{Array(e.backtrace).join("\n")}"
    end
  end
end
#convert(filename, epub_file, options = {}) ⇒ Object
10 11 12 13 14 15 16 17 18 |
# File 'lib/ebook_tools.rb', line 10
# Converts +filename+ to EPUB by dispatching to the EbookTools method named
# "<extension>2epub" (for example txt2epub for "book.txt").
#
# +filename+  path of the source document; its extension, with dots removed
#             and case preserved, selects the converter.
# +epub_file+ output path handed to the converter.
# +options+   hash forwarded unchanged to the converter.
#
# Returns true after the converter has been called, or nil when EbookTools
# does not respond to a matching converter.
def convert(filename, epub_file, options = {})
  method_name = "#{File.extname(filename).delete('.')}2epub"
  return nil unless EbookTools.respond_to?(method_name)

  EbookTools.send(method_name, filename, epub_file, options)
  true
end
#extract_book_struct_to_file(source, destination, options = {}) ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
# File 'lib/ebook_tools.rb', line 203
# Builds a TxtBook from +source+ and writes its DocBook XML to +destination+.
#
# +source+      path of a .txt, .html or .epub file (extension matched
#               case-insensitively). .html and .epub inputs are turned into
#               text with extract_text_from_file; .epub text is additionally
#               passed through sanitize_for_epub_text. For any other
#               extension the content handed to TxtBook is nil.
# +destination+ path of the XML file; missing parent directories are created.
# +options+     hash passed to TxtBook.new. :title defaults to the source
#               file name without its extension. The caller's hash is not
#               modified.
#
# Returns true when XML was written, nil when TxtBook#to_doc_book produced
# nothing.
def extract_book_struct_to_file(source, destination, options = {})
  extname = File.extname(source)
  # Merge into a copy instead of writing the default title into the caller's hash.
  options = options.merge(:title => options[:title] || File.basename(source, extname))

  content =
    case extname.downcase
    when '.html'
      # Pass the real extension so the basename is stripped even for ".HTML".
      extract_text_from_file(source, extname)
    when '.epub'
      sanitize_for_epub_text(extract_text_from_file(source, extname))
    when '.txt'
      File.read(source)
    end

  txt_book = TxtBook.new(content, options)
  docbook_xml = txt_book.to_doc_book
  return nil unless docbook_xml

  FileUtils.mkdir_p(File.dirname(destination))
  File.open(destination, 'wb') { |file| file.write(docbook_xml) }
  puts "目录结构:"
  puts txt_book.toc_to_text
  puts "共修复#{txt_book.breaklines_count}个断点."
  true
end
#extract_text_from_file(filename, format) ⇒ Object
269 270 271 272 273 274 275 276 |
# File 'lib/ebook_tools.rb', line 269
# Converts +filename+ to plain text with the external `ebook-convert` command
# and returns the text.
#
# +filename+ path of the document to convert.
# +format+   extension (for example '.html') stripped from the file name to
#            build the intermediate "<basename>.txt".
#
# The intermediate file is created in the current working directory and is
# removed again, even when conversion or reading fails.
# Raises a SystemCallError (e.g. Errno::ENOENT) when the command cannot be
# started or produced no output file.
def extract_text_from_file(filename, format)
  txt_path = "#{File.basename(filename, format)}.txt"
  begin
    # Argument-vector form: no shell is involved, so spaces or shell
    # metacharacters in the path are passed through literally. The command's
    # stdout is read and discarded.
    IO.popen(['ebook-convert', filename, txt_path], &:read)
    File.read(txt_path)
  ensure
    FileUtils.remove_file(txt_path, true)
  end
end
#html2epub(filename, epub_file, options = {}) ⇒ Object
html2epub 将HTML格式转换成EPUB格式
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/ebook_tools.rb', line 49
# html2epub converts an HTML file to EPUB.
#
# +filename+  path of the source .html file.
# +epub_file+ output path handed to EPUB.write_epub.
# +options+   hash merged with :files => [nav_file, html_file] and passed to
#             EPUB.write_epub.
#
# The HTML is copied into a working directory named after the file's basename
# (relative to the current working directory). The directory is removed
# afterwards only if this call created it, so an unrelated file or directory
# that already has that name is never deleted.
# Returns whatever EPUB.write_epub returns.
def html2epub(filename, epub_file, options = {})
  basename = File.basename(filename, '.html')
  temp_dir = basename
  created_temp_dir = !File.exist?(temp_dir)
  FileUtils.mkdir(temp_dir) if created_temp_dir

  html = File.read(filename)
  html_file = File.join(temp_dir, "#{basename}.html")
  Utils.write_file(html, html_file)

  sections = Utils.detect_sections_from_html(html_file)
  nav_file = EPUB.gen_nav_file(html_file, sections)
  EPUB.write_epub(epub_file, options.merge(:files => [nav_file, html_file]))
ensure
  FileUtils.remove_dir(temp_dir, true) if created_temp_dir
end
#pdf2epub(filename, epub_file, options = {}) ⇒ Object
pdf2epub 将PDF格式转换成EPUB格式
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# pdf2epub converts a PDF file to EPUB.
#
# NOTE(review): this listing lost several identifiers during extraction: the
# `options` parameter name in the signature, the method name in
# `PDF.(pages_text,)`, and the names in `= PDF.(filename)` and
# `= .merge().merge(:files=>files)`. It is therefore kept verbatim; restore the
# missing names from lib/ebook_tools.rb before relying on it.
#
# Visible flow: create a working directory named after the PDF's basename;
# read per-page text with PDF.extract_pdf_pages_text and pass it through
# PDF.fixed_break_with_pages_text; collect PDF.extract_sections and
# PDF.extract_illustrations (written with :dir => temp_dir); build the HTML
# with PDF.gen_html_from_sections_and_page_texts and Utils.wrapper_html and
# write it into the working directory; generate a nav file with
# EPUB.gen_nav_file; hand the HTML, nav and illustration paths to
# EPUB.write_epub as :files. The `ensure` clause always removes the working
# directory.
# NOTE(review): the working directory is relative to the current directory and
# is removed unconditionally — presumably an existing directory with the same
# name would be deleted as well; confirm.
# File 'lib/ebook_tools.rb', line 69 def pdf2epub(filename,epub_file,={}) basename = File.basename(filename,'.pdf') temp_dir = "#{basename}" FileUtils.mkdir(temp_dir) unless File.exists?(temp_dir) pages_text = PDF.extract_pdf_pages_text(filename) pages_text = PDF.(pages_text,) pages_text = PDF.fixed_break_with_pages_text(pages_text) sections = PDF.extract_sections(filename) illustrations = PDF.extract_illustrations(filename,{:dir=>temp_dir}) html_content = PDF.gen_html_from_sections_and_page_texts(sections,pages_text,illustrations) html = Utils.wrapper_html(html_content) html_file = File.join([temp_dir,"#{basename}.html"].compact) Utils.write_file(html,html_file) illustrations_path = illustrations.map{|image_path| File.join(temp_dir,image_path)} nav_file = EPUB.gen_nav_file(html_file,sections) files = [html_file,nav_file,illustrations_path].flatten = PDF.(filename) = .merge().merge(:files=>files) EPUB.write_epub(epub_file,) ensure FileUtils.remove_dir(temp_dir,true) end |
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text
279 280 281 282 283 284 285 286 287 288 289 290 |
# File 'lib/ebook_tools.rb', line 279
# Truncates text at the first line that mentions "document outline".
#
# +content+ text to clean, or nil.
#
# Returns nil or the empty string unchanged. Otherwise returns all lines that
# precede the first line containing "document outline" (matched
# case-insensitively), or the whole text when no line contains it.
def sanitize_for_epub_text(content)
  # Core-Ruby guard (no ActiveSupport `blank?` needed): whitespace-only text
  # simply passes through the line filter below unchanged.
  return content if content.nil? || content.empty?

  content.each_line.take_while { |line| !line.downcase.include?('document outline') }.join
end
#text_paras_repair(source_file, target_file, options = {}) ⇒ Object
text_paras_repair 对文本文件格式中的中断段落进行修复
262 263 264 265 266 267 |
# File 'lib/ebook_tools.rb', line 262
# text_paras_repair repairs broken paragraphs in a text file.
#
# +source_file+ path of the text file to read.
# +target_file+ path the repaired text is written to (overwritten).
# +options+     hash forwarded to Utils.fixed_page_break.
#
# The text is passed through Utils.to_utf8 when Utils.detect_utf8 rejects it,
# then through Utils.fixed_page_break. Returns the value of File#write for
# the target file.
def text_paras_repair(source_file, target_file, options = {})
  content = File.read(source_file)
  # Pass the text that was read; the call previously had no argument.
  # NOTE(review): assumes Utils.to_utf8 takes the text to convert — confirm.
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  content = Utils.fixed_page_break(content, options)
  File.open(target_file, 'w') { |file| file.write(content) }
end
#txt2epub(filename, epub_file, options = {}) ⇒ Object
txt2epub 将文本格式转换成EPUB格式
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/ebook_tools.rb', line 22
# txt2epub converts a plain-text file to EPUB.
#
# +filename+  path of the source .txt file.
# +epub_file+ output path handed to EPUB.write_epub.
# +options+   hash; when :fix is truthy the text is passed through
#             Utils.fixed_page_break first. The hash, merged with
#             :files => [nav_file, html_file], is passed to EPUB.write_epub.
#
# The intermediate HTML is written into a working directory named after the
# file's basename (relative to the current working directory). The directory
# is removed afterwards only if this call created it, so an unrelated file or
# directory that already has that name is never deleted.
# Returns whatever EPUB.write_epub returns.
def txt2epub(filename, epub_file, options = {})
  basename = File.basename(filename, '.txt')
  temp_dir = basename
  created_temp_dir = !File.exist?(temp_dir)
  FileUtils.mkdir(temp_dir) if created_temp_dir

  title, outlines, content = TXT.extract_book_part(filename)
  content = Utils.fixed_page_break(content) if options[:fix]

  html = Utils.wrapper_html(TXT.gen_html_from_txt_book(title, outlines, content))
  html_file = File.join(temp_dir, "#{basename}.html")
  Utils.write_file(html, html_file)

  sections = Utils.detect_sections_from_html(html_file)
  nav_file = EPUB.gen_nav_file(html_file, sections)
  EPUB.write_epub(epub_file, options.merge(:files => [nav_file, html_file]))
ensure
  FileUtils.remove_dir(temp_dir, true) if created_temp_dir
end