Module: Html2Doc
- Defined in:
- lib/html2doc/base.rb,
lib/html2doc/mime.rb,
lib/html2doc/version.rb
Constant Summary collapse
- VERSION =
"0.5.0".freeze
Class Method Summary collapse
- .asciimath_to_mathml(doc, delims) ⇒ Object
- .cleanup(docxml, dir) ⇒ Object
- .define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
- .define_head1(docxml, dir) ⇒ Object
- .filename_substitute(stylesheet, header_filename, filename) ⇒ Object
- .generate_filelist(filename, dir) ⇒ Object
- .image_cleanup(docxml, dir) ⇒ Object
- .image_resize(orig_filename) ⇒ Object
- .mathml_to_ooml(docxml) ⇒ Object
- .mime_attachment(boundary, filename, item, dir) ⇒ Object
- .mime_boundary ⇒ Object
- .mime_package(result, filename, dir) ⇒ Object
- .mime_preamble(boundary, filename, result) ⇒ Object
- .mime_type(item) ⇒ Object
- .msonormal(docxml) ⇒ Object
- .msword_fix(r) ⇒ Object
- .namespace(root) ⇒ Object
- .process(result, filename, stylesheet, header_file, dir, asciimathdelims = nil) ⇒ Object
- .process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
- .rm_temp_files(filename, dir) ⇒ Object
- .stylesheet(filename, header_filename, fn) ⇒ Object
-
.xhtml(result) ⇒ Object
preserve HTML escapes.
Class Method Details
.asciimath_to_mathml(doc, delims) ⇒ Object
40 41 42 43 44 45 46 47 |
# File 'lib/html2doc/base.rb', line 40
# Replaces every AsciiMath span in +doc+ with its MathML rendering.
#
# doc    - String of markup that may contain delimited AsciiMath.
# delims - two-element Array of the literal opening and closing delimiter
#          strings; nil (or fewer than two entries) leaves +doc+ untouched.
#
# Returns the String with each delimited span replaced by a <math> element
# carrying the MathML namespace. The delimiters themselves are dropped.
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Delimiters are literal text, so they must be escaped before being placed
  # in a pattern (e.g. "$$" or "((" are regex metacharacters).
  opener, closer = delims.first(2).map { |d| Regexp.escape(d) }
  # Splitting with a capture group yields repeating groups of
  # [text, opening delimiter, math, closing delimiter].
  doc.split(/(#{opener}|#{closer})/).each_slice(4).map do |text, _open, math, _close|
    # No math part: trailing plain text, or an opening delimiter with
    # nothing after it.
    next text if math.nil?

    mathml = AsciiMath.parse(math).to_mathml.
      gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
    text + mathml
  end.join
end
.cleanup(docxml, dir) ⇒ Object
33 34 35 36 37 38 |
# File 'lib/html2doc/base.rb', line 33
# Runs the clean-up passes over the parsed document in a fixed order:
# image_cleanup, then mathml_to_ooml, then msonormal.
#
# docxml - the parsed document, modified in place by each pass.
# dir    - directory passed through to image_cleanup.
#
# Returns the same +docxml+ object.
def self.cleanup(docxml, dir)
  docxml.tap do |xml|
    image_cleanup(xml, dir)
    mathml_to_ooml(xml)
    msonormal(xml)
  end
end
.define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/html2doc/base.rb', line 146
# Inserts the stylesheet into the document head, then adds the Word-specific
# head content and the namespace declarations.
#
# docxml      - parsed document; it must contain a head element.
# dir         - directory name used by define_head1 for the file-list link.
# filename    - base name substituted into the stylesheet.
# cssname     - path of the stylesheet to embed (see stylesheet).
# header_file - header file path, or nil when there is none.
#
# Returns whatever namespace returns for the document root.
def self.define_head(docxml, dir, filename, cssname, header_file)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  css = stylesheet(filename, header_file, cssname)
  if title
    # Keep the title first; the stylesheet goes right after it.
    title.add_next_sibling css
  else
    # prepend_child also covers a head with no children at all, where
    # children.first would be nil.
    head.prepend_child css
  end
  define_head1(docxml, dir)
  namespace(docxml.root)
end
.define_head1(docxml, dir) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/html2doc/base.rb', line 109
# Prepends the Word view settings, the content-type meta tag and the
# file-list link to every head element of the document.
#
# docxml - parsed document, modified in place.
# dir    - directory name used in the href of the File-List link.
#
# Returns the result of iterating over the matched head elements.
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    # prepend_child inserts before the first child and also works when the
    # head is empty, where children.first would be nil.
    h.prepend_child <<~XML
      <!--[if gte mso 9]>
      <xml>
      <w:WordDocument>
      <w:View>Print</w:View>
      <w:Zoom>100</w:Zoom>
      <w:DoNotOptimizeForBrowser/>
      </w:WordDocument>
      </xml>
      <![endif]-->
      <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
  end
end
.filename_substitute(stylesheet, header_filename, filename) ⇒ Object
127 128 129 130 131 132 133 134 |
# File 'lib/html2doc/base.rb', line 127
# Resolves the FILENAME placeholder in a stylesheet.
#
# stylesheet      - String of CSS; it is modified in place.
# header_filename - header file path, or nil when there is no header.
# filename        - text substituted for each FILENAME occurrence.
#
# With no header, every line mentioning FILENAME is removed (a line counts
# only when it sits between two newlines); otherwise each occurrence of
# FILENAME is replaced by +filename+.
#
# Returns the same, modified +stylesheet+ String.
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # The lookahead leaves the terminating newline in place so that two
    # consecutive FILENAME lines are both matched.
    stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*(?=\n)/, "")
  else
    # Block form inserts the name verbatim; a replacement string would
    # interpret backslash sequences such as \1.
    stylesheet.gsub!("FILENAME") { filename }
  end
  stylesheet
end
.generate_filelist(filename, dir) ⇒ Object
168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/html2doc/base.rb', line 168
# Writes +dir+/filelist.xml, listing the main .htm file and every
# non-hidden entry of +dir+.
#
# filename - base name of the main document (without the .htm extension).
# dir      - directory that is listed and that receives filelist.xml.
#
# The file is opened before the directory is read, so filelist.xml lists
# itself. Entries are written in sorted order and attribute values are
# XML-escaped.
#
# Returns the result of the final write.
def self.generate_filelist(filename, dir)
  # encode(xml: :attr) escapes the value and wraps it in double quotes.
  main_href = "../#{filename}.htm".encode(xml: :attr)
  File.open(File.join(dir, "filelist.xml"), "w") do |f|
    f.write(<<~"XML")
      <xml xmlns:o="urn:schemas-microsoft-com:office:office">
       <o:MainFile HRef=#{main_href}/>
    XML
    # Skips ".", ".." and any other dot-prefixed entry.
    Dir.entries(dir).reject { |item| item.start_with?(".") }.sort.each do |item|
      f.write %{ <o:File HRef=#{item.encode(xml: :attr)}/>\n}
    end
    f.write("</xml>\n")
  end
end
.image_cleanup(docxml, dir) ⇒ Object
96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/html2doc/base.rb', line 96
# Copies every referenced image into +dir+ under a fresh UUID-based name,
# points the img element at the copy, and sets its width and height from
# image_resize.
#
# docxml - parsed document, modified in place.
# dir    - directory that receives the image copies.
#
# img elements without a src attribute are left untouched.
#
# Returns +docxml+.
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |i|
    src = i["src"]
    next if src.nil? || src.empty?

    uuid = UUIDTools::UUID.random_create.to_s
    # File.extname looks only at the last component of the path, so dots in
    # directory names or earlier in the file name do not leak into the suffix.
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(src)}")
    # presupposes that the image source is local
    # Argument-list form bypasses the shell, so spaces and metacharacters in
    # the path are passed to cp literally.
    system("cp", src, new_full_filename)
    i["width"], i["height"] = image_resize(src)
    i["src"] = new_full_filename
  end
  docxml
end
.image_resize(orig_filename) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/html2doc/base.rb', line 82
# Computes display dimensions for an image, scaled down proportionally to
# fit within 400 x 680.
#
# orig_filename - path of the image file, read through ImageSize.
#
# Scaled values are rounded up, so a non-zero dimension never becomes 0.
#
# Returns a two-element Array [width, height].
def self.image_resize(orig_filename)
  width, height = ImageSize.path(orig_filename).size
  # max width for Word document is 400, max height is 680
  if width > 400
    # Rational keeps the division exact; plain Integer division would
    # truncate before ceil had any effect.
    height = Rational(height * 400, width).ceil
    width = 400
  end
  if height > 680
    width = Rational(width * 680, height).ceil
    height = 680
  end
  [width, height]
end
.mathml_to_ooml(docxml) ⇒ Object
49 50 51 52 53 54 55 56 57 |
# File 'lib/html2doc/base.rb', line 49
# Replaces each math element of the document with the output of the @xslt
# transformer.
#
# docxml - parsed document, modified in place.
#
# NOTE(review): @xslt is initialised outside this listing; it is presumably
# the MathML-to-Office-math stylesheet — confirm.
#
# Returns the result of iterating over the matched math elements.
def self.mathml_to_ooml(docxml)
  docxml.xpath("//*[local-name() = 'math']").each do |math|
    # The transformer input must carry the MathML namespace.
    namespaced = math.to_s.gsub(/<math>/,
                                "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
    @xslt.xml = namespaced
    transformed = @xslt.serve
    # Strip processing instructions (the XML declaration) ...
    without_prolog = transformed.gsub(/<\?[^>]+>\s*/, "")
    # ... and prefixed namespace declarations before swapping the node in.
    math.swap(without_prolog.gsub(/ xmlns:[^=]+="[^"]+"/, ""))
  end
end
.mime_attachment(boundary, filename, item, dir) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/html2doc/mime.rb', line 19
# Builds one MIME part holding a base64-encoded attachment.
#
# boundary - multipart boundary string (without the leading "--").
# filename - base name of the main document, used in Content-Location.
# item     - name of the attachment file inside +dir+.
# dir      - directory holding the attachment.
#
# Returns the part as a String: boundary line, headers, a blank line, then
# the payload wrapped at 76 characters per line.
def self.mime_attachment(boundary, filename, item, dir)
  # Attachments may be binary (images), so read the raw bytes.
  encoded_file = Base64.strict_encode64(
    File.binread(File.join(dir, item)),
  ).gsub(/(.{76})/, "\\1\n")
  # The blank line after the headers separates them from the body.
  <<~"FILE"
    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}_files/#{item}
    Content-Transfer-Encoding: base64
    Content-Type: #{mime_type(item)}

    #{encoded_file}

  FILE
end
.mime_boundary ⇒ Object
41 42 43 44 |
# File 'lib/html2doc/mime.rb', line 41
# Generates a multipart boundary string.
#
# Returns "----=_NextPart_" followed by the first 18 characters of a random
# UUID whose dashes have been replaced by dots.
def self.mime_boundary
  dotted_uuid = UUIDTools::UUID.random_create.to_s.gsub(/-/, ".")
  salt = dotted_uuid[0..17]
  "----=_NextPart_#{salt}"
end
.mime_package(result, filename, dir) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/html2doc/mime.rb', line 46
# Assembles the multipart document and writes it to "<filename>.doc".
#
# result   - String of HTML for the main part.
# filename - base path of the output (".doc" is appended).
# dir      - directory whose non-hidden entries become attachments.
#
# Attachments are added in sorted name order so the output does not depend
# on directory iteration order.
#
# Returns the result of the file write.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  preamble = mime_preamble(boundary, filename, result)
  # Skips ".", ".." and any other dot-prefixed entry.
  attachments = Dir.entries(dir).reject { |item| item.start_with?(".") }.sort.
    map { |item| mime_attachment(boundary, filename, item, dir) }
  # The closing delimiter is the boundary followed by "--".
  mhtml = [preamble, *attachments, "--#{boundary}--"].join
  File.open("#{filename}.doc", "w") { |f| f.write mhtml }
end
.mime_preamble(boundary, filename, result) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/html2doc/mime.rb', line 5
# Builds the start of the multipart document: the top-level MIME headers
# followed by the first part, which holds the HTML itself.
#
# boundary - multipart boundary string (without the leading "--").
# filename - base name of the main document, used in Content-Location.
# result   - String of HTML placed in the first part.
#
# Returns the preamble as a String.
def self.mime_preamble(boundary, filename, result)
  # Each header block is terminated by a blank line; without it the
  # following text would be read as further headers.
  <<~"PREAMBLE"
    MIME-Version: 1.0
    Content-Type: multipart/related; boundary="#{boundary}"

    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}.htm
    Content-Type: text/html; charset="utf-8"

    #{result}

  PREAMBLE
end
.mime_type(item) ⇒ Object
34 35 36 37 38 39 |
# File 'lib/html2doc/mime.rb', line 34
# Determines the Content-Type header value for an attachment.
#
# item - file name whose type is looked up through MIME::Types.
#
# Returns the first registered type for the name; text types get a
# '; charset="utf-8"' parameter. Names with no registered type fall back to
# 'text/plain; charset="utf-8"'.
def self.mime_type(item)
  # An unknown name yields no first entry (nil.to_s is ""), which must not
  # be emitted as an empty Content-Type.
  type = MIME::Types.type_for(item).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  # Parameters are separated from the media type by a semicolon, as in the
  # text/html header written by mime_preamble.
  type.start_with?("text") ? %(#{type}; charset="utf-8") : type
end
.msonormal(docxml) ⇒ Object
182 183 184 185 186 187 188 189 |
# File 'lib/html2doc/base.rb', line 182
# Gives every p and li element that has no class attribute the class
# "MsoNormal".
#
# docxml - parsed document, modified in place.
#
# Returns the result of iterating over the matched li elements.
def self.msonormal(docxml)
  apply_default_class = lambda do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]").each do |node|
      node["class"] = "MsoNormal"
    end
  end
  apply_default_class.call("p")
  apply_default_class.call("li")
end
.msword_fix(r) ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/html2doc/base.rb', line 69
# Rewrites serialised markup into the forms the code expects Word to accept.
#
# r - String of serialised markup; it is modified in place.
#
# Self-closed footnote spans and comment-reference anchors get explicit end
# tags, the File-List and Content-Type attributes lose their quotes, and tab
# placeholders become mso-tab-count spans.
#
# Returns the same, modified String.
def self.msword_fix(r)
  # brain damage in MSWord parser
  r.gsub!(%r{<span style="mso-special-character:footnote"/>},
          '<span style="mso-special-character:footnote"></span>')
  r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
  r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
  r.gsub!(%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type")
  # The placeholder may arrive raw or with its ampersand escaped by the
  # serialiser, so both spellings are matched.
  # NOTE(review): the span content is written as a numeric non-breaking-space
  # reference plus a space; confirm against the repository source.
  r.gsub!(%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>')
  r
end
.namespace(root) ⇒ Object
159 160 161 162 163 164 165 166 |
# File 'lib/html2doc/base.rb', line 159
# Declares the o, w and m namespace prefixes on the root element and sets
# its default namespace.
#
# root - root element of the document, modified in place.
#
# Returns the result of the final add_namespace call.
def self.namespace(root)
  prefixes = {
    "o" => "urn:schemas-microsoft-com:office:office",
    "w" => "urn:schemas-microsoft-com:office:word",
    "m" => "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix, uri)
  end
  # A nil prefix sets the default namespace.
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end
.process(result, filename, stylesheet, header_file, dir, asciimathdelims = nil) ⇒ Object
11 12 13 14 15 16 17 18 19 20 |
# File 'lib/html2doc/base.rb', line 11
# Converts markup into a Word-readable "<filename>.doc" file.
#
# result          - String of input markup.
# filename        - base path of the output files (no extension).
# stylesheet      - path of the stylesheet to embed (see process_html).
# header_file     - path of a header file to copy into +dir+, or nil.
# dir             - directory holding the document's auxiliary files.
# asciimathdelims - optional two-element Array of AsciiMath delimiters.
#
# Steps: transform the markup, copy the header, write the file list, write
# the intermediate .htm, package everything, then remove temporary files.
#
# Returns the result of rm_temp_files.
def self.process(result, filename, stylesheet, header_file, dir,
                 asciimathdelims = nil)
  result = process_html(result, filename, stylesheet, header_file, dir,
                        asciimathdelims)
  unless header_file.nil?
    # Argument-list form bypasses the shell, so spaces and metacharacters in
    # either path are passed to cp literally.
    system("cp", header_file, File.join(dir, "header.html"))
  end
  generate_filelist(filename, dir)
  File.open("#{filename}.htm", "w") { |f| f.write(result) }
  mime_package result, filename, dir
  rm_temp_files(filename, dir)
end
.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
22 23 24 25 26 |
# File 'lib/html2doc/base.rb', line 22
# Transforms input markup into the markup that goes into the Word document.
#
# result          - String of input markup.
# filename        - base name passed on to define_head.
# stylesheet      - stylesheet path passed on to define_head.
# header_file     - header file path (or nil) passed on to define_head.
# dir             - directory passed on to cleanup and define_head.
# asciimathdelims - AsciiMath delimiters, or nil.
#
# Returns the serialised document after msword_fix has been applied.
def self.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims)
  markup = asciimath_to_mathml(result, asciimathdelims)
  docxml = Nokogiri::XML(markup)
  cleaned = cleanup(docxml, dir)
  define_head(cleaned, dir, filename, stylesheet, header_file)
  msword_fix(docxml.to_xml)
end
.rm_temp_files(filename, dir) ⇒ Object
28 29 30 31 |
# File 'lib/html2doc/base.rb', line 28
# Removes the intermediate "<filename>.htm" file and the
# "<filename>_files" directory.
#
# filename - base path the temporary names are derived from.
# dir      - not used; the directory removed is derived from +filename+.
#
# Returns the result of the second rm invocation.
def self.rm_temp_files(filename, dir)
  # Argument-list form bypasses the shell; "--" stops option parsing so a
  # name beginning with "-" is treated as a path.
  system("rm", "--", "#{filename}.htm")
  system("rm", "-r", "--", "#{filename}_files")
end
.stylesheet(filename, header_filename, fn) ⇒ Object
136 137 138 139 140 141 142 143 144 |
# File 'lib/html2doc/base.rb', line 136
# Builds the style element that embeds the stylesheet.
#
# filename        - base name substituted for the FILENAME placeholder.
# header_filename - header file path, or nil (see filename_substitute).
# fn              - stylesheet path; nil or empty selects wordstyle.css
#                   next to this source file.
#
# Returns the serialised style element, with the CSS wrapped in an XML
# comment.
def self.stylesheet(filename, header_filename, fn)
  if fn.nil? || fn.empty?
    fn = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  raw_css = File.read(fn, encoding: "UTF-8")
  css = filename_substitute(raw_css, header_filename, filename)
  xml = Nokogiri::XML("<style/>")
  comment = Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.children.first << comment
  xml.root.to_s
end
.xhtml(result) ⇒ Object
preserve HTML escapes
60 61 62 63 64 65 66 67 |
# File 'lib/html2doc/base.rb', line 60
# Ensures the markup starts with an XHTML doctype.
#
# result - String of markup; it is not modified.
#
# Returns +result+ itself when it already contains "<!DOCTYPE html";
# otherwise a new String with any '<?xml version="1.0"?>' declaration
# removed and the XHTML 1.0 Strict doctype prepended.
def self.xhtml(result)
  return result if /<!DOCTYPE html/.match? result

  # Non-mutating gsub keeps the caller's string intact and accepts frozen
  # input; the dot in the version number is matched literally.
  body = result.gsub(/<\?xml version="1\.0"\?>/, "")
  "<!DOCTYPE html SYSTEM " \
    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" + body
end