Module: Html2Doc
- Defined in:
- lib/html2doc/base.rb,
lib/html2doc/mime.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb
Constant Summary collapse
- VERSION =
"0.6.0".freeze
Class Method Summary collapse
- .asciimath_to_mathml(doc, delims) ⇒ Object
- .cleanup(docxml, dir) ⇒ Object
- .define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
- .define_head1(docxml, dir) ⇒ Object
- .filename_substitute(stylesheet, header_filename, filename) ⇒ Object
-
.footnote_cleanup(docxml) ⇒ Object
We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p).
- .footnote_container(i) ⇒ Object
- .footnote_div_to_p(f) ⇒ Object
- .footnotes(docxml) ⇒ Object
- .generate_filelist(filename, dir) ⇒ Object
- .image_cleanup(docxml, dir) ⇒ Object
- .image_resize(i) ⇒ Object
- .is_footnote(a) ⇒ Object
- .mathml_to_ooml(docxml) ⇒ Object
- .mime_attachment(boundary, filename, item, dir) ⇒ Object
- .mime_boundary ⇒ Object
- .mime_package(result, filename, dir) ⇒ Object
- .mime_preamble(boundary, filename, result) ⇒ Object
- .mime_type(item) ⇒ Object
- .msonormal(docxml) ⇒ Object
- .msword_fix(r) ⇒ Object
- .namespace(root) ⇒ Object
- .process(result, filename, stylesheet, header_file, dir, asciimathdelims = nil) ⇒ Object
- .process_footnote_link(docxml, a, i, fn) ⇒ Object
- .process_footnote_texts(docxml, footnotes) ⇒ Object
- .process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
- .rm_temp_files(filename, dir) ⇒ Object
- .set_footnote_link_attrs(a, i) ⇒ Object
- .stylesheet(filename, header_filename, fn) ⇒ Object
- .transform_footnote_text(note) ⇒ Object
-
.xhtml(result) ⇒ Object
preserve HTML escapes.
Class Method Details
.asciimath_to_mathml(doc, delims) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/html2doc/base.rb', line 41
# Replaces delimited AsciiMath spans in +doc+ with MathML.
#
# @param doc [String] markup that may contain delimited AsciiMath
# @param delims [Array, nil] opening and closing delimiter; +doc+ is returned
#   untouched when fewer than two are given
# @return [String] +doc+ with each delimited span rendered as MathML
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Regexp.union escapes plain strings, so delimiters such as "$$" or "(("
  # are matched literally instead of being read as regex syntax.
  delimiter = Regexp.union(*delims.first(2))
  # The capture group keeps the delimiters in the split output, so pieces
  # arrive in groups of four: text, opener, formula, closer.
  doc.split(/(#{delimiter})/).each_slice(4).map do |pieces|
    text, _opener, formula = pieces
    # No formula: trailing plain text, or a dangling delimiter kept verbatim.
    next pieces.join if formula.nil?

    mathml = AsciiMath.parse(formula).to_mathml
    text + mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
  end.join
end
.cleanup(docxml, dir) ⇒ Object
33 34 35 36 37 38 39 |
# File 'lib/html2doc/base.rb', line 33
# Runs the clean-up passes over the parsed document in order: images,
# MathML, footnotes, then default paragraph classes.
#
# @param docxml the parsed document, modified in place
# @param dir directory passed on to image_cleanup
# @return the same +docxml+ object
def self.cleanup(docxml, dir)
  docxml.tap do |doc|
    image_cleanup(doc, dir)
    mathml_to_ooml(doc)
    footnotes(doc)
    msonormal(doc)
  end
end
.define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/html2doc/base.rb', line 148
# Inserts the stylesheet into the document head, then adds the Word-specific
# head content and the namespace declarations.
#
# @param docxml the parsed document, modified in place
# @param dir directory name passed on to define_head1
# @param filename passed on to stylesheet
# @param cssname stylesheet path passed on to stylesheet
# @param header_file passed on to stylesheet
def self.define_head(docxml, dir, filename, cssname, header_file)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  css = stylesheet(filename, header_file, cssname)
  if title.nil?
    # prepend_child also copes with a head that has no children, where
    # children.first would be nil.
    head.prepend_child css
  else
    title.add_next_sibling css
  end
  define_head1(docxml, dir)
  namespace(docxml.root)
end
.define_head1(docxml, dir) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/html2doc/base.rb', line 111
# Prepends the Word document settings, the Content-Type meta tag and the
# File-List link to every head element.
#
# @param docxml the parsed document, modified in place
# @param dir directory that holds filelist.xml, used in the link's href
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    # prepend_child also works when the head is empty, where
    # children.first would be nil.
    h.prepend_child <<~XML
      <!--[if gte mso 9]>
      <xml>
      <w:WordDocument>
      <w:View>Print</w:View>
      <w:Zoom>100</w:Zoom>
      <w:DoNotOptimizeForBrowser/>
      </w:WordDocument>
      </xml>
      <![endif]-->
      <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
  end
end
.filename_substitute(stylesheet, header_filename, filename) ⇒ Object
129 130 131 132 133 134 135 136 |
# File 'lib/html2doc/base.rb', line 129
# Resolves the FILENAME placeholder in a stylesheet.
#
# Without a header file, every line mentioning FILENAME is deleted;
# otherwise each FILENAME is replaced by +filename+.
#
# @param stylesheet [String] CSS text, modified in place
# @param header_filename [String, nil] header file, or nil when there is none
# @param filename [String] replacement for the placeholder
# @return [String] the same +stylesheet+ object
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # Line-anchored so that consecutive FILENAME lines are all removed; the
    # previous pattern required a literal "i" at end of line and never matched.
    stylesheet.gsub!(/^[^\n]*FILENAME[^\n]*\n?/, "")
  else
    # Block form: backslashes in the file name are inserted literally rather
    # than being read as back-references.
    stylesheet.gsub!(/FILENAME/) { filename }
  end
  stylesheet
end
.footnote_cleanup(docxml) ⇒ Object
We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem.
85 86 87 88 89 90 91 92 |
# File 'lib/html2doc/notes.rb', line 85
# Moves each footnote container's anchor to the front of the element that
# follows it. Anchors with no following element, or whose following element
# has no children, are left where they are.
#
# @param docxml the parsed document, modified in place
# @return the same +docxml+ object
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    first_child = anchor.next_element&.children&.first
    # The anchor is only detached once a destination is known to exist.
    first_child&.add_previous_sibling(anchor.remove)
  end
  docxml
end
.footnote_container(i) ⇒ Object
37 38 39 40 41 42 43 44 45 |
# File 'lib/html2doc/notes.rb', line 37
# Builds the markup of the container div for footnote number +i+, holding
# the footnote's anchor.
#
# @param i [Integer] footnote number
# @return [String] the container markup
def self.footnote_container(i)
  # The href value is now quoted on both sides and the anchor is explicitly
  # closed, so the fragment is well-formed.
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
         class='MsoFootnoteReference'><span
         style='mso-special-character:footnote'></span></span></a></div>
  DIV
end
.footnote_div_to_p(f) ⇒ Object
26 27 28 29 30 31 32 33 34 35 |
# File 'lib/html2doc/notes.rb', line 26
# Normalises a div or aside footnote node: if it contains a p, the wrapper is
# replaced by its children; otherwise the node itself is renamed to p and
# given the MsoFootnoteText class. Other elements are left alone.
#
# @param f the footnote node
def self.footnote_div_to_p(f)
  return unless %w{div aside}.include?(f.name)

  if f.at(".//p")
    f.replace(f.children)
  else
    f.name = "p"
    f["class"] = "MsoFootnoteText"
  end
end
.footnotes(docxml) ⇒ Object
6 7 8 9 10 11 12 13 |
# File 'lib/html2doc/notes.rb', line 6
# Converts the document's footnote links, numbering them from 1 in document
# order, then hands the collected footnote bodies to process_footnote_texts.
#
# @param docxml the parsed document, modified in place
def self.footnotes(docxml)
  collected = []
  number = 1
  docxml.xpath("//a").each do |link|
    # The number only advances for links that were actually converted.
    number += 1 if process_footnote_link(docxml, link, number, collected)
  end
  process_footnote_texts(docxml, collected)
end
.generate_filelist(filename, dir) ⇒ Object
170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/html2doc/base.rb', line 170
# Writes filelist.xml into +dir+: a MainFile entry for "../<filename>.htm"
# followed by one File entry per directory entry whose name does not start
# with a dot.
#
# @param filename [String] base name of the main document
# @param dir [String] directory to list and to write into
def self.generate_filelist(filename, dir)
  target = File.join(dir, "filelist.xml")
  File.open(target, "w") do |out|
    out.write(<<~"XML")
      <xml xmlns:o="urn:schemas-microsoft-com:office:office">
       <o:MainFile HRef="../#{filename}.htm"/>
    XML
    Dir.foreach(dir) do |entry|
      # The pattern also covers "." and "..".
      next if /^\./.match?(entry)

      out.write %{ <o:File HRef="#{entry}"/>\n}
    end
    out.write("</xml>\n")
  end
end
.image_cleanup(docxml, dir) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/html2doc/base.rb', line 98
# Copies every image referenced by the document into +dir+ under a random
# UUID name, points the img at the copy, and sets its width and height from
# image_resize.
#
# Presupposes that each image source is a local file path.
#
# @param docxml the parsed document, modified in place
# @param dir [String] destination directory for the copies
# @return the same +docxml+ object
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |i|
    src = i["src"]
    next if src.nil? # nothing to copy

    uuid = UUIDTools::UUID.random_create.to_s
    # File.extname returns ".png", or "" when the name has no extension; it
    # looks only at the last path component, unlike a match from the first dot.
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(src)}")
    # Argument-list form: no shell is involved, so spaces or metacharacters
    # in a src taken from the HTML cannot be interpreted as shell syntax.
    system "cp", "--", src, new_full_filename
    i["width"], i["height"] = image_resize(i)
    i["src"] = new_full_filename
  end
  docxml
end
.image_resize(i) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/html2doc/base.rb', line 83
# Computes the dimensions to give an image, scaled down proportionally to fit
# within 400 wide by 680 high.
#
# @param i the img node (anything answering ["width"], ["height"], ["src"])
# @return [Array<Integer>] width and height
def self.image_resize(i)
  size = [i["width"].to_i, i["height"].to_i]
  # A missing attribute becomes 0, and 0 is truthy in Ruby, so test for zero
  # explicitly before falling back to the dimensions read from the file.
  size = ImageSize.path(i["src"]).size if size.any?(&:zero?)
  # max width for Word document is 400, max height is 680
  if size[0] > 400
    size[1] = (size[1] * 400 / size[0]).ceil
    size[0] = 400
  end
  if size[1] > 680
    size[0] = (size[0] * 680 / size[1]).ceil
    size[1] = 680
  end
  size
end
.is_footnote(a) ⇒ Object
68 69 70 71 |
# File 'lib/html2doc/notes.rb', line 68
# Tells whether a link is marked as a footnote, i.e. its epub:type or class
# attribute equals "footnote" (compared case-insensitively).
#
# @param a the link node (anything answering [] with attribute names)
# @return [Boolean]
def self.is_footnote(a)
  %w[epub:type class].any? do |attribute|
    a[attribute]&.casecmp("footnote") == 0
  end
end
.mathml_to_ooml(docxml) ⇒ Object
50 51 52 53 54 55 56 57 58 |
# File 'lib/html2doc/base.rb', line 50
# Replaces every math element with the output of the module's XSLT
# transformer (@xslt), after stripping processing instructions and prefixed
# namespace declarations from that output.
#
# @param docxml the parsed document, modified in place
def self.mathml_to_ooml(docxml)
  docxml.xpath("//*[local-name() = 'math']").each do |math|
    # A bare <math> tag is given the MathML namespace before transformation.
    @xslt.xml = math.to_s.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
    transformed = @xslt.serve
    without_declarations = transformed.gsub(/<\?[^>]+>\s*/, "")
    math.swap(without_declarations.gsub(/ xmlns:[^=]+="[^"]+"/, ""))
  end
end
.mime_attachment(boundary, filename, item, dir) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/html2doc/mime.rb', line 19
# Builds one MIME part for a file in +dir+: boundary line, headers, and the
# file's content base64-encoded in lines of 76 characters.
#
# @param boundary [String] MIME boundary string
# @param filename [String] base name of the main document, used in the
#   Content-Location path
# @param item [String] name of the file within +dir+
# @param dir [String] directory holding the file
# @return [String] the MIME part
def self.mime_attachment(boundary, filename, item, dir)
  # Binary read: attachments such as images must be encoded byte for byte.
  encoded_file = Base64.strict_encode64(
    File.binread("#{dir}/#{item}"),
  ).gsub(/(.{76})/, "\\1\n")
  <<~"FILE"
    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}_files/#{item}
    Content-Transfer-Encoding: base64
    Content-Type: #{mime_type(item)}

    #{encoded_file}

  FILE
end
.mime_boundary ⇒ Object
41 42 43 44 |
# File 'lib/html2doc/mime.rb', line 41
# Generates a MIME boundary string: "----=_NextPart_" followed by the first
# 18 characters of a random UUID with its hyphens turned into dots.
#
# @return [String]
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  "----=_NextPart_#{uuid.tr('-', '.')[0, 18]}"
end
.mime_package(result, filename, dir) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/html2doc/mime.rb', line 46
# Assembles the MIME document (preamble with the HTML, then one attachment
# per non-dot entry in +dir+, then the closing boundary) and writes it to
# "<filename>.doc".
#
# @param result [String] the HTML of the main document
# @param filename [String] output path without extension
# @param dir [String] directory whose files are attached
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  mhtml = mime_preamble(boundary, filename, result)
  Dir.foreach(dir) do |item|
    next if item == "." || item == ".." || /^\./.match(item)
    mhtml += mime_attachment(boundary, filename, item, dir)
  end
  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w") { |f| f.write mhtml }
end
.mime_preamble(boundary, filename, result) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/html2doc/mime.rb', line 5
# Builds the start of the MIME document: the multipart/related headers
# followed by the first part, which carries the HTML of the main document.
#
# @param boundary [String] MIME boundary string
# @param filename [String] base name used in the Content-Location path
# @param result [String] the HTML of the main document
# @return [String]
def self.mime_preamble(boundary, filename, result)
  [
    "MIME-Version: 1.0",
    %(Content-Type: multipart/related; boundary="#{boundary}"),
    "", # blank line ends the top-level headers
    "--#{boundary}",
    "Content-Location: file:///C:/Doc/#{filename}.htm",
    'Content-Type: text/html; charset="utf-8"',
    "", # blank line ends the part headers
    "#{result}",
    "",
    "",
  ].join("\n")
end
.mime_type(item) ⇒ Object
34 35 36 37 38 39 |
# File 'lib/html2doc/mime.rb', line 34
# Returns the Content-Type value for a file name, as looked up through
# MIME::Types. Text types get a UTF-8 charset parameter; names with no known
# type fall back to UTF-8 plain text.
#
# @param item [String] file name
# @return [String]
def self.mime_type(item)
  # An unknown extension yields no match (not nil), so the fallback must be
  # keyed on the absence of a first match.
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  # Parameters are separated from the media type by a semicolon.
  type.start_with?("text") ? %(#{type}; charset="utf-8") : type
end
.msonormal(docxml) ⇒ Object
184 185 186 187 188 189 190 191 |
# File 'lib/html2doc/base.rb', line 184
# Gives the class MsoNormal to every p and li element that has no class
# attribute.
#
# @param docxml the parsed document, modified in place
def self.msonormal(docxml)
  classless = lambda do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
  end
  classless.call("p").each { |node| node["class"] = "MsoNormal" }
  classless.call("li").each { |node| node["class"] = "MsoNormal" }
end
.msword_fix(r) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/html2doc/base.rb', line 70
# Rewrites serialised markup to work around quirks of the MS Word HTML
# parser: two self-closing tags are expanded, two attribute values are
# unquoted, and tab markers become Word tab spans.
#
# @param r [String] serialised document, modified in place
# @return [String] the same +r+ object
def self.msword_fix(r)
  # Self-closing footnote spans and comment-reference anchors are expanded
  # into explicit open/close pairs.
  r.gsub!(%r{<span style="mso-special-character:footnote"/>},
          '<span style="mso-special-character:footnote"></span>')
  r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
  r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
  r.gsub!(%r{<meta http-equiv="Content-Type"},
          "<meta http-equiv=Content-Type")
  # The tab marker may arrive bare or with its ampersand escaped by the XML
  # serialiser; both forms are replaced.
  r.gsub!(%r{&tab;|&amp;tab;},
          '<span style="mso-tab-count:1">&#xA0; </span>')
  r
end
.namespace(root) ⇒ Object
161 162 163 164 165 166 167 168 |
# File 'lib/html2doc/base.rb', line 161
# Declares the o, w and m namespace prefixes on the root element and sets its
# default namespace to the HTML 4.0 namespace URI.
#
# @param root the document's root element
def self.namespace(root)
  prefixes = {
    "o" => "urn:schemas-microsoft-com:office:office",
    "w" => "urn:schemas-microsoft-com:office:word",
    "m" => "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix, uri)
  end
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end
.process(result, filename, stylesheet, header_file, dir, asciimathdelims = nil) ⇒ Object
11 12 13 14 15 16 17 18 19 20 |
# File 'lib/html2doc/base.rb', line 11
# Entry point: converts the HTML, copies the header file (if any) into
# +dir+, writes the file list and "<filename>.htm", packages everything into
# "<filename>.doc", and removes the temporary files.
#
# @param result [String] source HTML
# @param filename [String] output path without extension
# @param stylesheet stylesheet path passed on to process_html
# @param header_file [String, nil] header file to copy, or nil for none
# @param dir [String] working directory for the document's auxiliary files
# @param asciimathdelims [Array, nil] AsciiMath delimiters, if any
def self.process(result, filename, stylesheet, header_file, dir,
                 asciimathdelims = nil)
  result = process_html(result, filename, stylesheet, header_file,
                        dir, asciimathdelims)
  # Argument-list form of system: no shell is involved, so paths containing
  # spaces or shell metacharacters are copied as given.
  system("cp", "--", header_file.to_s, "#{dir}/header.html") unless header_file.nil?
  generate_filelist(filename, dir)
  File.open("#{filename}.htm", "w") { |f| f.write(result) }
  mime_package result, filename, dir
  rm_temp_files(filename, dir)
end
.process_footnote_link(docxml, a, i, fn) ⇒ Object
47 48 49 50 51 52 53 54 55 56 |
# File 'lib/html2doc/notes.rb', line 47
# Converts one link into a Word footnote reference if it is a footnote link
# whose target exists, and collects the target's content in +fn+.
#
# @param docxml the parsed document
# @param a the link node
# @param i [Integer] number to give the footnote
# @param fn [Array] accumulator for footnote bodies, appended to on success
# @return [false, Array] false when the link was not converted, otherwise +fn+
def self.process_footnote_link(docxml, a, i, fn)
  return false unless is_footnote(a)

  # A footnote link without a usable href has no target to look up.
  href = a["href"]&.sub(/\A#/, "")
  return false if href.nil? || href.empty?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(a, i)
  a.children = "<span class='MsoFootnoteReference'>"\
    "<span style='mso-special-character:footnote'/></span>"
  fn << transform_footnote_text(note)
end
.process_footnote_texts(docxml, footnotes) ⇒ Object
15 16 17 18 19 20 21 22 23 24 |
# File 'lib/html2doc/notes.rb', line 15
# Appends a footnote list to the body and moves each collected footnote into
# its own numbered container inside that list.
#
# @param docxml the parsed document, modified in place
# @param footnotes [Array] footnote nodes, in reference order
# @return the value of footnote_cleanup(docxml)
def self.process_footnote_texts(docxml, footnotes)
  # local-name() keeps this lookup working on namespaced documents, matching
  # how head, img and p are located elsewhere in this module.
  body = docxml.at("//*[local-name() = 'body']")
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each_with_index do |f, i|
    fn = list.first.add_child(footnote_container(i + 1))
    f.parent = fn.first
    footnote_div_to_p(f)
  end
  footnote_cleanup(docxml)
end
.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
22 23 24 25 26 |
# File 'lib/html2doc/base.rb', line 22
# Turns source HTML into Word-ready markup: AsciiMath conversion, XML
# parsing, clean-up, head set-up, serialisation, and the Word parser fixes.
#
# @param result [String] source HTML
# @return [String] the converted markup
def self.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims)
  markup = asciimath_to_mathml(result, asciimathdelims)
  docxml = Nokogiri::XML(markup)
  cleaned = cleanup(docxml, dir)
  define_head(cleaned, dir, filename, stylesheet, header_file)
  msword_fix(docxml.to_xml)
end
.rm_temp_files(filename, dir) ⇒ Object
28 29 30 31 |
# File 'lib/html2doc/base.rb', line 28
# Deletes the intermediate "<filename>.htm" file and the "<filename>_files"
# directory.
#
# @param filename [String] output path without extension
# @param dir unused
def self.rm_temp_files(filename, dir)
  # Argument-list form of system: no shell is involved, so names containing
  # spaces or metacharacters are removed as given; "--" ends option parsing.
  system "rm", "--", "#{filename}.htm"
  system "rm", "-r", "--", "#{filename}_files"
end
.set_footnote_link_attrs(a, i) ⇒ Object
73 74 75 76 77 78 |
# File 'lib/html2doc/notes.rb', line 73
# Sets the style, href, name and title attributes that mark a link as the
# reference to footnote number +i+.
#
# @param a the link node (anything answering []=)
# @param i [Integer] footnote number
def self.set_footnote_link_attrs(a, i)
  footnote_id = "ftn#{i}"
  a["style"] = "mso-footnote-id:#{footnote_id}"
  a["href"] = "#_#{footnote_id}"
  a["name"] = "_ftnref#{i}"
  a["title"] = ""
end
.stylesheet(filename, header_filename, fn) ⇒ Object
138 139 140 141 142 143 144 145 146 |
# File 'lib/html2doc/base.rb', line 138
# Reads a stylesheet (defaulting to wordstyle.css next to this file),
# resolves its FILENAME placeholders, and returns it wrapped in an XML
# comment inside a style element.
#
# @param filename passed on to filename_substitute
# @param header_filename passed on to filename_substitute
# @param fn [String, nil] stylesheet path; nil or empty selects the default
# @return [String] the serialised style element
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = filename_substitute(File.read(fn, encoding: "UTF-8"), header_filename, filename)
  doc = Nokogiri::XML("<style/>")
  doc.children.first << Nokogiri::XML::Comment.new(doc, "\n#{css}\n")
  doc.root.to_s
end
.transform_footnote_text(note) ⇒ Object
58 59 60 61 62 63 64 65 66 |
# File 'lib/html2doc/notes.rb', line 58
# Prepares a footnote body for Word: blanks its id, unwraps nested divs,
# turns nested asides and paragraphs into p.MsoFootnoteText, and detaches
# the node from the document.
#
# @param note the footnote node
# @return the result of note.remove
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each { |wrapper| wrapper.replace(wrapper.children) }
  note.xpath(".//aside | .//p").each do |para|
    para.name = "p"
    para["class"] = "MsoFootnoteText"
  end
  note.remove
end
.xhtml(result) ⇒ Object
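Preserve HTML escapes: unless the markup already has an HTML DOCTYPE, strip the XML declaration and prepend an XHTML 1.0 Strict DOCTYPE.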
61 62 63 64 65 66 67 68 |
# File 'lib/html2doc/base.rb', line 61
# Ensures the markup starts with an HTML DOCTYPE. Markup that already has
# one is returned as is; otherwise the XML declaration is stripped and an
# XHTML 1.0 Strict DOCTYPE is prepended.
#
# @param result [String] markup; never modified, so frozen strings are fine
# @return [String]
def self.xhtml(result)
  return result if /<!DOCTYPE html/.match?(result)

  "<!DOCTYPE html SYSTEM " \
    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" +
    result.gsub(/<\?xml version="1\.0"\?>/, "")
end