Module: Html2Doc
- Defined in:
- lib/html2doc/base.rb,
lib/html2doc/mime.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb
Constant Summary collapse
- NOKOHEAD =
<<~HERE.freeze <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title></title> <meta charset="UTF-8" /> </head> <body> </body> </html> HERE
- DOCTYPE =
<<~"DOCTYPE".freeze <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> DOCTYPE
- PRINT_VIEW =
<<~XML.freeze <!--[if gte mso 9]> <xml> <w:WordDocument> <w:View>Print</w:View> <w:Zoom>100</w:Zoom> <w:DoNotOptimizeForBrowser/> </w:WordDocument> </xml> <![endif]--> <meta http-equiv=Content-Type content="text/html; charset=utf-8"/> XML
- VERSION =
"0.6.2".freeze
Class Method Summary collapse
- .add_stylesheet(head, title, css) ⇒ Object
- .asciimath_to_mathml(doc, delims) ⇒ Object
- .cleanup(docxml, dir) ⇒ Object
- .create_dir(filename, dir) ⇒ Object
- .define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
- .define_head1(docxml, dir) ⇒ Object
- .filename_substitute(stylesheet, header_filename, filename) ⇒ Object
- .footnote?(a) ⇒ Boolean
-
.footnote_cleanup(docxml) ⇒ Object
We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p).
- .footnote_container(i) ⇒ Object
- .footnote_div_to_p(f) ⇒ Object
- .footnotes(docxml) ⇒ Object
- .from_xhtml(xml) ⇒ Object
- .generate_filelist(filename, dir) ⇒ Object
- .image_cleanup(docxml, dir) ⇒ Object
- .image_resize(i, maxheight, maxwidth) ⇒ Object
- .mathml_to_ooml(docxml) ⇒ Object
- .mime_attachment(boundary, filename, item, dir) ⇒ Object
- .mime_boundary ⇒ Object
- .mime_package(result, filename, dir) ⇒ Object
- .mime_preamble(boundary, filename, result) ⇒ Object
- .mime_type(item) ⇒ Object
- .msonormal(docxml) ⇒ Object
- .msword_fix(r) ⇒ Object
- .namespace(root) ⇒ Object
- .process(result, filename, stylesheet, header_file, dir = nil, asciimathdelims = nil) ⇒ Object
- .process_footnote_link(docxml, a, i, fn) ⇒ Object
- .process_footnote_texts(docxml, footnotes) ⇒ Object
- .process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
- .rm_temp_files(filename, dir, dir1) ⇒ Object
- .set_footnote_link_attrs(a, i) ⇒ Object
- .stylesheet(filename, header_filename, fn) ⇒ Object
- .to_xhtml(xml) ⇒ Object
- .transform_footnote_text(note) ⇒ Object
Class Method Details
.add_stylesheet(head, title, css) ⇒ Object
188 189 190 191 192 193 194 195 196 |
# File 'lib/html2doc/base.rb', line 188
# Places the stylesheet +css+ inside +head+.
#
# @param head  [#children, #add_child] the document's head element
# @param title [#add_next_sibling, nil] the title element, if any
# @param css   [Object] stylesheet markup/node to insert
# @return [Object] whatever the insertion call on the node returns
def self.add_stylesheet(head, title, css)
  # Empty head: the stylesheet simply becomes its only child.
  return head.add_child(css) if head.children.empty?
  # A title exists: the stylesheet goes right after it.
  return title.add_next_sibling(css) unless title.nil?

  # No title: the stylesheet goes in front of everything else.
  head.children.first.add_previous_sibling(css)
end
.asciimath_to_mathml(doc, delims) ⇒ Object
52 53 54 55 56 57 58 59 |
# File 'lib/html2doc/base.rb', line 52
# Replaces AsciiMath spans in +doc+ with MathML.
#
# The text is split on the opening/closing delimiters and consumed in groups
# of four: leading text, opening delimiter, math source, closing delimiter.
#
# @param doc    [String] text possibly containing delimited AsciiMath
# @param delims [Array<String>, nil] opening and closing delimiter; both are
#   interpolated into a regular expression unescaped, so regex metacharacters
#   in them are interpreted as such
# @return [String] +doc+ itself when fewer than two delimiters are given,
#   otherwise a new string with each math span converted
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  doc.split(/(#{delims[0]}|#{delims[1]})/).each_slice(4).map do |text, _open, math, _close|
    # A trailing chunk, or an opening delimiter with nothing after it, has no
    # math part; emit the plain text instead of failing on nil.
    next text if math.nil?

    mathml = AsciiMath.parse(math).to_mathml.
      gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
    text + mathml
  end.join
end
.cleanup(docxml, dir) ⇒ Object
44 45 46 47 48 49 50 |
# File 'lib/html2doc/base.rb', line 44
# Runs the in-place clean-up passes over the parsed document, in order:
# images, MathML, footnotes, then default paragraph classes.
#
# @param docxml [Object] parsed document, modified by the passes
# @param dir    [String] directory handed to the image pass
# @return [Object] the same +docxml+ object
def self.cleanup(docxml, dir)
  docxml.tap do |xml|
    image_cleanup(xml, dir)
    mathml_to_ooml(xml)
    footnotes(xml)
    msonormal(xml)
  end
end
.create_dir(filename, dir) ⇒ Object
24 25 26 27 28 29 |
# File 'lib/html2doc/base.rb', line 24
# Returns the directory for auxiliary files, creating "<filename>_files"
# when the caller did not supply one.
#
# @param filename [String] output file stem
# @param dir      [String, nil] caller-supplied directory; returned untouched
# @return [String] the directory to use
def self.create_dir(filename, dir)
  return dir if dir

  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  Dir.mkdir(dir) unless File.exist?(dir)
  dir
end
.define_head(docxml, dir, filename, cssname, header_file) ⇒ Object
179 180 181 182 183 184 185 186 |
# File 'lib/html2doc/base.rb', line 179
# Populates the document head: inserts the stylesheet, prepends the
# print-view/file-list preamble, then declares namespaces on the root.
#
# @param docxml      [Object] parsed document
# @param dir         [String] auxiliary files directory
# @param filename    [String] output file stem
# @param cssname     [String, nil] stylesheet path passed on to stylesheet
# @param header_file [String, nil] header file path passed on to stylesheet
# @return [Object] result of the namespace step
def self.define_head(docxml, dir, filename, cssname, header_file)
  title_node = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head_node = docxml.at("//*[local-name() = 'head']")
  add_stylesheet(head_node, title_node, stylesheet(filename, header_file, cssname))
  define_head1(docxml, dir)
  namespace(docxml.root)
end
.define_head1(docxml, dir) ⇒ Object
151 152 153 154 155 156 157 158 |
# File 'lib/html2doc/base.rb', line 151
# Prepends the PRINT_VIEW block and a File-List link to every head element.
#
# Each head is expected to have at least one child already; the preamble is
# inserted before that first child.
#
# @param docxml [Object] parsed document
# @param dir    [String] directory that holds filelist.xml
# @return [Object] the collection of head elements that was iterated
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |head|
    preamble = <<~XML
      #{PRINT_VIEW}
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
    head.children.first.add_previous_sibling(preamble)
  end
end
.filename_substitute(stylesheet, header_filename, filename) ⇒ Object
160 161 162 163 164 165 166 167 |
# File 'lib/html2doc/base.rb', line 160
# Resolves the FILENAME placeholder in a stylesheet, modifying it in place.
#
# Without a header file, every line mentioning FILENAME is dropped (a line
# at the very start of the string, with no newline before it, is kept).
# With a header file, each FILENAME occurrence becomes +filename+.
#
# @param stylesheet      [String] CSS text; mutated
# @param header_filename [String, nil] header file, or nil when there is none
# @param filename        [String] output file stem
# @return [String] the same +stylesheet+ object
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # The lookahead leaves the closing newline in place so that adjacent
    # FILENAME lines are each matched. (The earlier pattern carried a stray
    # literal "i" and so only matched lines ending in that letter.)
    stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*(?=\n)/, "")
  else
    # Block form inserts the name verbatim; backslashes in it are not
    # treated as back-references.
    stylesheet.gsub!(/FILENAME/) { filename }
  end
  stylesheet
end
.footnote?(a) ⇒ Boolean
67 68 69 70 |
# File 'lib/html2doc/notes.rb', line 67
# Tells whether a link is marked as a footnote reference, i.e. its
# "epub:type" or "class" attribute equals "footnote" ignoring case.
#
# @param a [#[]] element (or any attribute-indexable object)
# @return [Boolean, nil] true on a match; otherwise false or nil depending
#   on whether the "class" attribute is present
def self.footnote?(a)
  epub_match = a["epub:type"]&.casecmp("footnote")&.zero?
  return epub_match if epub_match

  a["class"]&.casecmp("footnote")&.zero?
end
.footnote_cleanup(docxml) ⇒ Object
We expect the footnote text we receive to consist of one or more text containers: p, aside, or div (which we have already converted to p). We do not expect any <a name> anchors or links back to the text; if they are present in the HTML, they must be cleaned out before the HTML is passed to this gem.
84 85 86 87 88 89 90 91 |
# File 'lib/html2doc/notes.rb', line 84
# Moves each footnote's leading anchor inside the footnote text: an <a> that
# is a direct child of a footnote div is detached and re-inserted in front of
# the first child of the element that follows it.
#
# Anchors with no following element (or whose follower has no children) are
# left where they are.
#
# @param docxml [Object] parsed document
# @return [Object] the same +docxml+
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    target = anchor.next_element&.children&.first
    # The anchor is only detached once a destination is known to exist.
    target&.add_previous_sibling(anchor.remove)
  end
  docxml
end
.footnote_container(i) ⇒ Object
36 37 38 39 40 41 42 43 44 |
# File 'lib/html2doc/notes.rb', line 36
# Builds the markup for one Word footnote container: a footnote div holding
# the anchor that carries the footnote-reference mark.
#
# The anchor is now explicitly closed; previously the markup relied on the
# parser to recover from the missing </a>.
#
# @param i [Integer, #to_s] footnote number, used in the ids and targets
# @return [String] the container markup, ending in a newline
def self.footnote_container(i)
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
         class='MsoFootnoteReference'><span
           style='mso-special-character:footnote'></span></span></a></div>
  DIV
end
.footnote_div_to_p(f) ⇒ Object
25 26 27 28 29 30 31 32 33 34 |
# File 'lib/html2doc/notes.rb', line 25
# Normalises a footnote body that is a div or aside. If it already contains
# a p, the wrapper is replaced by its children; otherwise the wrapper itself
# is renamed to p and given the MsoFootnoteText class. Other elements are
# left untouched.
#
# @param f [Object] footnote body element
# @return [Object, nil] nil when +f+ is neither div nor aside
def self.footnote_div_to_p(f)
  return unless %w[div aside].include?(f.name)
  return f.replace(f.children) if f.at(".//p")

  f.name = "p"
  f["class"] = "MsoFootnoteText"
end
.footnotes(docxml) ⇒ Object
4 5 6 7 8 9 10 11 12 |
# File 'lib/html2doc/notes.rb', line 4
# Converts footnote links and their texts to Word footnotes. Every <a> is
# offered to process_footnote_link; the footnote number advances only when a
# link was actually processed. The collected texts are then appended to the
# document.
#
# @param docxml [Object] parsed document
# @return [Object] result of process_footnote_texts
def self.footnotes(docxml)
  collected = []
  number = 1
  docxml.xpath("//a").each do |anchor|
    number += 1 if process_footnote_link(docxml, anchor, number, collected)
  end
  process_footnote_texts(docxml, collected)
end
.from_xhtml(xml) ⇒ Object
92 93 94 95 96 |
# File 'lib/html2doc/base.rb', line 92
# Serialises the document and strips the XHTML trappings: the first XHTML
# namespace declaration, the first DOCTYPE occurrence, and the space before
# every self-closing "/>".
#
# @param xml [#to_xml] parsed document
# @return [String] serialised markup
def self.from_xhtml(xml)
  markup = xml.to_xml
  markup = markup.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  markup = markup.sub(DOCTYPE, "")
  markup.gsub(" />", "/>")
end
.generate_filelist(filename, dir) ⇒ Object
207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/html2doc/base.rb', line 207 def self.generate_filelist(filename, dir) File.open(File.join(dir, "filelist.xml"), "w") do |f| f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office"> <o:MainFile HRef="../#{filename}.htm"/>} Dir.foreach(dir) do |item| next if item == "." || item == ".." || /^\./.match(item) f.write %{ <o:File HRef="#{item}"/>\n} end f.write("</xml>\n") end end |
.image_cleanup(docxml, dir) ⇒ Object
125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/html2doc/base.rb', line 125
# Copies every image referenced by the document into +dir+ under a random
# UUID name, resizes it via image_resize(i, 400, 680), and points the
# element's src at the copy.
#
# The extension now comes from File.extname, so a dot earlier in the path
# (e.g. "./img/a.png") no longer ends up in the new file name, and sources
# without an extension no longer raise. The copy is run without a shell, so
# paths containing spaces or shell metacharacters are copied literally.
# Elements without a src are skipped.
#
# @param docxml [Object] parsed document
# @param dir    [String] destination directory for the copies
# @return [Object] the same +docxml+
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |i|
    src = i["src"].to_s
    next if src.empty?

    uuid = UUIDTools::UUID.random_create.to_s
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(src)}")
    # presupposes that the image source is local
    system("cp", src, new_full_filename)
    # Sizes are computed before src is repointed at the copy.
    i["width"], i["height"] = image_resize(i, 400, 680)
    i["src"] = new_full_filename
  end
  docxml
end
.image_resize(i, maxheight, maxwidth) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/html2doc/base.rb', line 112
# Computes a [width, height] pair for an image, scaled down proportionally
# to fit the given limits.
#
# NOTE(review): the parameter names are at odds with how they are applied.
# The size pair is [width, height]; +maxheight+ is compared against the
# width (index 0) and +maxwidth+ against the height (index 1). That mapping
# is kept as-is so existing positional calls behave the same; confirm the
# intended naming before renaming anything.
#
# Scaled dimensions are rounded up. Previously the division was done on
# integers, so the following ceil had no effect and values were truncated.
#
# @param i         [#[]] image element; "width"/"height" are read, and "src"
#   is read from disk via ImageSize when both are zero or absent
# @param maxheight [Integer] limit applied to the first (width) component
# @param maxwidth  [Integer] limit applied to the second (height) component
# @return [Array<Integer>] the resulting [width, height]
def self.image_resize(i, maxheight, maxwidth)
  size = [i["width"].to_i, i["height"].to_i]
  size = ImageSize.path(i["src"]).size if size[0].zero? && size[1].zero?
  # max height for Word document is 400, max width is 680
  if size[0] > maxheight
    size = [maxheight, (size[1] * maxheight).fdiv(size[0]).ceil]
  end
  if size[1] > maxwidth
    size = [(size[0] * maxwidth).fdiv(size[1]).ceil, maxwidth]
  end
  size
end
.mathml_to_ooml(docxml) ⇒ Object
61 62 63 64 65 66 67 68 69 |
# File 'lib/html2doc/base.rb', line 61
# Replaces each math element with the output of the @xslt transformer.
#
# The element is serialised (a bare <math> tag gets the MathML namespace),
# fed to @xslt, and the result has processing instructions and prefixed
# namespace declarations stripped before it is swapped in.
#
# @param docxml [Object] parsed document
# @return [Object] the collection of math elements that was iterated
def self.mathml_to_ooml(docxml)
  docxml.xpath("//*[local-name() = 'math']").each do |math|
    @xslt.xml = math.to_s.
      gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
    transformed = @xslt.serve
    without_pi = transformed.gsub(/<\?[^>]+>\s*/, "")
    math.swap(without_pi.gsub(/ xmlns:[^=]+="[^"]+"/, ""))
  end
end
.mime_attachment(boundary, filename, item, dir) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/html2doc/mime.rb', line 20
# Builds one MIME part for an auxiliary file: boundary line, headers, a
# blank line, then the file content as Base64 wrapped at 76 characters.
#
# The method name is restored (the listing showed "def self.(...)", which is
# not valid Ruby). The file is read in binary mode so image bytes are never
# subject to newline or encoding translation.
#
# @param boundary [String] MIME boundary, without the leading "--"
# @param filename [String] output file stem, used in Content-Location
# @param item     [String] file name inside +dir+
# @param dir      [String] directory holding the file
# @return [String] the MIME part
def self.mime_attachment(boundary, filename, item, dir)
  encoded_file = Base64.strict_encode64(
    File.binread("#{dir}/#{item}"),
  ).gsub(/(.{76})/, "\\1\n")
  # The blank line after Content-Type separates headers from the body.
  <<~"FILE"
    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}_files/#{item}
    Content-Transfer-Encoding: base64
    Content-Type: #{mime_type(item)}

    #{encoded_file}

  FILE
end
.mime_boundary ⇒ Object
42 43 44 45 |
# File 'lib/html2doc/mime.rb', line 42
# Generates a MIME boundary string: a fixed "----=_NextPart_" prefix followed
# by the first 18 characters of a random UUID with dashes turned into dots.
#
# @return [String] the boundary
def self.mime_boundary
  dotted = UUIDTools::UUID.random_create.to_s.tr("-", ".")
  "----=_NextPart_#{dotted[0, 18]}"
end
.mime_package(result, filename, dir) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/html2doc/mime.rb', line 47
# Assembles the MHTML package and writes it to "<filename>.doc": preamble
# with the main HTML, then filelist.xml, then every other non-hidden file in
# +dir+, then the closing boundary.
#
# The calls to mime_attachment are restored (the listing showed bare
# "mhtml += (...)" with the method name missing). Attachments are added in
# sorted order so the output does not depend on directory enumeration order.
#
# @param result   [String] main HTML content
# @param filename [String] output file stem
# @param dir      [String] auxiliary files directory
# @return [Integer] number of bytes written
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  parts = [mime_preamble(boundary, filename, result),
           mime_attachment(boundary, filename, "filelist.xml", dir)]
  Dir.entries(dir).sort.each do |item|
    # Dot-prefixed names cover ".", ".." and hidden files.
    next if item.start_with?(".") || item == "filelist.xml"

    parts << mime_attachment(boundary, filename, item, dir)
  end
  parts << "--#{boundary}--"
  File.open("#{filename}.doc", "w") { |f| f.write parts.join }
end
.mime_preamble(boundary, filename, result) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/html2doc/mime.rb', line 6
# Builds the opening of the MHTML package: the multipart headers, the first
# boundary, the headers of the main HTML part and the HTML itself.
#
# NOTE(review): the listing this was taken from had its line breaks
# collapsed; the blank lines separating each header group from what follows
# are reconstructed here — confirm against the repository source.
#
# @param boundary [String] MIME boundary, without the leading "--"
# @param filename [String] output file stem, used in Content-Location
# @param result   [String] main HTML content
# @return [String] the preamble, ending in a blank line
def self.mime_preamble(boundary, filename, result)
  [
    "MIME-Version: 1.0",
    %{Content-Type: multipart/related; boundary="#{boundary}"},
    "",
    "--#{boundary}",
    "Content-Location: file:///C:/Doc/#{filename}.htm",
    %{Content-Type: text/html; charset="utf-8"},
    "",
    "#{result}",
    "",
    "",
  ].join("\n")
end
.mime_type(item) ⇒ Object
35 36 37 38 39 40 |
# File 'lib/html2doc/mime.rb', line 35
# Returns the Content-Type header value for a file name.
#
# Unknown file types now fall back to UTF-8 plain text. (The earlier check
# tested the lookup result for truthiness, which an empty list always
# passes, so unknown types produced an empty string.) Text types get a
# charset parameter, separated by the ";" that the header syntax requires.
#
# @param item [String] file name
# @return [String] content type, with charset for text types
def self.mime_type(item)
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  type.start_with?("text") ? %{#{type}; charset="utf-8"} : type
end
.msonormal(docxml) ⇒ Object
219 220 221 222 223 224 225 226 |
# File 'lib/html2doc/base.rb', line 219
# Gives every p and li element that has no class attribute the class
# "MsoNormal". Paragraphs are processed first, then list items.
#
# @param docxml [Object] parsed document
# @return [Object] the collection of li elements that was iterated
def self.msonormal(docxml)
  mark_unclassed = lambda do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]").each do |node|
      node["class"] = "MsoNormal"
    end
  end
  mark_unclassed.call("p")
  mark_unclassed.call("li")
end
.msword_fix(r) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/html2doc/base.rb', line 98
# Rewrites serialised markup, in place, into the forms this code expects
# Word to need: explicit close tags on some empty elements, a self-closed
# footnote list, unquoted attribute values on two head elements, and tab
# placeholders expanded to a Word tab span.
#
# The tab pattern previously listed the same alternative twice. The second
# alternative is now the escaped form "&amp;tab;", which is presumably what
# a literal "&tab;" in text becomes after serialisation.
#
# NOTE(review): the listing showed two blank characters inside the tab span;
# they are written here as a no-break space reference plus a space — confirm
# against the repository source.
#
# @param r [String] serialised document; mutated
# @return [String] the same +r+
def self.msword_fix(r)
  # brain damage in MSWord parser
  r.gsub!(%r{<span style="mso-special-character:footnote"/>},
          '<span style="mso-special-character:footnote"></span>')
  r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
          '<div style="mso-element:footnote-list"/>')
  r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
  r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
  r.gsub!(%r{<meta http-equiv="Content-Type"},
          "<meta http-equiv=Content-Type")
  r.gsub!(%r{&tab;|&amp;tab;},
          '<span style="mso-tab-count:1">&#xA0; </span>')
  r
end
.namespace(root) ⇒ Object
198 199 200 201 202 203 204 205 |
# File 'lib/html2doc/base.rb', line 198
# Declares the Office ("o"), Word ("w") and OMML ("m") namespace prefixes on
# the root element, then adds the HTML 4.0 URI as the default namespace.
#
# @param root [Object] document root element
# @return [Object] result of the final add_namespace call
def self.namespace(root)
  prefixes = {
    "o" => "urn:schemas-microsoft-com:office:office",
    "w" => "urn:schemas-microsoft-com:office:word",
    "m" => "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair { |prefix, uri| root.add_namespace_definition(prefix, uri) }
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end
.process(result, filename, stylesheet, header_file, dir = nil, asciimathdelims = nil) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/html2doc/base.rb', line 12
# Converts HTML into a Word-readable "<filename>.doc" package.
#
# Steps: pick/create the auxiliary directory, transform the HTML, copy the
# header file (if any) into the directory as header.html, write the file
# list and the intermediate .htm, package everything, then remove the
# temporary files.
#
# The header file is copied without going through a shell, so paths with
# spaces or shell metacharacters are handled literally.
#
# @param result          [String] source HTML
# @param filename        [String] output file stem (no extension)
# @param stylesheet      [String, nil] stylesheet path
# @param header_file     [String, nil] header/footer HTML file path
# @param dir             [String, nil] existing auxiliary directory to use
# @param asciimathdelims [Array<String>, nil] AsciiMath delimiters
# @return [Object] result of rm_temp_files
def self.process(result, filename, stylesheet, header_file, dir = nil, asciimathdelims = nil)
  dir1 = create_dir(filename, dir)
  result = process_html(result, filename, stylesheet, header_file, dir1, asciimathdelims)
  system("cp", header_file, "#{dir1}/header.html") unless header_file.nil?
  generate_filelist(filename, dir1)
  File.open("#{filename}.htm", "w") { |f| f.write(result) }
  mime_package result, filename, dir1
  rm_temp_files(filename, dir, dir1)
end
.process_footnote_link(docxml, a, i, fn) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/html2doc/notes.rb', line 46
# Turns one footnote link into a Word footnote reference and collects the
# footnote text it points to.
#
# A link is skipped (false) when it is not a footnote link, when it has no
# usable href, or when no element carries the referenced name/id. A missing
# href used to raise; an empty one could match elements whose id had been
# blanked earlier.
#
# NOTE(review): the href is interpolated into the XPath string, so a target
# containing an apostrophe yields an invalid expression — not handled here.
#
# @param docxml [Object] parsed document
# @param a      [Object] the link element
# @param i      [Integer] footnote number to assign
# @param fn     [Array] accumulator for detached footnote texts; appended to
# @return [Array, false] +fn+ on success, false when the link is skipped
def self.process_footnote_link(docxml, a, i, fn)
  return false unless footnote?(a)

  href = a["href"]&.sub(/\A#/, "")
  return false if href.nil? || href.empty?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(a, i)
  a.children = "<span class='MsoFootnoteReference'>"\
    "<span style='mso-special-character:footnote'/></span>"
  fn << transform_footnote_text(note)
end
.process_footnote_texts(docxml, footnotes) ⇒ Object
14 15 16 17 18 19 20 21 22 23 |
# File 'lib/html2doc/notes.rb', line 14
# Appends a footnote list to the body and moves each collected footnote text
# into its own numbered container (numbering starts at 1), normalising the
# text element afterwards. Finishes with footnote_cleanup.
#
# @param docxml    [Object] parsed document
# @param footnotes [Array] detached footnote text elements, in order
# @return [Object] result of footnote_cleanup
def self.process_footnote_texts(docxml, footnotes)
  list = docxml.at("//body").add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end
.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims) ⇒ Object
31 32 33 34 35 36 37 |
# File 'lib/html2doc/base.rb', line 31
# Transforms source HTML into Word-flavoured markup: AsciiMath conversion,
# parsing, clean-up passes, head set-up, then serialisation with the
# Word-specific fixes applied.
#
# @param result          [String] source HTML
# @param filename        [String] output file stem
# @param stylesheet      [String, nil] stylesheet path
# @param header_file     [String, nil] header file path
# @param dir             [String] auxiliary files directory
# @param asciimathdelims [Array<String>, nil] AsciiMath delimiters
# @return [String] the transformed markup
def self.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims)
  with_mathml = asciimath_to_mathml(result, asciimathdelims)
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, dir)
  define_head(cleaned, dir, filename, stylesheet, header_file)
  # Serialisation uses the parsed document itself, not cleanup's return value.
  msword_fix(from_xhtml(docxml))
end
.rm_temp_files(filename, dir, dir1) ⇒ Object
39 40 41 42 |
# File 'lib/html2doc/base.rb', line 39
# Removes the intermediate "<filename>.htm" and, when the auxiliary
# directory was generated rather than supplied by the caller, that directory
# too.
#
# The commands are run without a shell, so names with spaces or shell
# metacharacters are treated as plain paths.
#
# @param filename [String] output file stem
# @param dir      [String, nil] caller-supplied directory (kept when given)
# @param dir1     [String] directory actually used
# @return [Boolean, nil] status of the last command run, or nil when the
#   directory is kept
def self.rm_temp_files(filename, dir, dir1)
  system("rm", "#{filename}.htm")
  system("rm", "-r", dir1) unless dir
end
.set_footnote_link_attrs(a, i) ⇒ Object
72 73 74 75 76 77 |
# File 'lib/html2doc/notes.rb', line 72
# Sets the attributes that mark a link as Word footnote reference number
# +i+: style, href, name, and an empty title, in that order.
#
# @param a [#[]=] the link element
# @param i [Integer, #to_s] footnote number
# @return [String] the empty title string that was assigned last
def self.set_footnote_link_attrs(a, i)
  note_id = "ftn#{i}"
  [
    ["style", "mso-footnote-id:#{note_id}"],
    ["href", "#_#{note_id}"],
    ["name", "_ftnref#{i}"],
  ].each { |attribute, value| a[attribute] = value }
  a["title"] = ""
end
.stylesheet(filename, header_filename, fn) ⇒ Object
169 170 171 172 173 174 175 176 177 |
# File 'lib/html2doc/base.rb', line 169
# Produces the <style> element markup for the document: the stylesheet file
# is read as UTF-8, its FILENAME placeholders are resolved, and the text is
# wrapped in a comment inside a style element.
#
# @param filename        [String] output file stem
# @param header_filename [String, nil] header file path
# @param fn              [String, nil] stylesheet path; when nil or empty,
#   "wordstyle.css" next to this source file is used
# @return [String] serialised style element
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = filename_substitute(File.read(fn, encoding: "UTF-8"), header_filename, filename)
  style_doc = Nokogiri::XML("<style/>")
  style_doc.children.first << Nokogiri::XML::Comment.new(style_doc, "\n#{css}\n")
  style_doc.root.to_s
end
.to_xhtml(xml) ⇒ Object
79 80 81 82 83 84 85 86 |
# File 'lib/html2doc/base.rb', line 79
# Parses markup as XML after dropping any XML declaration and making sure a
# DOCTYPE is present (the XHTML 1.0 Strict one is prepended when none is).
#
# The input string is left untouched; the declaration is stripped from a
# copy, so frozen strings and strings still in use by the caller are safe.
#
# @param xml [String] source markup
# @return [Object] the parsed document
def self.to_xhtml(xml)
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match? xml
    xml = '<!DOCTYPE html SYSTEM
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'.gsub(/\s+/, " ") + xml
  end
  Nokogiri::XML.parse(xml)
end
.transform_footnote_text(note) ⇒ Object
57 58 59 60 61 62 63 64 65 |
# File 'lib/html2doc/notes.rb', line 57
# Prepares a footnote text element for Word and detaches it from the
# document: its id is blanked, nested divs are replaced by their children,
# and every nested aside/p becomes a p with class MsoFootnoteText.
#
# @param note [Object] the footnote text element
# @return [Object] result of detaching +note+
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |wrapper|
    wrapper.replace(wrapper.children)
  end
  paragraphs = note.xpath(".//aside | .//p")
  paragraphs.each do |para|
    para.name = "p"
    para["class"] = "MsoFootnoteText"
  end
  note.remove
end