Module: Html2Doc
- Defined in:
- lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb
Constant Summary collapse
- NOKOHEAD =
<<~HERE.freeze <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title></title> <meta charset="UTF-8" /> </head> <body> </body> </html> HERE
- DOCTYPE =
<<~"DOCTYPE".freeze <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> DOCTYPE
- PRINT_VIEW =
<<~XML.freeze <!--[if gte mso 9]> <xml> <w:WordDocument> <w:View>Print</w:View> <w:Zoom>100</w:Zoom> <w:DoNotOptimizeForBrowser/> </w:WordDocument> </xml> <![endif]--> <meta http-equiv=Content-Type content="text/html; charset=utf-8"/> XML
- IMAGE_PATH =
"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
- TOPLIST =
"[not(ancestor::ul) and not(ancestor::ol)]".freeze
- VERSION =
"0.9.1".freeze
Class Method Summary collapse
- .add_stylesheet(head, title, css) ⇒ Object
- .asciimath_to_mathml(doc, delims) ⇒ Object
- .asciimath_to_mathml1(x) ⇒ Object
- .bookmarks(docxml) ⇒ Object
- .cleanup(docxml, hash) ⇒ Object
- .create_dir(filename, dir) ⇒ Object
- .define_head(docxml, hash) ⇒ Object
- .define_head1(docxml, dir) ⇒ Object
-
.esc_space(xml) ⇒ Object
escape space as 2; we are removing any spaces generated by XML indentation.
- .filename_substitute(stylesheet, header_filename, filename) ⇒ Object
- .footnote?(a) ⇒ Boolean
-
.footnote_cleanup(docxml) ⇒ Object
We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p).
- .footnote_container(i) ⇒ Object
- .footnote_div_to_p(f) ⇒ Object
- .footnotes(docxml) ⇒ Object
- .from_xhtml(xml) ⇒ Object
- .generate_filelist(filename, dir) ⇒ Object
-
.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object
do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>.
- .header_image_cleanup1(a, dir, filename, localdir) ⇒ Object
-
.image_cleanup(docxml, dir, localdir) ⇒ Object
only processes locally stored images.
-
.image_resize(i, path, maxheight, maxwidth) ⇒ Object
max width for Word document is 400, max height is 680.
- .list2para(u) ⇒ Object
- .list_add(xpath, liststyles, listtype, level) ⇒ Object
- .lists(docxml, liststyles) ⇒ Object
- .lists1(docxml, liststyles, k) ⇒ Object
- .lists_unstyled(docxml, liststyles) ⇒ Object
- .mathml_to_ooml(docxml) ⇒ Object
- .mime_attachment(boundary, filename, item, dir) ⇒ Object
- .mime_boundary ⇒ Object
- .mime_package(result, filename, dir) ⇒ Object
- .mime_preamble(boundary, filename, result) ⇒ Object
- .mime_type(item) ⇒ Object
- .mkuuid ⇒ Object
- .msonormal(docxml) ⇒ Object
- .msword_fix(r) ⇒ Object
- .namespace(root) ⇒ Object
-
.ooxml_cleanup(m, docnamespaces) ⇒ Object
random fixes to MathML input that OOXML needs to render properly.
- .process(result, hash) ⇒ Object
- .process_footnote_link(docxml, a, i, fn) ⇒ Object
- .process_footnote_texts(docxml, footnotes) ⇒ Object
- .process_header(headerfile, hash) ⇒ Object
- .process_html(result, hash) ⇒ Object
- .rm_temp_files(filename, dir, dir1) ⇒ Object
- .rootnamespace(root) ⇒ Object
- .set_footnote_link_attrs(a, i) ⇒ Object
- .style_list(li, level, liststyle, listnumber) ⇒ Object
- .stylesheet(filename, header_filename, fn) ⇒ Object
- .to_xhtml(xml) ⇒ Object
- .transform_footnote_text(note) ⇒ Object
-
.uncenter(m, ooxml) ⇒ Object
If the OOXML math element has no siblings, it is centered by default; override this with left/right alignment if an ancestor is so styled.
- .warnsvg(src) ⇒ Object
Class Method Details
.add_stylesheet(head, title, css) ⇒ Object
159 160 161 162 163 164 165 166 167 |
# File 'lib/html2doc/base.rb', line 159
# Inserts the +css+ markup into +head+: as its only child when +head+ has
# no children, ahead of the first child when +title+ is nil, and directly
# after +title+ otherwise.
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return head.children.first.add_previous_sibling(css) if title.nil?

  title.add_next_sibling(css)
end
.asciimath_to_mathml(doc, delims) ⇒ Object
16 17 18 19 20 21 22 23 24 25 |
# File 'lib/html2doc/math.rb', line 16
# Replaces every AsciiMath span in +doc+ (text enclosed by delims[0] and
# delims[1]) with the result of asciimath_to_mathml1; the delimiters
# themselves are dropped.
#
# doc    - String to scan.
# delims - two-element Array [open, close]; nil or fewer than two entries
#          returns +doc+ untouched.
#
# Returns the converted String.
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  total = m.size / 4
  # The split yields [text, open, math, close, text, open, math, close, ...],
  # so each slice of four carries leading text in a[0] and math in a[2].
  m.each_slice(4).map.with_index do |(*a), i|
    i % 500 == 0 && m.size > 1000 && i > 0 and
      warn "MathML #{i} of #{total}"
    # A slice without a[2] is trailing text, or text followed by a dangling
    # delimiter; emit just the text instead of concatenating with nil.
    next a[0] if a[2].nil?

    a[0] + asciimath_to_mathml1(a[2])
  end.join
end
.asciimath_to_mathml1(x) ⇒ Object
11 12 13 14 |
# File 'lib/html2doc/math.rb', line 11
# Decodes HTML entities in +x+, parses the result as AsciiMath, and returns
# the MathML with the MathML namespace declared on each bare <math> tag.
def self.asciimath_to_mathml1(x)
  decoded = HTMLEntities.new.decode(x)
  mathml = AsciiMath.parse(decoded).to_mathml
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
end
.bookmarks(docxml) ⇒ Object
182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/html2doc/base.rb', line 182
# For every element that has an @id but no @name (footnote containers
# excluded), inserts an <a name='...'> anchor as its first child and then
# removes the id attribute. Elements with an empty id and VML shape
# elements are left alone.
def self.bookmarks(docxml)
  skipped = %w(shapetype v:shapetype shape v:shape)
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]").each do |x|
    next if x["id"].empty? || skipped.include?(x.name)

    anchor = "<a name='#{x["id"]}'></a>"
    if x.children.empty?
      x.add_child(anchor)
    else
      x.children.first.previous = anchor
    end
    x.delete("id")
  end
end
.cleanup(docxml, hash) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/html2doc/base.rb', line 46
# Runs the document clean-up passes in order (namespaces, images, maths,
# lists, footnotes, bookmarks, default paragraph class) and returns
# +docxml+.
def self.cleanup(docxml, hash)
  namespace(docxml.root)
  localdir = File.dirname(hash[:filename])
  image_cleanup(docxml, hash[:dir1], localdir)
  mathml_to_ooml(docxml)
  lists(docxml, hash[:liststyles])
  [:footnotes, :bookmarks, :msonormal].each { |pass| send(pass, docxml) }
  docxml
end
.create_dir(filename, dir) ⇒ Object
27 28 29 30 31 32 |
# File 'lib/html2doc/base.rb', line 27
# Returns the directory that will hold the document's auxiliary files.
#
# filename - output path without extension; used to derive the default
#            directory name "<filename>_files".
# dir      - caller-supplied directory; returned as-is when given.
#
# When +dir+ is nil the default directory is created if it does not exist
# yet. Returns the directory path.
def self.create_dir(filename, dir)
  return dir if dir

  dir = "#{filename}_files"
  # File.exists? was deprecated and then removed in Ruby 3.2.
  Dir.mkdir(dir) unless File.exist?(dir)
  dir
end
.define_head(docxml, hash) ⇒ Object
150 151 152 153 154 155 156 157 |
# File 'lib/html2doc/base.rb', line 150
# Adds the stylesheet to the document head, then the print-view/file-list
# preamble, and finally sets the default namespace on the root element.
def self.define_head(docxml, hash)
  head_path = "//*[local-name() = 'head']"
  title = docxml.at("#{head_path}/*[local-name() = 'title']")
  head = docxml.at(head_path)
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end
.define_head1(docxml, dir) ⇒ Object
122 123 124 125 126 127 128 129 |
# Prepends, ahead of the first child of every <head> element, the PRINT_VIEW
# block followed by a File-List <link> whose href is
# "<basename of dir>/filelist.xml".
# NOTE(review): the heredoc's line breaks were lost when this listing was
# extracted; confirm the layout against lib/html2doc/base.rb.
# File 'lib/html2doc/base.rb', line 122 def self.define_head1(docxml, dir) docxml.xpath("//*[local-name() = 'head']").each do |h| h.children.first.add_previous_sibling <<~XML #{PRINT_VIEW} <link rel="File-List" href="#{File.basename(dir)}/filelist.xml"/> XML end end |
.esc_space(xml) ⇒ Object
escape space as 2; we are removing any spaces generated by XML indentation
58 59 60 61 62 63 64 |
# Walks every node of +xml+ and returns +xml+.
# NOTE(review): inside the block, `n = ...gsub(...)` only rebinds the
# block-local variable; nothing is written back to the node, so as written
# the document comes back unchanged. Confirm the intended replacement
# string before turning this into a real substitution.
# File 'lib/html2doc/math.rb', line 58 def self.esc_space(xml) xml.traverse do |n| next unless n.text? n = n.text.gsub(/ /, "2") end xml end |
.filename_substitute(stylesheet, header_filename, filename) ⇒ Object
131 132 133 134 135 136 137 138 |
# File 'lib/html2doc/base.rb', line 131
# Resolves the FILENAME placeholder in a stylesheet, in place.
#
# stylesheet      - mutable String holding the CSS; modified and returned.
# header_filename - header file path, or nil when the document has none.
# filename        - output path; its basename replaces FILENAME.
#
# With no header file, every line mentioning FILENAME is removed; otherwise
# each FILENAME is replaced by File.basename(filename).
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # The previous pattern required the line to end in a literal "i", so
    # lines ending in ";" were never removed, and because it consumed the
    # surrounding newlines, adjacent matching lines were skipped.
    stylesheet.gsub!(/^[^\n]*FILENAME[^\n]*\n?/, "")
  else
    stylesheet.gsub!(/FILENAME/, File.basename(filename))
  end
  stylesheet
end
.footnote?(a) ⇒ Boolean
67 68 69 70 |
# File 'lib/html2doc/notes.rb', line 67
# Tells whether link +a+ is a footnote reference: its "epub:type" or its
# "class" attribute equals "footnote", ignoring case.
#
# Returns true or false (never nil), matching the documented Boolean.
def self.footnote?(a)
  %w[epub:type class].any? do |attr|
    a[attr]&.casecmp("footnote")&.zero?
  end
end
.footnote_cleanup(docxml) ⇒ Object
We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem
84 85 86 87 88 89 90 91 |
# File 'lib/html2doc/notes.rb', line 84
# Moves each footnote anchor (an <a> directly under a footnote <div>) so it
# becomes the first child of the element that follows it, when that element
# has children. Returns +docxml+.
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    first_child = anchor.next_element&.children&.first
    first_child&.add_previous_sibling(anchor.remove)
  end
  docxml
end
.footnote_container(i) ⇒ Object
36 37 38 39 40 41 42 43 44 |
# File 'lib/html2doc/notes.rb', line 36
# Builds the markup for the footnote container numbered +i+: a footnote
# <div> holding the anchor with the footnote-reference spans.
#
# The anchor is now closed explicitly; previously the markup ended
# "</span></span></div>" and left <a> open.
#
# Returns the markup String.
def self.footnote_container(i)
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
         class='MsoFootnoteReference'><span
           style='mso-special-character:footnote'></span></span></a></div>
  DIV
end
.footnote_div_to_p(f) ⇒ Object
25 26 27 28 29 30 31 32 33 34 |
# File 'lib/html2doc/notes.rb', line 25
# Normalises a footnote body that is a <div> or <aside>: if it contains a
# <p> the wrapper is replaced by its children, otherwise the wrapper itself
# is renamed to <p> with class MsoFootnoteText. Other elements are ignored.
def self.footnote_div_to_p(f)
  return unless %w{div aside}.include?(f.name)

  if f.at(".//p")
    f.replace(f.children)
  else
    f.name = "p"
    f["class"] = "MsoFootnoteText"
  end
end
.footnotes(docxml) ⇒ Object
4 5 6 7 8 9 10 11 12 |
# File 'lib/html2doc/notes.rb', line 4
# Processes every <a> as a potential footnote link, numbering the ones that
# qualify from 1, then appends the collected footnote texts to the document.
def self.footnotes(docxml)
  fn = []
  i = 1
  docxml.xpath("//a").each do |a|
    i += 1 if process_footnote_link(docxml, a, i, fn)
  end
  process_footnote_texts(docxml, fn)
end
.from_xhtml(xml) ⇒ Object
78 79 80 81 82 |
# File 'lib/html2doc/base.rb', line 78
# Serialises +xml+ and strips the first XHTML namespace declaration, the
# DOCTYPE line, and the space before every "/>".
def self.from_xhtml(xml)
  serialized = xml.to_xml
  serialized = serialized.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  serialized = serialized.sub(DOCTYPE, "")
  serialized.gsub(%{ />}, "/>")
end
.generate_filelist(filename, dir) ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 |
# Writes "filelist.xml" into +dir+: an <xml> wrapper with an <o:MainFile>
# entry pointing at "../<filename>.htm", then one <o:File> entry per
# directory entry in sorted order, skipping names that start with ".".
# NOTE(review): the line break inside the first %{...} literal was lost
# when this listing was extracted; confirm against lib/html2doc/mime.rb.
# File 'lib/html2doc/mime.rb', line 126 def self.generate_filelist(filename, dir) File.open(File.join(dir, "filelist.xml"), "w") do |f| f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office"> <o:MainFile HRef="../#{filename}.htm"/>} Dir.entries(dir).sort.each do |item| next if item == "." || item == ".." || /^\./.match(item) f.write %{ <o:File HRef="#{item}"/>\n} end f.write("</xml>\n") end end |
.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object
do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>
107 108 109 110 111 |
# File 'lib/html2doc/mime.rb', line 107
# Splits the raw header text around <img> / <v:imagedata> tags and passes
# each [text, tag] pair through header_image_cleanup1, returning the
# re-joined text. The header is handled as a plain string, not parsed.
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  cleaned = pieces.each_slice(2).map do |pair|
    header_image_cleanup1(pair, dir, filename, localdir)
  end
  cleaned.join
end
.header_image_cleanup1(a, dir, filename, localdir) ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/html2doc/mime.rb', line 113
# Rewrites one [text, image_tag] pair from the header.
#
# a         - Array of one or two Strings: leading text and, optionally,
#             the image tag that followed it (mutated in place).
# dir       - directory the image is copied into.
# filename  - document name used to build the "file:///C:/Doc/..._files/" URL.
# localdir  - directory against which the tag's src is resolved.
#
# Tags whose src is an http(s) URL or a base64 data URI are left untouched.
# For any other tag the image is copied into +dir+ under a fresh UUID name
# and the tag's src is pointed at that same name.
#
# Returns the joined String.
def self.header_image_cleanup1(a, dir, filename, localdir)
  if a.size == 2 && !(/ src="https?:/.match a[1]) &&
      !(%r{ src="data:image/[^;]+;base64}.match a[1])
    m = / src=['"](?<src>[^"']+)['"]/.match a[1]
    warnsvg(m[:src])
    m2 = /\.(?<suffix>\S+)$/.match m[:src]
    # Generate the name once: calling mkuuid separately for the URL and for
    # the copy target made the tag reference a file that was never written.
    imgname = "#{mkuuid}.#{m2[:suffix]}"
    new_filename = "file:///C:/Doc/#{filename}_files/#{imgname}"
    FileUtils.cp File.join(localdir, m[:src]), File.join(dir, imgname)
    a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='#{new_filename}'")
  end
  a.join
end
.image_cleanup(docxml, dir, localdir) ⇒ Object
only processes locally stored images
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/html2doc/mime.rb', line 89
# Copies every locally stored image referenced by an img / v:imagedata
# element into +dir+ under a UUID name, sets the element's width and height
# from image_resize, and points src at the copy. Images whose src starts
# with "http" or is a base64 data URI are skipped. Returns +docxml+.
def self.image_cleanup(docxml, dir, localdir)
  image_tags = %w(img v:imagedata)
  #docxml.xpath(IMAGE_PATH).each do |i|
  docxml.traverse do |i|
    next unless i.element? && image_tags.include?(i.name)

    src = i["src"]
    warnsvg(src)
    next if /^http/.match(src) || %r{^data:image/[^;]+;base64}.match(src)

    local_filename = File.join(localdir, src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
    i["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end
.image_resize(i, path, maxheight, maxwidth) ⇒ Object
max width for Word document is 400, max height is 680
67 68 69 70 71 72 73 74 75 76 |
# File 'lib/html2doc/mime.rb', line 67
# Computes the [width, height] to give image element +i+.
# Starts from the element's own width/height attributes, falls back to the
# size read from +path+ when both are missing, derives a missing dimension
# from the real aspect ratio, then scales down to fit +maxheight+ and
# +maxwidth+ in that order.
def self.image_resize(i, path, maxheight, maxwidth)
  real_size = ImageSize.path(path).size
  s = [i["width"].to_i, i["height"].to_i]
  s = real_size if s[0].zero? && s[1].zero?
  if s[1].zero? && !s[0].zero?
    s[1] = s[0] * real_size[1] / real_size[0]
  end
  if s[0].zero? && !s[1].zero?
    s[0] = s[1] * real_size[0] / real_size[1]
  end
  if s[1] > maxheight
    s = [(s[0] * maxheight / s[1]).ceil, maxheight]
  end
  if s[0] > maxwidth
    s = [maxwidth, (s[1] * maxwidth / s[0]).ceil]
  end
  s
end
.list2para(u) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/html2doc/lists.rb', line 39
# Flattens list +u+ into paragraphs: each direct <li> becomes a <p> with a
# MsoListParagraphCxSp{First,Middle,Last} class (existing classes are
# kept), a leading <p> child is unwrapped, and the list element itself is
# replaced by its children. Lists with no direct <li> are left alone.
def self.list2para(u)
  items = u.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  u.xpath("./li/p").each { |para| para["class"] ||= "MsoListParagraphCxSpMiddle" }
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    lead = item&.first_element_child
    lead.replace(lead.children) if lead&.name == "p"
  end
  u.replace(u.children)
end
.list_add(xpath, liststyles, listtype, level) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/html2doc/lists.rb', line 18
# Applies Word list styling to every list in +xpath+ and recurses into
# nested lists one level deeper. Top-level lists (level 1) bump the running
# list number and are flagged with a "seen" attribute. For the generic
# :ul/:ol types, nested lists take the style of their own kind; for any
# other (class-based) type the same type is passed down.
def self.list_add(xpath, liststyles, listtype, level)
  generic = [:ul, :ol].include?(listtype)
  ul_type = generic ? :ul : listtype
  ol_type = generic ? :ol : listtype
  xpath.each do |list|
    if level == 1
      @listnumber += 1
      list["seen"] = true
    end
    own_items = list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")
    own_items.each do |li|
      style_list(li, level, liststyles[listtype], @listnumber)
      list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
               liststyles, ul_type, level + 1)
      list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
               liststyles, ol_type, level + 1)
    end
  end
end
.lists(docxml, liststyles) ⇒ Object
78 79 80 81 82 83 84 85 |
# File 'lib/html2doc/lists.rb', line 78
# Entry point for list processing. Does nothing when +liststyles+ is nil;
# otherwise resets the list counter, styles lists per configured key,
# styles the remaining lists, and flattens <ul>/<ol> into paragraphs for
# whichever of :ul / :ol is configured.
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key do |key|
    lists1(docxml, liststyles, key)
  end
  lists_unstyled(docxml, liststyles)
  liststyles.has_key?(:ul) &&
    docxml.xpath("//ul").each { |list| list2para(list) }
  liststyles.has_key?(:ol) &&
    docxml.xpath("//ol").each { |list| list2para(list) }
end
.lists1(docxml, liststyles, k) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/html2doc/lists.rb', line 55
# Styles the top-level lists selected by style key +k+: class-less <ul> for
# :ul, class-less <ol> for :ol, and for any other key the <ol>/<ul>
# elements whose class equals the key.
def self.lists1(docxml, liststyles, k)
  query =
    if k == :ul || k == :ol
      "//#{k}[not(@class)]#{TOPLIST}"
    else
      "//ol[@class = '#{k.to_s}']#{TOPLIST} | //ul[@class = '#{k.to_s}']#{TOPLIST}"
    end
  list_add(docxml.xpath(query), liststyles, k, 1)
end
.lists_unstyled(docxml, liststyles) ⇒ Object
68 69 70 71 72 73 74 75 76 |
# File 'lib/html2doc/lists.rb', line 68
# Styles the top-level lists that no earlier pass has marked as "seen",
# then removes the temporary "seen" attribute from every list.
#
# docxml     - document to process.
# liststyles - Hash of style keys; only the :ul / :ol entries are used.
#
# Leftover <ol> lists are now styled with the :ol entry; they were
# previously passed :ul even though the guard checks for :ol.
def self.lists_unstyled(docxml, liststyles)
  if liststyles.has_key?(:ul)
    list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"), liststyles, :ul, 1)
  end
  if liststyles.has_key?(:ol)
    list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"), liststyles, :ol, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end
.mathml_to_ooml(docxml) ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/html2doc/math.rb', line 38
# Replaces every MathML <math> element with its OOXML rendering: the
# element is cleaned, transformed through @xsltemplate, stripped of
# processing instructions and namespace declarations, prefixed with "m:",
# and optionally wrapped for alignment. Progress is reported via warn for
# documents with more than 500 formulas.
def self.mathml_to_ooml(docxml)
  docnamespaces = docxml.collect_namespaces
  m = docxml.xpath("//*[local-name() = 'math']")
  m.each_with_index do |x, i|
    if i % 100 == 0 && m.size > 500 && i > 0
      warn "Math OOXML #{i} of #{m.size}"
    end
    element = ooxml_cleanup(x, docnamespaces)
    doc = Nokogiri::XML::Document.new
    doc.root = element
    transformed = esc_space(@xsltemplate.transform(doc)).to_s
    ooxml = transformed.gsub(/<\?[^>]+>\s*/, "")
    ooxml = ooxml.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    ooxml = ooxml.gsub(%r{<(/)?([a-z])}, "<\\1m:\\2")
    x.swap(uncenter(x, ooxml))
  end
end
.mime_attachment(boundary, filename, item, dir) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/html2doc/mime.rb', line 22
# Builds one MIME part for the file +item+ inside +dir+.
#
# boundary - multipart boundary string.
# filename - document path; its basename forms the Content-Location URL.
# item     - file name within +dir+.
# dir      - directory holding the file.
#
# Files whose content type starts with "text" or "application" are read as
# UTF-8 text, everything else as binary; the content is base64-encoded in
# 76-character lines.
#
# The listing showed this definition as "def self.(...)" with the method
# name missing; it is restored from the method summary. The blank line
# between the part headers and the body follows the MIME entity layout.
# NOTE(review): the heredoc's original line breaks were lost in the
# listing; confirm against lib/html2doc/mime.rb.
#
# Returns the part as a String.
def self.mime_attachment(boundary, filename, item, dir)
  content_type = mime_type(item)
  text_mode = %w[text application].any? { |p| content_type.start_with? p }

  path = File.join(dir, item)
  content = text_mode ? File.read(path, encoding: "utf-8") : IO.binread(path)

  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  <<~"FILE"
    --#{boundary}
    Content-Location: file:///C:/Doc/#{File.basename(filename)}_files/#{item}
    Content-Transfer-Encoding: base64
    Content-Type: #{content_type}

    #{encoded_file}

  FILE
end
.mime_boundary ⇒ Object
48 49 50 51 |
# File 'lib/html2doc/mime.rb', line 48
# Returns a multipart boundary: "----=_NextPart_" followed by the first 18
# characters of a random UUID with its dashes turned into dots.
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  salt = uuid.gsub(/-/, ".")[0..17]
  "----=_NextPart_#{salt}"
end
.mime_package(result, filename, dir) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/html2doc/mime.rb', line 53
# Assembles the MIME package and writes it to "<filename>.doc".
#
# result   - HTML body of the document.
# filename - output path without extension.
# dir      - directory of auxiliary files to attach.
#
# The package is the preamble with +result+, then filelist.xml, then every
# other non-hidden file in +dir+, then the closing boundary.
#
# The listing showed the two attachment calls as "mhtml += (boundary, ...)"
# with the callee missing; mime_attachment is restored, matching the
# argument list of that method.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  mhtml = mime_preamble(boundary, filename, result)
  mhtml += mime_attachment(boundary, filename, "filelist.xml", dir)
  Dir.foreach(dir) do |item|
    next if item == "." || item == ".." || /^\./.match(item) ||
      item == "filelist.xml"

    mhtml += mime_attachment(boundary, filename, item, dir)
  end
  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |f| f.write mhtml }
end
.mime_preamble(boundary, filename, result) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 |
# Builds the opening of the MIME package: the multipart/related headers
# carrying +boundary+, then the first part (Content-Location
# "file:///C:/Doc/<basename of filename>.htm", text/html in UTF-8) whose
# body is +result+.
# NOTE(review): the heredoc's blank lines, which separate MIME headers from
# bodies, were lost when this listing was extracted; confirm the layout
# against lib/html2doc/mime.rb before copying this text.
# File 'lib/html2doc/mime.rb', line 8 def self.mime_preamble(boundary, filename, result) <<~"PREAMBLE" MIME-Version: 1.0 Content-Type: multipart/related; boundary="#{boundary}" --#{boundary} Content-Location: file:///C:/Doc/#{File.basename(filename)}.htm Content-Type: text/html; charset="utf-8" #{result} PREAMBLE end |
.mime_type(item) ⇒ Object
41 42 43 44 45 46 |
# File 'lib/html2doc/mime.rb', line 41
# Returns the Content-Type value for file name +item+.
#
# Uses the first type MIME::Types reports for the name. When no type is
# known the result is 'text/plain; charset="utf-8"'; previously the lookup
# result (an Array, truthy even when empty) made that fallback unreachable
# and unknown extensions produced an empty string.
# Known types starting with "text" get ' charset="utf-8"' appended.
# NOTE(review): that suffix is appended without a ";" separator, as before;
# left unchanged here — confirm whether consumers rely on it.
def self.mime_type(item)
  first = Array(MIME::Types.type_for(item)).first
  return 'text/plain; charset="utf-8"' if first.nil?

  type = first.to_s
  type += ' charset="utf-8"' if /^text/.match(type)
  type
end
.mkuuid ⇒ Object
80 81 82 |
# File 'lib/html2doc/mime.rb', line 80
# Returns a freshly generated random UUID as a String.
def self.mkuuid
  uuid = UUIDTools::UUID.random_create
  uuid.to_s
end
.msonormal(docxml) ⇒ Object
195 196 197 198 199 200 201 202 |
# File 'lib/html2doc/base.rb', line 195
# Gives class "MsoNormal" to every <p> and then every <li> that has no
# class attribute.
def self.msonormal(docxml)
  unclassed = "[not(self::*[@class])]"
  docxml.xpath("//*[local-name() = 'p']#{unclassed}").each do |para|
    para["class"] = "MsoNormal"
  end
  docxml.xpath("//*[local-name() = 'li']#{unclassed}").each do |item|
    item["class"] = "MsoNormal"
  end
end
.msword_fix(r) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# Applies a series of textual substitutions to the serialised HTML +r+ (in
# place, via gsub!) so that it matches what the Word parser accepts:
# expanding or collapsing specific empty elements, unquoting two attribute
# values, replacing the tab entity with a tab-count span, and finally
# removing whitespace between tags inside each <m:oMath> element.
# Returns the rebuilt String.
# NOTE(review): the tab pattern lists the same alternative twice
# ("&tab;|&tab;"); this looks like an entity that was decoded when the
# listing was extracted — confirm the pattern and the span's content
# against lib/html2doc/base.rb.
# File 'lib/html2doc/base.rb', line 84 def self.msword_fix(r) # brain damage in MSWord parser r.gsub!(%r{<span style="mso-special-character:footnote"/>}, '<span style="mso-special-character:footnote"></span>') r.gsub!(%r{<div style="mso-element:footnote-list"></div>}, '<div style="mso-element:footnote-list"/>') r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>") r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List") r.gsub!(%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type") r.gsub!(%r{></m:jc>}, "/>") r.gsub!(%r{></v:stroke>}, "/>") r.gsub!(%r{></v:f>}, "/>") r.gsub!(%r{></v:path>}, "/>") r.gsub!(%r{></o:lock>}, "/>") r.gsub!(%r{></v:imagedata>}, "/>") r.gsub!(%r{></w:wrap>}, "/>") r.gsub!(%r{&tab;|&tab;}, '<span style="mso-tab-count:1">  </span>') r = r.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a| a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><") a end.join r end |
.namespace(root) ⇒ Object
169 170 171 172 173 174 175 176 |
# File 'lib/html2doc/base.rb', line 169
# Declares the Office (o), Word (w), VML (v) and OMML (m) namespace
# prefixes on +root+.
def self.namespace(root)
  definitions = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  definitions.each do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end
.ooxml_cleanup(m, docnamespaces) ⇒ Object
random fixes to MathML input that OOXML needs to render properly
28 29 30 31 32 33 34 35 36 |
# File 'lib/html2doc/math.rb', line 28
# Prepares MathML element +m+ for the OOXML transform.
#
# m             - the <math> element (modified in place and returned).
# docnamespaces - namespace map used to resolve the "xmlns:" prefix in the
#                 XPath query.
#
# For every script/limit element (msup, msub, msubsup, munder, mover,
# munderover) the element that follows it is wrapped in <mrow>, unless it
# already is an <mrow>. The check now compares the sibling's name; the
# earlier comparison of the node itself with the String "mrow" was always
# true, so existing <mrow> siblings were wrapped again.
# Finally the MathML default namespace is set on +m+.
def self.ooxml_cleanup(m, docnamespaces)
  scripts = %w(msup msub msubsup munder mover munderover)
  query = scripts.map { |tag| ".//xmlns:#{tag}" }.join(" | ")
  m.xpath(query, docnamespaces).each do |x|
    sibling = x.next_element
    next unless sibling && sibling.name != "mrow"

    sibling.wrap("<mrow/>")
  end
  m.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  m
end
.process(result, hash) ⇒ Object
10 11 12 13 14 15 16 17 18 |
# File 'lib/html2doc/base.rb', line 10
# Top-level conversion: prepares the auxiliary directory, converts the
# HTML, processes the header file, writes the file list and the
# intermediate .htm, packages everything as a MIME .doc, and removes the
# temporary files unless hash[:debug] is set.
def self.process(result, hash)
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  result = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(hash[:filename], hash[:dir1])
  File.open("#{hash[:filename]}.htm", "w:UTF-8") do |htm|
    htm.write(result)
  end
  mime_package(result, hash[:filename], hash[:dir1])
  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1]) unless hash[:debug]
end
.process_footnote_link(docxml, a, i, fn) ⇒ Object
46 47 48 49 50 51 52 53 54 55 |
# File 'lib/html2doc/notes.rb', line 46
# Handles one candidate footnote link +a+. Returns false when +a+ is not a
# footnote link or when its target (matched by @name or @id against the
# href without its leading "#") is not found. Otherwise rewrites the link
# as Word footnote reference number +i+ and appends the transformed
# footnote text to +fn+.
def self.process_footnote_link(docxml, a, i, fn)
  return false unless footnote?(a)

  target = a["href"].gsub(/^#/, "")
  note = docxml.at("//*[@name = '#{target}' or @id = '#{target}']")
  return false if note.nil?

  set_footnote_link_attrs(a, i)
  reference = "<span class='MsoFootnoteReference'>"\
              "<span style='mso-special-character:footnote'/></span>"
  a.children = reference
  fn << transform_footnote_text(note)
end
.process_footnote_texts(docxml, footnotes) ⇒ Object
14 15 16 17 18 19 20 21 22 23 |
# File 'lib/html2doc/notes.rb', line 14
# Appends a footnote-list <div> to <body>, places each collected footnote
# inside its own numbered container (numbering starts at 1), normalises
# div/aside wrappers, and finishes with footnote_cleanup.
def self.process_footnote_texts(docxml, footnotes)
  body = docxml.at("//body")
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end
.process_header(headerfile, hash) ⇒ Object
20 21 22 23 24 25 |
# File 'lib/html2doc/base.rb', line 20
# Reads the header file as UTF-8, relocates its images, and writes the
# result to "<dir1>/header.html". Does nothing when +headerfile+ is nil.
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  raw = File.read(headerfile, encoding: "utf-8")
  localdir = File.dirname(hash[:filename])
  doc = header_image_cleanup(raw, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") do |out|
    out.write(doc)
  end
end
.process_html(result, hash) ⇒ Object
34 35 36 37 38 |
# File 'lib/html2doc/base.rb', line 34
# Converts the input HTML string into the Word-flavoured HTML string:
# AsciiMath conversion, XHTML parse, clean-up, head definition,
# serialisation, and Word-specific textual fixes.
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  serialized = from_xhtml(docxml)
  msword_fix(serialized)
end
.rm_temp_files(filename, dir, dir1) ⇒ Object
40 41 42 43 44 |
# File 'lib/html2doc/base.rb', line 40
# Deletes the intermediate "<filename>.htm" and "<dir1>/header.html"; the
# auxiliary directory +dir1+ itself is removed only when the caller did not
# supply +dir+.
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm("#{filename}.htm")
  FileUtils.rm_f("#{dir1}/header.html")
  FileUtils.rm_r(dir1) unless dir
end
.rootnamespace(root) ⇒ Object
178 179 180 |
# File 'lib/html2doc/base.rb', line 178
# Sets the default (prefix-less) namespace of +root+ to the HTML 4.0 URI.
def self.rootnamespace(root)
  html40 = "http://www.w3.org/TR/REC-html40"
  root.add_namespace(nil, html40)
end
.set_footnote_link_attrs(a, i) ⇒ Object
72 73 74 75 76 77 |
# File 'lib/html2doc/notes.rb', line 72
# Sets the style, href, name and title attributes that make link +a+ the
# Word footnote reference number +i+.
def self.set_footnote_link_attrs(a, i)
  {
    "style" => "mso-footnote-id:ftn#{i}",
    "href" => "#_ftn#{i}",
    "name" => "_ftnref#{i}",
  }.each { |attr, value| a[attr] = value }
  a["title"] = ""
end
.style_list(li, level, liststyle, listnumber) ⇒ Object
8 9 10 11 12 13 14 15 16 |
# File 'lib/html2doc/lists.rb', line 8
# Appends the Word list declaration
# "mso-list:<liststyle> level<level> lfo<listnumber>;" to the style
# attribute of +li+, after a ";" when a style is already present.
# Does nothing when +liststyle+ is nil/false.
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  existing = li["style"] ? "#{li["style"]};" : ""
  li["style"] = "#{existing}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end
.stylesheet(filename, header_filename, fn) ⇒ Object
140 141 142 143 144 145 146 147 148 |
# File 'lib/html2doc/base.rb', line 140
# Returns a <style> element, as a String, whose content is the stylesheet
# wrapped in an XML comment. The stylesheet is read from +fn+, or from
# "wordstyle.css" next to this file when +fn+ is nil or empty, and has its
# FILENAME placeholders resolved.
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = File.read(fn, encoding: "UTF-8")
  css = filename_substitute(css, header_filename, filename)
  xml = Nokogiri::XML("<style/>")
  xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.root.to_s
end
.to_xhtml(xml) ⇒ Object
65 66 67 68 69 70 71 72 |
# File 'lib/html2doc/base.rb', line 65
# Parses the HTML string as XML. The XML declaration is removed from +xml+
# in place, and the XHTML strict DOCTYPE is prepended when the text has no
# DOCTYPE of its own.
def self.to_xhtml(xml)
  xml.gsub!(/<\?xml[^>]*>/, "")
  has_doctype = /<!DOCTYPE /.match(xml)
  unless has_doctype
    xml = '<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end
.transform_footnote_text(note) ⇒ Object
57 58 59 60 61 62 63 64 65 |
# File 'lib/html2doc/notes.rb', line 57
# Prepares footnote element +note+ for relocation: blanks its id, unwraps
# nested <div>s, turns nested <aside>/<p> elements into <p> with class
# MsoFootnoteText, and detaches the element from the document.
# Returns the result of +note.remove+.
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |wrapper|
    wrapper.replace(wrapper.children)
  end
  note.xpath(".//aside | .//p").each do |para|
    para.name = "p"
    para["class"] = "MsoFootnoteText"
  end
  note.remove
end
.uncenter(m, ooxml) ⇒ Object
If the OOXML math element has no siblings, it is centered by default; override this with left/right alignment if an ancestor is so styled.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/html2doc/math.rb', line 68
# Returns +ooxml+, wrapped in an <m:oMathPara> with left or right
# justification when the math element +m+ has no siblings and a styled
# p/div/td ancestor's style contains "text-align:left" or
# "text-align:right" (left is checked first). In every other case +ooxml+
# is returned unchanged.
def self.uncenter(m, ooxml)
  return ooxml unless m.next.nil? && m.previous.nil?

  alignnode = m.at(".//ancestor::*[@style][local-name() = 'p' or "\
                   "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless alignnode

  side = %w[left right].find do |candidate|
    alignnode.text.include?("text-align:#{candidate}")
  end
  return ooxml unless side

  "<m:oMathPara><m:oMathParaPr><m:jc "\
  "m:val='#{side}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
end
.warnsvg(src) ⇒ Object
84 85 86 |
# File 'lib/html2doc/mime.rb', line 84
# Emits a warning when +src+ ends in ".svg" (case-insensitive).
def self.warnsvg(src)
  return unless /\.svg$/i.match(src)

  warn "#{src}: SVG not supported"
end