Module: Html2Doc

Defined in:: lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

# Minimal XHTML 1.0 Strict skeleton: doctype, empty title, UTF-8 meta and an
# empty body. Not referenced by the methods visible on this page.
NOKOHEAD =

<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE

# Single-line XHTML doctype declaration; from_xhtml strips it from the
# serialised document.
DOCTYPE =

<<~"DOCTYPE".freeze
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
DOCTYPE

# Word conditional-comment block (print view, 100% zoom) plus a Content-Type
# meta tag; define_head1 inserts it at the start of every <head>.
PRINT_VIEW =

<<~XML.freeze
  <!--[if gte mso 9]>
  <xml>
  <w:WordDocument>
  <w:View>Print</w:View>
  <w:Zoom>100</w:Zoom>
  <w:DoNotOptimizeForBrowser/>
  </w:WordDocument>
  </xml>
  <![endif]-->
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
XML

# XHTML namespace declaration, interpolated into the wrapper <span> markup
# built by unitalic.
HTML_NS =

'xmlns="http://www.w3.org/1999/xhtml"'.freeze

# XPath matching img and imagedata elements regardless of namespace.
# Not referenced by the methods visible on this page.
IMAGE_PATH =

"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze

# XPath predicate restricting a list query to lists that are not nested inside
# another ul/ol; appended to queries in lists1 and lists_unstyled.
TOPLIST =

"[not(ancestor::ul) and not(ancestor::ol)]".freeze

# Markup for Word's footnote reference marker; used as the link content in
# process_footnote_link and as the fallback marker in footnote_container.
FN =

"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze

# Version string of the gem.
VERSION =

"1.1.3".freeze

Class Method Summary collapse

.add_stylesheet(head, title, css) ⇒ Object
.asciimath_to_mathml(doc, delims) ⇒ Object
.asciimath_to_mathml1(expr) ⇒ Object
.bookmarks(docxml) ⇒ Object
.cleanup(docxml, hash) ⇒ Object
.clear_dir(dir) ⇒ Object
.contentid(mhtml) ⇒ Object
.create_dir(filename, dir) ⇒ Object
.define_head(docxml, hash) ⇒ Object
.define_head1(docxml, _dir) ⇒ Object
.esc_space(xml) ⇒ Object

escape space as &#x32;; we are removing any spaces generated by XML indentation.
.filename_substitute(head, header_filename) ⇒ Object
.footnote?(elem) ⇒ Boolean
.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p).
.footnote_container(docxml, idx) ⇒ Object
.footnote_div_to_p(elem) ⇒ Object
.footnotes(docxml) ⇒ Object
.from_xhtml(xml) ⇒ Object
.generate_filelist(filename, dir) ⇒ Object
.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>.
.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object
.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images.
.image_resize(img, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680.
.list2para(list) ⇒ Object
.list_add(xpath, liststyles, listtype, level) ⇒ Object
.list_add1(elem, liststyles, listtype, level) ⇒ Object
.lists(docxml, liststyles) ⇒ Object
.lists1(docxml, liststyles, style) ⇒ Object
.lists_unstyled(docxml, liststyles) ⇒ Object
.localname(src, localdir) ⇒ Object
.mathml_insert_rows(math, docnamespaces) ⇒ Object
.mathml_preserve_space(math, docnamespaces) ⇒ Object
.mathml_to_ooml(docxml) ⇒ Object
.mathml_to_ooml1(xml, docnamespaces) ⇒ Object
.mime_attachment(boundary, _filename, item, dir) ⇒ Object
.mime_boundary ⇒ Object
.mime_package(result, filename, dir) ⇒ Object
.mime_preamble(boundary, filename, result) ⇒ Object
.mime_type(item) ⇒ Object
.mkuuid ⇒ Object
.msonormal(docxml) ⇒ Object
.msword_fix(doc) ⇒ Object
.namespace(root) ⇒ Object
.ooml_clean(xml) ⇒ Object

We need span and em not to be namespaced.
.ooxml_cleanup(math, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly.
.process(result, hash) ⇒ Object
.process_footnote_link(docxml, elem, idx, footnote) ⇒ Object
.process_footnote_link1(elem) ⇒ Object
.process_footnote_texts(docxml, footnotes) ⇒ Object
.process_header(headerfile, hash) ⇒ Object
.process_html(result, hash) ⇒ Object
.progress_conv(idx, step, total, threshold, msg) ⇒ Object
.rm_temp_files(filename, dir, dir1) ⇒ Object
.rootnamespace(root) ⇒ Object
.set_footnote_link_attrs(elem, idx) ⇒ Object
.style_list(elem, level, liststyle, listnumber) ⇒ Object
.stylesheet(_filename, _header_filename, fn) ⇒ Object
.to_plane1(xml, font) ⇒ Object
.to_xhtml(xml) ⇒ Object
.transform_footnote_text(note) ⇒ Object
.uncenter(math, ooxml) ⇒ Object

if the OOXML math has no siblings, by default it is centered; override this with left/right if the parent is so tagged.
.unitalic(math) ⇒ Object
.unwrap_accents(doc) ⇒ Object
.warnsvg(src) ⇒ Object

Class Method Details

.add_stylesheet(head, title, css) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 170

# Inserts the stylesheet markup +css+ into +head+.
#
# Placement rules:
# * empty head: appended as the only child;
# * +title+ given: placed right after the title;
# * otherwise: placed before the first existing child.
#
# @param head [#children, #add_child] the head element
# @param title [#add_next_sibling, nil] the title element, if any
# @param css [String] stylesheet markup to insert
# @return [Object] whatever the insertion call returns
def self.add_stylesheet(head, title, css)
  kids = head.children
  return head.add_child(css) if kids.empty?
  return title.add_next_sibling(css) unless title.nil?

  kids.first.add_previous_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 23

# Replaces every delimited AsciiMath expression in +doc+ with MathML.
#
# The text is split on the opening and closing delimiters (which are dropped
# from the output) and consumed in groups of four:
# [text, opening delimiter, expression, closing delimiter].
#
# @param doc [String] source markup
# @param delims [Array<String>, nil] opening and closing delimiter; when nil or
#   shorter than two entries the document is returned untouched
# @return [String] the document with expressions converted
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  parts = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  total = parts.size / 4
  parts.each_slice(4).each_with_index.map do |(text, _open, expr, _close), i|
    progress_conv(i, 500, total, 1000, "AsciiMath")
    # A trailing opening delimiter gives a group with no expression; keep the
    # text instead of raising on String + nil.
    expr.nil? ? text : text + asciimath_to_mathml1(expr)
  end.join
end

.asciimath_to_mathml1(expr) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 12

# Converts a single AsciiMath expression to a MathML string.
#
# HTML entities in +expr+ are decoded before parsing, and the bare <math> tag
# in the result is given the MathML namespace.
#
# @param expr [String] AsciiMath source
# @return [String] MathML markup
# @raise [StandardError] re-raised after the failing expression and the error
#   message have been printed to standard output
def self.asciimath_to_mathml1(expr)
  decoded = HTMLEntities.new.decode(expr)
  ast = AsciiMath.parse(decoded).ast
  builder = AsciiMath::MathMLBuilder.new(msword: true)
  mathml = builder.append_expression(ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
rescue StandardError => e
  puts "parsing: #{expr}"
  puts e.message
  raise e
end

.bookmarks(docxml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 193

# Turns element ids into named anchors.
#
# Every element with a non-empty id, no name attribute and a style other than
# 'mso-element:footnote' gets an <a name='ID'></a> as its first child and
# loses its id attribute. VML shape and shapetype elements are left alone.
#
# @param docxml [#xpath] parsed document
# @return [Object] the node set that was iterated
def self.bookmarks(docxml)
  skipped = %w(shapetype v:shapetype shape v:shape)
  candidates =
    docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
  candidates.each do |node|
    next if node["id"].empty? || skipped.include?(node.name)

    anchor = "<a name='#{node['id']}'></a>"
    if node.children.empty?
      node.add_child(anchor)
    else
      node.children.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 54

# Runs the document clean-up passes, in order, over the parsed document.
#
# @param docxml [Object] parsed document; modified in place
# @param hash [Hash] options; reads :dir1, :filename and :liststyles
# @return [Object] the same +docxml+
def self.cleanup(docxml, hash)
  # Declare the Office namespace prefixes on the root element.
  namespace(docxml.root)
  # Images are resolved relative to the directory of the output filename.
  image_cleanup(docxml, hash[:dir1], File.dirname(hash[:filename]))
  mathml_to_ooml(docxml)
  lists(docxml, hash[:liststyles])
  footnotes(docxml)
  bookmarks(docxml)
  msonormal(docxml)
  docxml
end

.clear_dir(dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 27

# Deletes every entry directly inside +dir+ (hidden files included).
#
# Entries are removed with File.delete, so a sub-directory makes the call
# raise.
#
# @param dir [String] directory to empty
# @return [String] +dir+
def self.clear_dir(dir)
  Dir.foreach(dir) do |entry|
    next if %w[. ..].include?(entry)

    File.delete(File.join(dir, entry))
  end
  dir
end

.contentid(mhtml) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 69

# Rewrites local image references to MIME content ids.
#
# For <img> and then <v:imagedata> tags whose src attribute opens with a
# double quote, the src value becomes "cid:" plus the basename of the
# original. Values starting with data:, http: or https: are left unchanged.
#
# @param mhtml [String] MIME/HTML text
# @return [String] text with rewritten src attributes
def self.contentid(mhtml)
  %w[img v:imagedata].reduce(mhtml) do |text, tag|
    text.gsub(%r{(<#{tag}[^>]*?src=")([^\"']+)(['"])}m) do |whole|
      prefix, src, quote = Regexp.last_match.captures
      if /^data:|^https?:/.match?(src)
        whole
      else
        "#{prefix}cid:#{File.basename(src)}#{quote}"
      end
    end
  end
end

.create_dir(filename, dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 35

# Prepares the directory that holds the document's auxiliary files.
#
# When +dir+ is given it is emptied and returned. Otherwise a directory named
# "<filename>_files" is created if missing, emptied and returned.
#
# @param filename [String] output file stem
# @param dir [String, nil] caller-supplied directory
# @return [String] path of the prepared directory
def self.create_dir(filename, dir)
  return clear_dir(dir) if dir

  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  Dir.mkdir(dir) unless File.exist?(dir)
  clear_dir(dir)
end

.define_head(docxml, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 160

# Populates the document <head>: stylesheet, header-file reference, Word print
# view block and the root namespace.
#
# @param docxml [Object] parsed document; modified in place
# @param hash [Hash] options; reads :filename, :header_file, :stylesheet, :dir1
# @return [Object] result of rootnamespace
def self.define_head(docxml, hash)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  filename_substitute(head, hash[:header_file])
  # Runs after the stylesheet is added: define_head1 needs <head> to have a
  # first child.
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, _dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 131

# Prepends the Word print-view block and the File-List link to every <head>.
#
# The markup is inserted before the first child of the head, so each head must
# already have at least one child.
#
# @param docxml [#xpath] parsed document
# @param _dir [Object] unused
# @return [Object] the node set that was iterated
def self.define_head1(docxml, _dir)
  docxml.xpath("//*[local-name() = 'head']").each do |head|
    preamble =
      "#{PRINT_VIEW}\n  <link rel=\"File-List\" href=\"cid:filelist.xml\"/>\n"
    head.children.first.add_previous_sibling(preamble)
  end
end

.esc_space(xml) ⇒ `Object`

escape space as &#x32;; we are removing any spaces generated by XML indentation

# File 'lib/html2doc/math.rb', line 164

# Walks every node of +xml+ and returns +xml+.
#
# NOTE(review): as written this does not modify the document. The assignment
# inside the block only rebinds the block-local variable +n+ to a new String;
# the text node itself is never updated. Also, "&#x32;" is the character
# reference for the digit "2", not for a space (that would be "&#x20;") -
# confirm the intended behaviour before turning this into a real substitution.
#
# @param xml [#traverse] document or node to walk
# @return [Object] +xml+, unchanged
def self.esc_space(xml)
  xml.traverse do |n|
    next unless n.text?

    # Result is discarded: only the local variable is reassigned.
    n = n.text.gsub(/ /, "&#x32;")
  end
  xml
end

.filename_substitute(head, header_filename) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 140

# Points FILENAME placeholders in stylesheet url("...") references at the
# packaged header file.
#
# Each <style> element under +head+ is serialised, every url("...") whose text
# contains FILENAME is replaced by url(cid:header.html), and the element is
# replaced by the rewritten markup. Does nothing without a header file.
#
# @param head [#xpath] head element
# @param header_filename [String, nil] header file path; nil disables the pass
# @return [Object, nil] the iterated node set, or nil when skipped
def self.filename_substitute(head, header_filename)
  return if header_filename.nil?

  head.xpath(".//*[local-name() = 'style']").each do |style|
    rewritten = style.to_xml.gsub(/url\("[^"]+"\)/) do |ref|
      ref.include?("FILENAME") ? "url(cid:header.html)" : ref
    end
    style.replace(rewritten)
  end
end

.footnote?(elem) ⇒ `Boolean`

Returns: (Boolean) — whether the element's epub:type or class attribute equals "footnote", ignoring case.

# File 'lib/html2doc/notes.rb', line 85

# Tells whether +elem+ is marked as a footnote reference.
#
# True when its epub:type or class attribute equals "footnote", ignoring case.
#
# @param elem [#[]] element (or any attribute map)
# @return [Boolean, nil] true on a match; false or nil otherwise
def self.footnote?(elem)
  marked = ->(attr) { elem[attr]&.casecmp("footnote")&.zero? }
  marked.call("epub:type") || marked.call("class")
end

.footnote_cleanup(docxml) ⇒ `Object`

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem

# File 'lib/html2doc/notes.rb', line 102

# Moves each footnote's back-reference anchor into the footnote text.
#
# For every <a> that is a direct child of a footnote div, the anchor is
# detached and re-inserted before the first child of the element following it.
# Anchors with no following element, or whose follower has no children, stay
# where they are.
#
# @param docxml [#xpath] parsed document
# @return [Object] +docxml+
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    target = anchor.next_element&.children&.first
    target&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(docxml, idx) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 40

# Builds the markup of the container div for footnote number +idx+.
#
# The anchor content is copied from the in-text link pointing at #_ftn<idx>
# (serialised without indentation, newlines between tags removed); when no
# such link exists the standard marker FN is used.
#
# @param docxml [#at, nil] parsed document
# @param idx [Integer] footnote number
# @return [String] div markup ending in a newline
def self.footnote_container(docxml, idx)
  link = docxml&.at("//a[@href='#_ftn#{idx}']")
  marker = link&.children&.to_xml(indent: 0)&.gsub(/>\n</, "><") || FN
  "<div style='mso-element:footnote' id='ftn#{idx}'>\n" \
    "  <a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'\n" \
    "     name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{marker.strip}</a></div>\n"
end

.footnote_div_to_p(elem) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 26

# Normalises a div/aside footnote body to paragraph form.
#
# A div or aside that already contains a <p> is replaced by its children;
# otherwise it is renamed to <p> with class MsoFootnoteText. Other elements
# are left untouched.
#
# @param elem [Object] footnote body element
# @return [Object, nil] nil when +elem+ is neither div nor aside
def self.footnote_div_to_p(elem)
  return unless %w{div aside}.include?(elem.name)
  return elem.replace(elem.children) if elem.at(".//p")

  elem.name = "p"
  elem["class"] = "MsoFootnoteText"
end

.footnotes(docxml) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 4

# Converts footnote links and their targets into Word footnotes.
#
# Every <a> in the document is offered to process_footnote_link; the footnote
# number starts at 1 and only advances when a link was actually processed. The
# collected footnote bodies are then handed to process_footnote_texts.
#
# @param docxml [#xpath] parsed document
# @return [Object] result of process_footnote_texts
def self.footnotes(docxml)
  notes = []
  docxml.xpath("//a").inject(1) do |number, link|
    process_footnote_link(docxml, link, number, notes) ? number + 1 : number
  end
  process_footnote_texts(docxml, notes)
end

.from_xhtml(xml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 86

# Serialises the document and strips XHTML artefacts.
#
# Removes the first XHTML namespace declaration and the first DOCTYPE
# declaration, and tightens every " />" to "/>".
#
# @param xml [#to_xml] parsed document
# @return [String] serialised markup
def self.from_xhtml(xml)
  serialized = xml.to_xml
  without_ns = serialized.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  without_doctype = without_ns.sub(DOCTYPE, "")
  without_doctype.gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 143

# Writes filelist.xml into +dir+, listing the main file and every non-hidden
# entry of +dir+ in sorted order.
#
# The directory is read after filelist.xml has been opened, so the list
# includes filelist.xml itself.
#
# @param filename [String] output file stem, referenced as "../<filename>.htm"
# @param dir [String] auxiliary files directory
# @return [Integer] bytes written by the final write
def self.generate_filelist(filename, dir)
  header = %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">\n} +
    %{      <o:MainFile HRef="../#{filename}.htm"/>}
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write(header)
    visible = Dir.entries(dir).sort.reject { |item| /^\./.match?(item) }
    visible.each { |item| out.write(%{  <o:File HRef="#{item}"/>\n}) }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ `Object`

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>

# File 'lib/html2doc/mime.rb', line 125

# Rewrites image references in the raw header file text.
#
# The text is handled as a string rather than parsed, since it contains
# non-XML constructs. It is cut at every <img ...> and <v:imagedata ...> tag
# and each [preceding text, tag] pair is passed to header_image_cleanup1.
#
# @param doc [String] header file content
# @param dir [String] directory receiving copied images
# @param filename [String] output file stem
# @param localdir [String] directory against which relative paths are resolved
# @return [String] rewritten header text
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  cleaned = pieces.each_slice(2).map do |pair|
    header_image_cleanup1(pair, dir, filename, localdir)
  end
  cleaned.join
end

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 131

# Processes one [text, image tag] pair from the header file.
#
# A local image (src not starting with http(s): and not a base64 data: URI in
# double quotes) is copied into +dir+ under a fresh UUID name that keeps the
# original file suffix, and the tag's src is rewritten in place to
# 'cid:<new name>'. Anything else is passed through.
#
# @param a [Array<String>] [text] or [text, tag]; the tag is mutated in place
# @param dir [String] directory receiving the copy
# @param _filename [Object] unused
# @param localdir [String] directory against which relative paths are resolved
# @return [String] the pair joined back together
def self.header_image_cleanup1(a, dir, _filename, localdir)
  return a.join unless a.size == 2

  tag = a[1]
  return a.join if / src="https?:/.match?(tag)
  return a.join if %r{ src="data:(image|application)/[^;]+;base64}.match?(tag)

  src = / src=['"](?<src>[^"']+)['"]/.match(tag)[:src]
  suffix = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match(src)[:suffix]
  new_filename = "#{mkuuid}.#{suffix}"
  FileUtils.cp(localname(src, localdir), File.join(dir, new_filename))
  tag.sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ `Object`

only processes locally stored images

# File 'lib/html2doc/mime.rb', line 108

# Copies locally stored images next to the document and repoints their src.
#
# For each img / v:imagedata element whose src is neither an http(s) URL nor a
# base64 data: URI, the file is copied into +dir+ under a UUID name keeping
# the original extension, width/height are set from image_resize (limits:
# height 680, width 400) and src becomes "<basename of dir>/<new name>".
#
# @param docxml [#traverse] parsed document; modified in place
# @param dir [String] directory receiving the copies
# @param localdir [String] directory against which relative paths are resolved
# @return [Object] +docxml+
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |node|
    next unless node.element? && %w(img v:imagedata).include?(node.name)

    src = node["src"]
    next if /^http/.match?(src)
    next if %r{^data:(image|application)/[^;]+;base64}.match?(src)

    source_path = localname(src, localdir)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp(source_path, File.join(dir, new_filename))
    node["width"], node["height"] = image_resize(node, source_path, 680, 400)
    node["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(img, path, maxheight, maxwidth) ⇒ `Object`

max width for Word document is 400, max height is 680

# File 'lib/html2doc/mime.rb', line 80

# Computes the display size of an image, bounded by the given limits.
#
# Starts from the element's width/height attributes (falling back to the real
# image size when both are missing or zero), fills in a single missing
# dimension from the real aspect ratio, then scales down to +maxheight+ and
# afterwards to +maxwidth+. All arithmetic is integer arithmetic.
#
# @param img [#[]] image element (attribute access for "width" and "height")
# @param path [String] path of the image file
# @param maxheight [Integer] height limit
# @param maxwidth [Integer] width limit
# @return [Array] [width, height], or [nil, nil] when the real size is unknown
def self.image_resize(img, path, maxheight, maxwidth)
  real = ImageSize.path(path).size
  width = img["width"].to_i
  height = img["height"].to_i
  unsized = width.zero? && height.zero?
  return [nil, nil] if real.nil? || real[0].nil? || real[1].nil?

  width, height = real if unsized
  if height.zero? && !width.zero?
    height = width * real[1] / real[0]
  end
  if width.zero? && !height.zero?
    width = height * real[0] / real[1]
  end
  if height > maxheight
    width = (width * maxheight / height).ceil
    height = maxheight
  end
  if width > maxwidth
    height = (height * maxwidth / width).ceil
    width = maxwidth
  end
  [width, height]
end

.list2para(list) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 49

# Flattens a list into Word list paragraphs.
#
# Classes are only assigned where none is present: the first item gets
# MsoListParagraphCxSpFirst, the last MsoListParagraphCxSpLast, all others
# (and any <p> directly inside an item) MsoListParagraphCxSpMiddle. Each <li>
# becomes a <p>; a leading <p> child is unwrapped. The list element itself is
# then replaced by its children. Lists without direct <li> children are left
# alone.
#
# @param list [Object] ul/ol element
# @return [Object, nil] nil when the list has no items
def self.list2para(list)
  items = list.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  list.xpath("./li/p").each do |para|
    para["class"] ||= "MsoListParagraphCxSpMiddle"
  end
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    lead = item.first_element_child
    lead.replace(lead.children) if lead&.name == "p"
  end
  list.replace(list.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 32

# Applies Word list styling to each list in +xpath+ and recurses into nested
# lists.
#
# @param xpath [Enumerable] the list elements (ul/ol) to process
# @param liststyles [Hash] list type => Word list style name
# @param listtype [Object] key into +liststyles+ for these lists
# @param level [Integer] nesting level; 1 for top-level lists
# @return [Object] the collection that was iterated
def self.list_add(xpath, liststyles, listtype, level)
  xpath.each_with_index do |l, _i|
    # Each top-level list takes the next list number and is flagged with a
    # temporary "seen" attribute (queried and removed in lists_unstyled).
    @listnumber += 1 if level == 1
    l["seen"] = true if level == 1
    # Ensure the list has an id; the XPath below refers to it.
    l["id"] ||= UUIDTools::UUID.random_create
    # Items of this list only: all descendant <li> minus those that sit inside
    # a nested ul/ol.
    (l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
      style_list(li, level, liststyles[listtype], @listnumber)
      list_add1(li, liststyles, listtype, level)
    end
    # Nested lists with no <li> ancestor below an element carrying this list's
    # id; list_add1 re-adds 1 to the level, so they are processed at +level+.
    l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
            ".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
      .each do |li|
      list_add1(li.parent, liststyles, listtype, level - 1)
    end
  end
end

.list_add1(elem, liststyles, listtype, level) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 18

# Styles the lists nested directly under +elem+, one level deeper.
#
# Outermost nested <ul> elements are processed first, then outermost nested
# <ol> elements. When the parent list type is the generic :ul or :ol, each
# nested list is styled by its own tag; otherwise the named list type is
# inherited.
#
# @param elem [#xpath] element whose nested lists are processed
# @param liststyles [Hash] list type => Word list style name
# @param listtype [Object] list type of the enclosing list
# @param level [Integer] nesting level of the enclosing list
# @return [Object] result of the last list_add call
def self.list_add1(elem, liststyles, listtype, level)
  generic = %i[ul ol].include?(listtype)
  ul_type = generic ? :ul : listtype
  ol_type = generic ? :ol : listtype
  deeper = level + 1

  nested_ul = elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul")
  list_add(nested_ul, liststyles, ul_type, deeper)
  nested_ol = elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol")
  list_add(nested_ol, liststyles, ol_type, deeper)
end

.lists(docxml, liststyles) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 91

# Applies Word list styling to the whole document.
#
# Resets the list counter, styles lists for every configured list type, then
# the remaining unstyled lists, and finally flattens <ul> / <ol> elements into
# paragraphs for whichever of :ul / :ol is configured.
#
# @param docxml [#xpath] parsed document; modified in place
# @param liststyles [Hash, nil] list type => Word list style name; nil skips
#   list processing altogether
# @return [Object, nil] nil when +liststyles+ is nil
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key { |style| lists1(docxml, liststyles, style) }
  lists_unstyled(docxml, liststyles)
  flatten = lambda do |tag|
    liststyles.key?(tag.to_sym) &&
      docxml.xpath("//#{tag}").each { |list| list2para(list) }
  end
  flatten.call("ul")
  flatten.call("ol")
end

.lists1(docxml, liststyles, style) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 66

# Styles the top-level lists belonging to one configured list type.
#
# :ul and :ol select the class-less top-level lists of that tag; any other
# value selects top-level ol/ul elements whose class equals it.
#
# @param docxml [#xpath] parsed document
# @param liststyles [Hash] list type => Word list style name
# @param style [Object] the list type being processed
# @return [Object] result of list_add
def self.lists1(docxml, liststyles, style)
  query =
    if %i[ul ol].include?(style)
      "//#{style}[not(@class)]#{TOPLIST}"
    else
      "//ol[@class = '#{style}']#{TOPLIST} | //ul[@class = '#{style}']#{TOPLIST}"
    end
  list_add(docxml.xpath(query), liststyles, style, 1)
end

.lists_unstyled(docxml, liststyles) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 79

# Styles the top-level lists that no configured list type has claimed yet,
# then removes the temporary "seen" markers.
#
# Unclaimed <ul> lists are styled as :ul and unclaimed <ol> lists as :ol, each
# only when that type is configured. (Ordered lists were previously passed on
# as :ul, so they took the bullet style, or none when only :ol was
# configured.)
#
# @param docxml [#xpath] parsed document; modified in place
# @param liststyles [Hash] list type => Word list style name
# @return [Object] the node set whose markers were removed
def self.lists_unstyled(docxml, liststyles)
  %i[ul ol].each do |tag|
    next unless liststyles.key?(tag)

    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"),
             liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.localname(src, localdir) ⇒ `Object`



103
104
105

# File 'lib/html2doc/mime.rb', line 103

# Resolves an image path against the local directory.
#
# Paths starting with "/" (optionally after an upper-case drive letter and
# colon) are treated as absolute and returned unchanged; all others are joined
# onto +localdir+.
#
# @param src [String] image path
# @param localdir [String] base directory
# @return [String] resolved path
def self.localname(src, localdir)
  return src if %r{^([A-Z]:)?/}.match?(src)

  File.join(localdir, src)
end

.mathml_insert_rows(math, docnamespaces) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 60

# Wraps the element following each MathML script / under-over construct in an
# <mrow>, unless it already is one.
#
# The previous check compared the node itself to the String "mrow", which is
# never equal, so existing <mrow> elements were wrapped a second time.
#
# @param math [#xpath] MathML element; modified in place
# @param docnamespaces [Hash] namespace bindings for the XPath query
# @return [Object] +math+
def self.mathml_insert_rows(math, docnamespaces)
  query = %w(msup msub msubsup munder mover munderover)
    .map { |m| ".//xmlns:#{m}" }.join(" | ")
  math.xpath(query, docnamespaces).each do |x|
    following = x.next_element
    next if following.nil? || following.name == "mrow"

    following.wrap("<mrow/>")
  end
  math
end

.mathml_preserve_space(math, docnamespaces) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 70

# Protects leading and trailing whitespace inside <mtext> elements.
#
# The serialised content of each mtext has a whitespace character at the start
# or end of a line replaced by the &#xA0; character reference.
#
# @param math [#xpath] MathML element; modified in place
# @param docnamespaces [Hash] namespace bindings for the XPath query
# @return [Object] +math+
def self.mathml_preserve_space(math, docnamespaces)
  math.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    content = mtext.children.to_xml
    content = content.gsub(/^\s/, "&#xA0;")
    mtext.children = content.gsub(/\s$/, "&#xA0;")
  end
  math
end

.mathml_to_ooml(docxml) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 134

# Converts every MathML <math> element of the document to OOXML math.
#
# Progress is reported through progress_conv (step 100, threshold 500).
#
# @param docxml [#xpath, #collect_namespaces] parsed document
# @return [Object] the node set of math elements
def self.mathml_to_ooml(docxml)
  namespaces = docxml.collect_namespaces
  formulas = docxml.xpath("//*[local-name() = 'math']")
  total = formulas.size
  formulas.each_with_index do |formula, idx|
    progress_conv(idx, 100, total, 500, "Math OOXML")
    mathml_to_ooml1(formula, namespaces)
  end
end

.mathml_to_ooml1(xml, docnamespaces) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 154

# Converts one MathML element to OOXML math and swaps it into the document.
#
# The cleaned element becomes the root of a fresh document, is run through the
# @xsltemplate stylesheet, post-processed (esc_space, unitalic, ooml_clean,
# uncenter) and substituted for the original element.
#
# @param xml [Object] the math element; replaced in its document
# @param docnamespaces [Hash] namespace bindings of the source document
# @return [Object] result of the swap
def self.mathml_to_ooml1(xml, docnamespaces)
  doc = Nokogiri::XML::Document.new
  doc.root = ooxml_cleanup(xml, docnamespaces)
  transformed = @xsltemplate.transform(doc)
  ooxml = ooml_clean(unitalic(esc_space(transformed)))
  xml.swap(uncenter(xml, ooxml))
end

.mime_attachment(boundary, _filename, item, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 23

# Builds one base64-encoded MIME part for a file of the auxiliary directory.
#
# Files whose content type starts with "text" or "application" are read as
# UTF-8 text, everything else as binary. The base64 payload is broken into
# lines of 76 characters.
#
# @param boundary [String] MIME boundary
# @param _filename [Object] unused
# @param item [String] file name inside +dir+
# @param dir [String] directory containing the file
# @return [String] the MIME part, ending in a blank line
def self.mime_attachment(boundary, _filename, item, dir)
  content_type = mime_type(item)
  textual = content_type.start_with?("text") ||
    content_type.start_with?("application")

  path = File.join(dir, item)
  content = textual ? File.read(path, encoding: "utf-8") : IO.binread(path)
  encoded = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  name = File.basename(item)
  [
    "--#{boundary}",
    "Content-ID: <#{name}>",
    "Content-Disposition: inline; filename=\"#{name}\"",
    "Content-Transfer-Encoding: base64",
    "Content-Type: #{content_type}",
    "",
    encoded,
    "",
    "",
  ].join("\n")
end

.mime_boundary ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 50

# Generates a MIME multipart boundary.
#
# The boundary is "----=_NextPart_" followed by the first 18 characters of a
# random UUID whose dashes are replaced by dots.
#
# @return [String] boundary string
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  "----=_NextPart_#{uuid.tr('-', '.')[0, 18]}"
end

.mime_package(result, filename, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 55

# Assembles the MIME package and writes it to "<filename>.doc".
#
# The package consists of the HTML preamble, filelist.xml, every other
# non-hidden file of +dir+ and the closing boundary. Image references are
# converted to content ids just before writing.
#
# @param result [String] the Word HTML document
# @param filename [String] output file stem
# @param dir [String] auxiliary files directory
# @return [Integer] bytes written
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  main_file = "#{filename}.htm"
  parts = [
    mime_preamble(boundary, main_file, result),
    mime_attachment(boundary, main_file, "filelist.xml", dir),
  ]
  Dir.foreach(dir) do |item|
    next if /^\./.match?(item) || item == "filelist.xml"

    parts << mime_attachment(boundary, main_file, item, dir)
  end
  parts << "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") do |out|
    out.write(contentid(parts.join))
  end
end

.mime_preamble(boundary, filename, result) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 8

# Builds the MIME header and the first (HTML) part of the package.
#
# @param boundary [String] MIME boundary
# @param filename [String] name of the main HTML file; only its basename is
#   used
# @param result [String] the HTML document
# @return [String] preamble text, ending in a blank line
def self.mime_preamble(boundary, filename, result)
  name = File.basename(filename)
  [
    "MIME-Version: 1.0",
    "Content-Type: multipart/related; boundary=\"#{boundary}\"",
    "",
    "--#{boundary}",
    "Content-ID: <#{name}>",
    "Content-Disposition: inline; filename=\"#{name}\"",
    "Content-Type: text/html; charset=\"utf-8\"",
    "",
    "#{result}",
    "",
    "",
  ].join("\n")
end

.mime_type(item) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 43

# Determines the Content-Type value for a file name.
#
# The first type reported by MIME::Types is used; text types get a UTF-8
# charset suffix. When no type is known the result is
# 'text/plain; charset="utf-8"'. (MIME::Types.type_for returns an empty Array
# for unknown files, so the former truthiness check never reached that
# fallback and produced an empty Content-Type instead.)
#
# NOTE(review): the charset suffix for detected text types is appended without
# a ";" separator (e.g. 'text/html charset="utf-8"'). This is kept as-is
# because the emitted bytes may be relied upon - confirm before changing.
#
# @param item [String] file name
# @return [String] content type
def self.mime_type(item)
  detected = MIME::Types.type_for(item)&.first
  return 'text/plain; charset="utf-8"' if detected.nil?

  type = detected.to_s
  type = %(#{type} charset="utf-8") if type.start_with?("text")
  type
end

.mkuuid ⇒ `Object`



95
96
97

# File 'lib/html2doc/mime.rb', line 95

# Generates a random UUID.
#
# @return [String] the UUID in string form
def self.mkuuid
  uuid = UUIDTools::UUID.random_create
  uuid.to_s
end

.msonormal(docxml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 206

# Gives every class-less <p> and <li> element the class MsoNormal.
#
# @param docxml [#xpath] parsed document; modified in place
# @return [Object] the node set of <li> elements that was iterated
def self.msonormal(docxml)
  normalise = lambda do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
      .each { |el| el["class"] = "MsoNormal" }
  end
  normalise.call("p")
  normalise.call("li")
end

.msword_fix(doc) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 92

# Adjusts the serialised markup to the forms the MS Word parser accepts.
#
# The substitutions are applied to +doc+ in place, in table order: explicit
# closing tag for the footnote marker span, self-closing footnote list and
# several Office elements, unquoted File-List / Content-Type attribute values,
# un-prefixed span/em, and tab placeholders. Finally, whitespace between tags
# inside each <m:oMath> element is removed.
#
# @param doc [String] serialised document; mutated by the substitutions
# @return [String] the fixed markup
def self.msword_fix(doc)
  self_closing = %w[m:jc v:stroke v:f v:path o:lock v:imagedata w:wrap]
  rules = [
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    [%r{<div style="mso-element:footnote-list"></div>},
     '<div style="mso-element:footnote-list"/>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
    *self_closing.map { |tag| [%r{></#{tag}>}, "/>"] },
    [%r{<(/)?m:(span|em)\b}, "<\\1\\2"],
    [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
  ]
  # brain damage in MSWord parser
  rules.each { |pattern, replacement| doc.gsub!(pattern, replacement) }

  # Groups are [text, <m:oMath>, math content, </m:oMath>].
  groups = doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |group|
    group[2] = group[2].gsub(/>\s+</, "><") if group.size > 2
    group
  end
  groups.join
end

.namespace(root) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 180

# Declares the Office (o), Word (w), VML (v) and Office Math (m) namespace
# prefixes on +root+.
#
# @param root [#add_namespace_definition] root element
# @return [Hash] the prefix => URI table that was applied
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooml_clean(xml) ⇒ `Object`

We need span and em not to be namespaced. Word can’t deal with explicit namespaces. We will end up stripping them out again under Nokogiri 1.11, which correctly insists on inheriting namespace from parent.

# File 'lib/html2doc/math.rb', line 147

# Serialises the OOXML math fragment in the textual form Word expects.
#
# Processing instructions and all namespace declarations are removed, and
# every tag whose name starts with a lower-case letter gets the "m:" prefix,
# except tags beginning with "span" or "em", which must stay un-namespaced.
#
# @param xml [#to_s] transformed math fragment
# @return [String] cleaned markup
def self.ooml_clean(xml)
  markup = xml.to_s
  markup = markup.gsub(/<\?[^>]+>\s*/, "")
  markup = markup.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
  markup.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
end

.ooxml_cleanup(math, docnamespaces) ⇒ `Object`

random fixes to MathML input that OOXML needs to render properly

# File 'lib/html2doc/math.rb', line 50

# Prepares a MathML element for the OOXML transformation.
#
# Applies, in order, mathml_insert_rows, mathml_preserve_space and
# unwrap_accents, then sets the MathML namespace as the default namespace.
#
# @param math [Object] MathML element; modified in place
# @param docnamespaces [Hash] namespace bindings of the source document
# @return [Object] the prepared element
def self.ooxml_cleanup(math, docnamespaces)
  with_rows = mathml_insert_rows(math, docnamespaces)
  spaced = mathml_preserve_space(with_rows, docnamespaces)
  math = unwrap_accents(spaced)
  math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  math
end

.process(result, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 8

# Entry point: converts the HTML in +result+ into a Word .doc MIME package.
#
# @param result [String] source HTML
# @param hash [Hash] options; reads :filename, :dir, :header_file and :debug,
#   and stores the working directory under :dir1
# @return [Object] result of rm_temp_files, or nil when :debug is set
def self.process(result, hash)
  # Working directory for auxiliary files (caller's :dir or "<filename>_files").
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  result = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(hash[:filename], hash[:dir1])
  # Intermediate HTML file; removed by rm_temp_files unless :debug is set.
  File.open("#{hash[:filename]}.htm", "w:UTF-8") { |f| f.write(result) }
  mime_package result, hash[:filename], hash[:dir1]
  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1]) unless hash[:debug]
end

.process_footnote_link(docxml, elem, idx, footnote) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 50

# Converts one footnote link into a Word footnote reference and collects the
# footnote body it points to.
#
# The link is skipped (false) when it is not marked as a footnote, has no href
# attribute, or its target cannot be found by name or id. Otherwise its
# attributes are rewritten, its content becomes the footnote marker, and the
# detached, transformed footnote body is appended to +footnote+.
#
# @param docxml [#at] parsed document
# @param elem [Object] the <a> element
# @param idx [Integer] footnote number to assign
# @param footnote [Array] accumulator of footnote bodies; appended to
# @return [Array, false] +footnote+ when the link was processed, else false
def self.process_footnote_link(docxml, elem, idx, footnote)
  return false unless footnote?(elem)

  # A footnote-classed anchor without href has no target; skip it rather than
  # raising on nil.
  href = elem["href"]&.gsub(/^#/, "")
  return false if href.nil?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(elem, idx)
  if elem.at("./span[@class = 'MsoFootnoteReference']")
    process_footnote_link1(elem)
  else elem.children = FN
  end
  footnote << transform_footnote_text(note)
end

.process_footnote_link1(elem) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 65

# Normalises the content of a footnote link that already carries a
# MsoFootnoteReference span.
#
# Each such span child is replaced by the standard marker FN; every other
# child is wrapped in a MsoFootnoteReference span.
#
# @param elem [#children] the footnote link
# @return [Object] the children collection that was iterated
def self.process_footnote_link1(elem)
  elem.children.each do |child|
    marker = child.name == "span" && child["class"] == "MsoFootnoteReference"
    next child.replace(FN) if marker

    child.wrap("<span class='MsoFootnoteReference'></span>")
  end
end

.process_footnote_texts(docxml, footnotes) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 15

# Appends the footnote list to the document body and fills it.
#
# A footnote-list div is added to <body>; for each collected footnote (in
# order, numbered from 1) a container div is added and the footnote body is
# moved into it and normalised to paragraph form. Finally the back-reference
# anchors are relocated by footnote_cleanup.
#
# @param docxml [#at] parsed document; modified in place
# @param footnotes [Array] detached footnote bodies
# @return [Object] result of footnote_cleanup
def self.process_footnote_texts(docxml, footnotes)
  body = docxml.at("//body")
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(docxml, number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 18

# Copies the header file into the working directory as header.html, with its
# local images packaged alongside.
#
# @param headerfile [String, nil] path of the header file; nil skips the step
# @param hash [Hash] options; reads :dir1 and :filename
# @return [Integer, nil] bytes written, or nil when there is no header file
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  raw = File.read(headerfile, encoding: "utf-8")
  localdir = File.dirname(hash[:filename])
  cleaned = header_image_cleanup(raw, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") do |out|
    out.write(cleaned)
  end
end

.process_html(result, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 42

# Converts the source HTML into Word-ready markup.
#
# @param result [String] source HTML
# @param hash [Hash] options; reads :asciimathdelims here and is passed on to
#   cleanup and define_head
# @return [String] the fixed-up markup
def self.process_html(result, hash)
  # AsciiMath is converted on the raw text, before the document is parsed.
  docxml = to_xhtml(asciimath_to_mathml(result, hash[:asciimathdelims]))
  # cleanup returns the same document object, which define_head then extends.
  define_head(cleanup(docxml, hash), hash)
  msword_fix(from_xhtml(docxml))
end

.progress_conv(idx, step, total, threshold, msg) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 34

# Emits a progress warning for long-running conversions.
#
# A message "<msg> <idx> of <total>" is written with warn when +idx+ is a
# positive multiple of +step+ and +total+ exceeds +threshold+.
#
# @param idx [Integer] index of the current item
# @param step [Integer] reporting interval
# @param total [Integer] number of items
# @param threshold [Integer] minimum total for which progress is reported
# @param msg [String] label of the conversion
# @return [nil]
def self.progress_conv(idx, step, total, threshold, msg)
  on_step = idx.positive? && (idx % step).zero?
  warn "#{msg} #{idx} of #{total}" if on_step && total > threshold
end

.rm_temp_files(filename, dir, dir1) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 48

# Removes the intermediate files of a conversion.
#
# Deletes "<filename>.htm" and, if present, header.html in the working
# directory. The working directory itself is removed only when the caller did
# not supply one (+dir+ is nil or false).
#
# @param filename [String] output file stem
# @param dir [String, nil] caller-supplied directory, if any
# @param dir1 [String] working directory actually used
# @return [Object, nil] nil when the working directory is kept
def self.rm_temp_files(filename, dir, dir1)
  main_html = "#{filename}.htm"
  header_html = "#{dir1}/header.html"
  FileUtils.rm(main_html)
  FileUtils.rm_f(header_html)
  return if dir

  FileUtils.rm_r(dir1)
end

.rootnamespace(root) ⇒ `Object`



189
190
191

# File 'lib/html2doc/base.rb', line 189

# Sets the default namespace of the root element to the HTML 4.0
# recommendation URI.
#
# @param root [#add_namespace] root element
# @return [Object] result of add_namespace
def self.rootnamespace(root)
  html40 = "http://www.w3.org/TR/REC-html40"
  root.add_namespace(nil, html40)
end

.set_footnote_link_attrs(elem, idx) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 90

# Rewrites the attributes of a footnote link for footnote number +idx+.
#
# Sets style (mso-footnote-id), href (#_ftn<idx>), name (_ftnref<idx>) and an
# empty title, in that order.
#
# @param elem [#[]=] the <a> element
# @param idx [Integer] footnote number
# @return [String] the empty title value
def self.set_footnote_link_attrs(elem, idx)
  {
    "style" => "mso-footnote-id:ftn#{idx}",
    "href" => "#_ftn#{idx}",
    "name" => "_ftnref#{idx}",
  }.each { |attr, value| elem[attr] = value }
  elem["title"] = ""
end

.style_list(elem, level, liststyle, listnumber) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 7

# Appends the Word list declaration to an element's style attribute.
#
# An existing style value is kept and separated from the declaration with
# ";". Nothing happens when no list style is configured.
#
# @param elem [#[], #[]=] list item element
# @param level [Integer] nesting level
# @param liststyle [String, nil] Word list style name
# @param listnumber [Integer] list number
# @return [String, nil] the new style value, or nil when skipped
def self.style_list(elem, level, liststyle, listnumber)
  return unless liststyle

  existing = elem["style"]
  prefix = existing ? "#{existing};" : ""
  elem["style"] = "#{prefix}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(_filename, _header_filename, fn) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 151

# Builds the <style> element that carries the Word stylesheet.
#
# The stylesheet is read from +fn+, or from wordstyle.css next to this source
# file when +fn+ is nil or empty, and is embedded as an XML comment inside the
# <style> element.
#
# @param _filename [Object] unused
# @param _header_filename [Object] unused
# @param fn [String, nil] path of the stylesheet file
# @return [String] serialised <style> element
def self.stylesheet(_filename, _header_filename, fn)
  if fn.nil? || fn.empty?
    fn = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  css = File.read(fn, encoding: "UTF-8")
  doc = Nokogiri::XML("<style/>")
  comment = Nokogiri::XML::Comment.new(doc, "\n#{css}\n")
  doc.children.first << comment
  doc.root.to_s
end

.to_plane1(xml, font) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 125

# Replaces every text node under +xml+ with its rendering in the given font
# variant, as produced by Plane1Converter.
#
# HTML entities in the text are decoded before conversion.
#
# @param xml [#traverse] element to process; modified in place
# @param font [Symbol] font variant, e.g. :monospace
# @return [Object] +xml+
def self.to_plane1(xml, font)
  xml.traverse do |node|
    next unless node.text?

    decoded = HTMLEntities.new.decode(node.text)
    node.replace(Plane1Converter.conv(decoded, font))
  end
  xml
end

.to_xhtml(xml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 73

# Parses the source markup as XML.
#
# XML declarations are stripped from +xml+ in place, and an XHTML doctype is
# prepended when the text has no DOCTYPE of its own.
#
# @param xml [String] source markup; mutated when it contains an XML
#   declaration
# @return [Object] the parsed document
def self.to_xhtml(xml)
  xml.gsub!(/<\?xml[^>]*>/, "")
  has_doctype = /<!DOCTYPE /.match?(xml)
  source = has_doctype ? xml : '<!DOCTYPE html SYSTEM
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  Nokogiri::XML.parse(source)
end

.transform_footnote_text(note) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 75

# Prepares a footnote body for the footnote list and detaches it from the
# document.
#
# The id is blanked, nested divs are unwrapped, and every aside / p inside
# becomes a <p> of class MsoFootnoteText.
#
# @param note [Object] footnote body element
# @return [Object] result of removing the element from the document
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |wrapper|
    wrapper.replace(wrapper.children)
  end
  note.xpath(".//aside | .//p").each do |block|
    block.name = "p"
    block["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(math, ooxml) ⇒ `Object`

if the OOXML math has no siblings, by default it is centered; override this with left/right if the parent is so tagged

# File 'lib/html2doc/math.rb', line 175

# Adds explicit left/right justification to a stand-alone formula.
#
# When the formula has no siblings and a p/div/td ancestor with a style
# attribute exists, the OOXML is wrapped in an oMathPara with the
# justification named by "text-align:left" / "text-align:right" in that
# style. Otherwise the OOXML is returned unchanged.
#
# @param math [Object] the original math element
# @param ooxml [String] converted OOXML markup
# @return [String] possibly wrapped OOXML markup
def self.uncenter(math, ooxml)
  style_attr = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
                       "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless style_attr
  return ooxml unless math.next.nil? && math.previous.nil?

  %w(left right).reduce(ooxml) do |markup, dir|
    next markup unless style_attr.text.include?("text-align:#{dir}")

    "<m:oMathPara><m:oMathParaPr><m:jc "\
      "m:val='#{dir}'/></m:oMathParaPr>#{markup}</m:oMathPara>"
  end
end

.unitalic(math) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 79

# Translates OOXML run styling into markup Word renders correctly.
#
# First, runs with a style (sty) but no script (scr) property are wrapped in
# HTML spans: plain, bold-italic, italic and bold, in that order. Then runs
# with a script property have their text converted with to_plane1 to the
# matching font variant. Both tables are applied strictly in the order
# listed.
#
# @param math [#xpath] transformed OOXML math; modified in place
# @return [Object] +math+
def self.unitalic(math)
  span_wrappers = [
    [".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]",
     "<span #{HTML_NS} style='font-style:normal;'></span>"],
    [".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]",
     "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>"],
    [".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]",
     "<span #{HTML_NS} class='nostem'><em></em></span>"],
    [".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]",
     "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>"],
  ]
  span_wrappers.each do |query, markup|
    math.xpath(query).each { |run| run.wrap(markup) }
  end

  font_variants = [
    [".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]", :monospace],
    [".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]", :doublestruck],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]", :script],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]", :scriptbold],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]", :fraktur],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]", :frakturbold],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]", :sans],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]", :sansbold],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]", :sansitalic],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]", :sansbolditalic],
  ]
  font_variants.each do |query, font|
    math.xpath(query).each { |run| to_plane1(run, font) }
  end
  math
end

.unwrap_accents(doc) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 40

# Unwraps the <mrow> around the accent character of accented constructs.
#
# For every element with accent='true' that has at least two element
# children, a second child named mrow is replaced by its own children.
#
# @param doc [#xpath] MathML element; modified in place
# @return [Object] +doc+
def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next unless accented.elements.length > 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end

.warnsvg(src) ⇒ `Object`



99
100
101

# File 'lib/html2doc/mime.rb', line 99

# Warns that SVG images are not supported when +src+ ends in ".svg"
# (case-insensitive).
#
# @param src [String] image path
# @return [nil]
def self.warnsvg(src)
  return unless /\.svg$/i.match?(src)

  warn "#{src}: SVG not supported"
end

Module: Html2Doc

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object

.asciimath_to_mathml(doc, delims) ⇒ Object

.asciimath_to_mathml1(expr) ⇒ Object

.bookmarks(docxml) ⇒ Object

.cleanup(docxml, hash) ⇒ Object

.clear_dir(dir) ⇒ Object

.contentid(mhtml) ⇒ Object

.create_dir(filename, dir) ⇒ Object

.define_head(docxml, hash) ⇒ Object

.define_head1(docxml, _dir) ⇒ Object

.esc_space(xml) ⇒ Object

.filename_substitute(head, header_filename) ⇒ Object

.footnote?(elem) ⇒ Boolean

.footnote_cleanup(docxml) ⇒ Object

.footnote_container(docxml, idx) ⇒ Object

.footnote_div_to_p(elem) ⇒ Object

.footnotes(docxml) ⇒ Object

.from_xhtml(xml) ⇒ Object

.generate_filelist(filename, dir) ⇒ Object

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object

.image_cleanup(docxml, dir, localdir) ⇒ Object

.image_resize(img, path, maxheight, maxwidth) ⇒ Object

.list2para(list) ⇒ Object

.list_add(xpath, liststyles, listtype, level) ⇒ Object

.list_add1(elem, liststyles, listtype, level) ⇒ Object

.lists(docxml, liststyles) ⇒ Object

.lists1(docxml, liststyles, style) ⇒ Object

.lists_unstyled(docxml, liststyles) ⇒ Object

.localname(src, localdir) ⇒ Object

.mathml_insert_rows(math, docnamespaces) ⇒ Object

.mathml_preserve_space(math, docnamespaces) ⇒ Object

.mathml_to_ooml(docxml) ⇒ Object

.mathml_to_ooml1(xml, docnamespaces) ⇒ Object

.mime_attachment(boundary, _filename, item, dir) ⇒ Object

.mime_boundary ⇒ Object

.mime_package(result, filename, dir) ⇒ Object

.mime_preamble(boundary, filename, result) ⇒ Object

.mime_type(item) ⇒ Object

.mkuuid ⇒ Object

.msonormal(docxml) ⇒ Object

.msword_fix(doc) ⇒ Object

.namespace(root) ⇒ Object

.ooml_clean(xml) ⇒ Object

.ooxml_cleanup(math, docnamespaces) ⇒ Object

.process(result, hash) ⇒ Object

.process_footnote_link(docxml, elem, idx, footnote) ⇒ Object

.process_footnote_link1(elem) ⇒ Object

.process_footnote_texts(docxml, footnotes) ⇒ Object

.process_header(headerfile, hash) ⇒ Object

.process_html(result, hash) ⇒ Object

.progress_conv(idx, step, total, threshold, msg) ⇒ Object

.rm_temp_files(filename, dir, dir1) ⇒ Object

.rootnamespace(root) ⇒ Object

.set_footnote_link_attrs(elem, idx) ⇒ Object

.style_list(elem, level, liststyle, listnumber) ⇒ Object

.stylesheet(_filename, _header_filename, fn) ⇒ Object

.to_plane1(xml, font) ⇒ Object

.to_xhtml(xml) ⇒ Object

.transform_footnote_text(note) ⇒ Object

.uncenter(math, ooxml) ⇒ Object

.unitalic(math) ⇒ Object

.unwrap_accents(doc) ⇒ Object

.warnsvg(src) ⇒ Object