Module: Html2Doc

Defined in:: lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

NOKOHEAD =

"<!DOCTYPE html SYSTEM\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head> <title></title> <meta charset=\"UTF-8\" /> </head>\n<body> </body> </html>\n".freeze

DOCTYPE =

"<!DOCTYPE html SYSTEM \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n".freeze

PRINT_VIEW =

"<!--[if gte mso 9]>\n<xml>\n<w:WordDocument>\n<w:View>Print</w:View>\n<w:Zoom>100</w:Zoom>\n<w:DoNotOptimizeForBrowser/>\n</w:WordDocument>\n</xml>\n<![endif]-->\n<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\"/>\n".freeze

IMAGE_PATH =

"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze

TOPLIST =

"[not(ancestor::ul) and not(ancestor::ol)]".freeze

FN =

"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze

VERSION =

"1.1.1".freeze

Class Method Summary collapse

.add_stylesheet(head, title, css) ⇒ Object
.asciimath_to_mathml(doc, delims) ⇒ Object
.asciimath_to_mathml1(expr) ⇒ Object
.bookmarks(docxml) ⇒ Object
.cleanup(docxml, hash) ⇒ Object
.clear_dir(dir) ⇒ Object
.contentid(mhtml) ⇒ Object
.create_dir(filename, dir) ⇒ Object
.define_head(docxml, hash) ⇒ Object
.define_head1(docxml, dir) ⇒ Object
.esc_space(xml) ⇒ Object

escape space as &#x32;; we are removing any spaces generated by XML indentation.
.filename_substitute(head, header_filename) ⇒ Object
.footnote?(elem) ⇒ Boolean
.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p).
.footnote_container(docxml, idx) ⇒ Object
.footnote_div_to_p(elem) ⇒ Object
.footnotes(docxml) ⇒ Object
.from_xhtml(xml) ⇒ Object
.generate_filelist(filename, dir) ⇒ Object
.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>.
.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object
.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images.
.image_resize(i, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680.
.list2para(u) ⇒ Object
.list_add(xpath, liststyles, listtype, level) ⇒ Object
.list_add1(li, liststyles, listtype, level) ⇒ Object
.lists(docxml, liststyles) ⇒ Object
.lists1(docxml, liststyles, k) ⇒ Object
.lists_unstyled(docxml, liststyles) ⇒ Object
.mathml_insert_rows(math, docnamespaces) ⇒ Object
.mathml_preserve_space(math, docnamespaces) ⇒ Object
.mathml_to_ooml(docxml) ⇒ Object
.mime_attachment(boundary, _filename, item, dir) ⇒ Object
.mime_boundary ⇒ Object
.mime_package(result, filename, dir) ⇒ Object
.mime_preamble(boundary, filename, result) ⇒ Object
.mime_type(item) ⇒ Object
.mkuuid ⇒ Object
.msonormal(docxml) ⇒ Object
.msword_fix(doc) ⇒ Object
.namespace(root) ⇒ Object
.ooxml_cleanup(math, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly.
.process(result, hash) ⇒ Object
.process_footnote_link(docxml, elem, idx, footnote) ⇒ Object
.process_footnote_texts(docxml, footnotes) ⇒ Object
.process_header(headerfile, hash) ⇒ Object
.process_html(result, hash) ⇒ Object
.rm_temp_files(filename, dir, dir1) ⇒ Object
.rootnamespace(root) ⇒ Object
.set_footnote_link_attrs(elem, idx) ⇒ Object
.style_list(li, level, liststyle, listnumber) ⇒ Object
.stylesheet(filename, header_filename, fn) ⇒ Object
.to_plane1(xml, font) ⇒ Object
.to_xhtml(xml) ⇒ Object
.transform_footnote_text(note) ⇒ Object
.uncenter(math, ooxml) ⇒ Object

If the OOXML has no siblings, it is centered by default; override this with left/right if the parent is so tagged.
.unitalic(math) ⇒ Object
.unwrap_accents(doc) ⇒ Object
.warnsvg(src) ⇒ Object

Class Method Details

.add_stylesheet(head, title, css) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 169

# Inserts the stylesheet markup +css+ into +head+:
# * as the only child when +head+ has no children,
# * directly after +title+ when a title node was supplied,
# * otherwise in front of the first existing child of +head+.
# Returns whatever the underlying insertion call returns.
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return title.add_next_sibling(css) unless title.nil?

  head.children.first.add_previous_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 23

# Replaces every delimited AsciiMath expression in the string +doc+ with the
# result of asciimath_to_mathml1. +delims+ holds the opening and closing
# delimiter; when it is nil or has fewer than two entries +doc+ is returned
# untouched. The delimiters themselves are dropped from the output.
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Splitting on a capturing group keeps the delimiters, so the pieces come in
  # groups of four: text, opening delimiter, expression, closing delimiter.
  pieces = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  rendered = pieces.each_slice(4).each_with_index.map do |chunk, idx|
    # Progress report for very large inputs only.
    if idx.positive? && (idx % 500).zero? && pieces.size > 1000
      warn "MathML #{idx} of #{(pieces.size / 4).floor}"
    end
    chunk[2] = asciimath_to_mathml1(chunk[2]) unless chunk[2].nil?
    chunk.size > 1 ? chunk[0] + chunk[2] : chunk[0]
  end
  rendered.join
end

.asciimath_to_mathml1(expr) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 12

# Converts one AsciiMath expression (HTML entities are decoded first) into a
# MathML string whose <math> element carries the MathML namespace.
# On failure the offending expression and the error message are printed and
# the error is raised again.
def self.asciimath_to_mathml1(expr)
  builder = AsciiMath::MathMLBuilder.new(msword: true)
  ast = AsciiMath.parse(HTMLEntities.new.decode(expr)).ast
  mathml = builder.append_expression(ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
rescue StandardError => e
  puts "parsing: #{expr}"
  puts e.message
  raise e
end

.bookmarks(docxml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 192

# Turns element ids into named anchors: every element that has an id, no name
# attribute and is not a footnote container gets an <a name='ID'></a> inserted
# as its first child, and its id attribute is removed.
# Elements with an empty id and VML shape/shapetype elements are skipped.
def self.bookmarks(docxml)
  skipped_names = %w(shapetype v:shapetype shape v:shape)
  candidates =
    docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
  candidates.each do |node|
    next if node["id"].empty? || skipped_names.include?(node.name)

    anchor = "<a name='#{node['id']}'></a>"
    if node.children.empty?
      node.add_child(anchor)
    else
      node.children.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 54

# Runs the document clean-up passes over +docxml+ in a fixed order
# (namespaces, images, maths, lists, footnotes, bookmarks, default paragraph
# class) and returns the same document object.
# +hash+ supplies :dir1, :filename and :liststyles.
def self.cleanup(docxml, hash)
  docxml.tap do |xml|
    namespace(xml.root)
    image_cleanup(xml, hash[:dir1], File.dirname(hash[:filename]))
    mathml_to_ooml(xml)
    lists(xml, hash[:liststyles])
    footnotes(xml)
    bookmarks(xml)
    msonormal(xml)
  end
end

.clear_dir(dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 27

# Deletes every entry found directly inside +dir+ (File.delete is used, so
# entries are expected to be plain files) and returns +dir+.
def self.clear_dir(dir)
  Dir.foreach(dir) do |entry|
    next if entry == "." || entry == ".."

    File.delete(File.join(dir, entry))
  end
  dir
end

.contentid(mhtml) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 69

# Rewrites the src of <img> and <v:imagedata> tags in the string +mhtml+ to
# "cid:<basename>", leaving data: and http(s): sources untouched.
# Returns the rewritten string.
def self.contentid(mhtml)
  patterns = [
    %r{(<img[^>]*?src=")([^\"']+)(['"])}m,
    %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m,
  ]
  patterns.reduce(mhtml) do |text, pattern|
    text.gsub(pattern) do |whole|
      lead, src, quote = Regexp.last_match.captures
      rewritten = "#{lead}cid:#{File.basename(src)}#{quote}"
      /^data:|^https?:/.match(src) ? whole : rewritten
    end
  end
end

.create_dir(filename, dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 35

# Prepares the directory that will hold the document's auxiliary files and
# returns its path.
#
# When +dir+ is given it is emptied via clear_dir and returned. Otherwise
# "<filename>_files" is created if it does not exist yet, emptied and returned.
def self.create_dir(filename, dir)
  return clear_dir(dir) if dir

  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  Dir.mkdir(dir) unless File.exist?(dir)
  clear_dir(dir)
end

.define_head(docxml, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 159

# Populates the <head> of +docxml+: inserts the stylesheet, rewrites header
# file references, adds the print-view/file-list block and finally sets the
# default namespace on the root element.
# +hash+ supplies :filename, :header_file, :stylesheet and :dir1.
def self.define_head(docxml, hash)
  head = docxml.at("//*[local-name() = 'head']")
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])

  add_stylesheet(head, title, css)
  filename_substitute(head, hash[:header_file])
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, dir) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 130

# Inserts the PRINT_VIEW block followed by the File-List link in front of the
# first child of every <head> element in +docxml+.
# +dir+ is not used by this method.
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    h.children.first.add_previous_sibling <<~XML
      #{PRINT_VIEW}
        <link rel="File-List" href="cid:filelist.xml"/>
    XML
  end
end

.esc_space(xml) ⇒ `Object`

escape space as &#x32;; we are removing any spaces generated by XML indentation

# File 'lib/html2doc/math.rb', line 147

# Visits every node of +xml+ and returns +xml+.
#
# NOTE(review): for text nodes the space-escaped string is computed but never
# written back to the node, so as written the document is left unchanged.
# Also, "&#x32;" is the character "2", not a space. Confirm the intended
# behaviour with the maintainers before changing this; callers currently
# receive the document unmodified.
def self.esc_space(xml)
  xml.traverse do |node|
    node.text.gsub(/ /, "&#x32;") if node.text?
  end
  xml
end

.filename_substitute(head, header_filename) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 139

# Within every <style> element under +head+, replaces each url("...")
# reference that mentions FILENAME with url(cid:header.html).
# Does nothing (returns nil) when +header_filename+ is nil.
def self.filename_substitute(head, header_filename)
  return if header_filename.nil?

  head.xpath(".//*[local-name() = 'style']").each do |style|
    rewritten = style.to_xml.gsub(/url\("[^"]+"\)/) do |reference|
      reference.include?("FILENAME") ? "url(cid:header.html)" : reference
    end
    style.replace(rewritten)
  end
end

.footnote?(elem) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/html2doc/notes.rb', line 82

# True when the element's epub:type or class attribute equals "footnote",
# ignoring ASCII case. Returns nil or false otherwise.
def self.footnote?(elem)
  by_epub_type = elem["epub:type"]&.casecmp("footnote")&.zero?
  return by_epub_type if by_epub_type

  elem["class"]&.casecmp("footnote")&.zero?
end

.footnote_cleanup(docxml) ⇒ `Object`

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem

# File 'lib/html2doc/notes.rb', line 99

# For every <a> that is a direct child of a footnote <div>, moves the anchor
# in front of the first child of the element that follows it. Anchors with no
# following element (or whose follower has no children) are left in place.
# Returns +docxml+.
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    target = anchor.next_element&.children&.first
    target&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(docxml, idx) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 40

# Builds the markup of the footnote container <div> for footnote number +idx+.
# The back-reference anchor shows the children of the link that points at
# "#_ftn<idx>" in +docxml+ (serialised without indentation), or the FN
# placeholder when no such link is found.
def self.footnote_container(docxml, idx)
  ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
    &.gsub(/>\n</, "><") || FN
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{idx}'>
      <a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
         name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
  DIV
end

.footnote_div_to_p(elem) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 26

# Normalises a footnote text container. Only <div> and <aside> elements are
# touched: one that already contains a <p> is replaced by its children,
# otherwise the element itself becomes a <p class="MsoFootnoteText">.
def self.footnote_div_to_p(elem)
  return unless %w{div aside}.include?(elem.name)
  return elem.replace(elem.children) if elem.at(".//p")

  elem.name = "p"
  elem["class"] = "MsoFootnoteText"
end

.footnotes(docxml) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 4

# Processes every <a> in +docxml+ as a potential footnote link, numbering the
# ones that process_footnote_link accepts from 1 upwards, then hands the
# collected footnote texts to process_footnote_texts.
def self.footnotes(docxml)
  counter = 1
  collected = []
  docxml.xpath("//a").each do |link|
    counter += 1 if process_footnote_link(docxml, link, counter, collected)
  end
  process_footnote_texts(docxml, collected)
end

.from_xhtml(xml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 86

# Serialises +xml+ and strips the first XHTML namespace declaration, the
# DOCTYPE line and the space in front of every self-closing "/>".
def self.from_xhtml(xml)
  markup = xml.to_xml
  markup = markup.sub(' xmlns="http://www.w3.org/1999/xhtml"', "")
  markup = markup.sub(DOCTYPE, "")
  markup.gsub(" />", "/>")
end

.generate_filelist(filename, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 144

# Writes filelist.xml into +dir+: a MainFile entry for "../<filename>.htm"
# followed by one File entry per non-hidden directory entry, sorted by name.
# The listing is read after filelist.xml has been opened for writing, so the
# file lists itself as well.
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write "<xml xmlns:o=\"urn:schemas-microsoft-com:office:office\">\n" \
              "      <o:MainFile HRef=\"../#{filename}.htm\"/>"
    visible = Dir.entries(dir).sort.reject { |item| /^\./.match(item) }
    visible.each { |item| out.write %{  <o:File HRef="#{item}"/>\n} }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ `Object`

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>

# File 'lib/html2doc/mime.rb', line 123

# Processes the header markup +doc+ as plain text: it is cut at every <img ...>
# and <v:imagedata ...> tag, each (text, tag) pair is passed through
# header_image_cleanup1, and the results are concatenated.
def self.header_image_cleanup(doc, dir, filename, localdir)
  image_tag = %r{(<img [^>]*>|<v:imagedata [^>]*>)}
  pairs = doc.split(image_tag).each_slice(2)
  pairs.map { |pair| header_image_cleanup1(pair, dir, filename, localdir) }.join
end

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 129

# Handles one (text, image tag) pair from the header markup.
#
# +a+ is a one- or two-element array; when a second element is present it is
# an image tag. For a tag whose src is neither an http(s) URL nor a base64
# data URI, the referenced file is copied into +dir+ under a fresh UUID-based
# name and the tag's src is rewritten in place to "cid:<new name>".
# Sources starting with "/" or a drive letter are used as given; anything else
# is resolved against +localdir+. Returns the pair joined into one string.
#
# A tag without a usable src attribute is returned unchanged, and a source
# without a file extension is copied under the bare UUID; both cases
# previously raised NoMethodError.
def self.header_image_cleanup1(a, dir, _filename, localdir)
  return a.join unless a.size == 2

  tag = a[1]
  return a.join if / src="https?:/.match?(tag) ||
    %r{ src="data:(image|application)/[^;]+;base64}.match?(tag)

  m = / src=['"](?<src>[^"']+)['"]/.match(tag)
  return a.join if m.nil? # no src attribute: nothing to copy or rewrite

  src = m[:src]
  #warnsvg(src)
  suffix = src[/\.([a-zA-Z_0-9]+)$/, 1]
  new_filename = suffix ? "#{mkuuid}.#{suffix}" : mkuuid
  old_filename = %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
  FileUtils.cp old_filename, File.join(dir, new_filename)
  # sub! edits the string held in +a+, so the join below sees the new src.
  tag.sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ `Object`

only processes locally stored images

# File 'lib/html2doc/mime.rb', line 104

# Copies every locally stored image referenced by an <img> or <v:imagedata>
# element of +docxml+ into +dir+ under a fresh UUID-based name, sets the
# element's width/height from image_resize (limits: height 680, width 400)
# and points its src at "<basename of dir>/<new name>".
#
# Sources starting with "http" and base64 data URIs are left alone. Sources
# starting with "/" or a drive letter are used as given; anything else is
# resolved against +localdir+. Elements with a missing or empty src are
# skipped; they previously raised (File.join on nil) or attempted to copy
# +localdir+ itself. Returns +docxml+.
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |i|
    next unless i.element? && %w(img v:imagedata).include?(i.name)

    src = i["src"]
    next if src.nil? || src.empty? # nothing to copy
    #warnsvg(src)
    next if /^http/.match?(src)
    next if %r{^data:(image|application)/[^;]+;base64}.match?(src)

    local_filename = %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
    i["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(i, path, maxheight, maxwidth) ⇒ `Object`

max width for Word document is 400, max height is 680

# File 'lib/html2doc/mime.rb', line 80

# Works out the [width, height] to use for image element +i+ whose file is at
# +path+. The element's own width/height attributes are the starting point;
# a missing dimension is derived from the file's aspect ratio, and when both
# are missing the file's real size is used. The result is then scaled down to
# fit +maxheight+ and +maxwidth+. Returns [nil, nil] when the real size of the
# file cannot be determined.
def self.image_resize(i, path, maxheight, maxwidth)
  real_size = ImageSize.path(path).size
  dims = [i["width"].to_i, i["height"].to_i]
  dims = real_size if dims[0].zero? && dims[1].zero?
  unknown = real_size.nil? || real_size[0].nil? || real_size[1].nil?
  return [nil, nil] if unknown

  # Fill in whichever dimension is missing, keeping the real aspect ratio.
  if dims[1].zero? && !dims[0].zero?
    dims[1] = dims[0] * real_size[1] / real_size[0]
  end
  if dims[0].zero? && !dims[1].zero?
    dims[0] = dims[1] * real_size[0] / real_size[1]
  end
  # Shrink to the height limit first, then to the width limit.
  if dims[1] > maxheight
    dims = [(dims[0] * maxheight / dims[1]).ceil, maxheight]
  end
  if dims[0] > maxwidth
    dims = [maxwidth, (dims[1] * maxwidth / dims[0]).ceil]
  end
  dims
end

.list2para(u) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 50

# Flattens the list +u+ into Word list paragraphs: each direct <li> becomes a
# <p> (first/last/middle list-paragraph classes are applied only where no
# class is set yet), a leading <p> inside an item is unwrapped, and finally
# the list element itself is replaced by its children.
# Lists without direct <li> children are left untouched.
def self.list2para(u)
  items = u.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  u.xpath("./li/p").each do |para|
    para["class"] ||= "MsoListParagraphCxSpMiddle"
  end
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    lead = item.first_element_child
    lead.replace(lead.children) if lead&.name == "p"
  end
  u.replace(u.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 33

# Applies Word list styling to every list in the node set +xpath+.
#
# For a top-level list (+level+ 1) the running @listnumber is incremented and
# the list is flagged with a "seen" attribute. Each list gets an id if it has
# none. Items belonging directly to the list are styled with
# liststyles[listtype] and their nested lists are handled through list_add1.
# Nested lists matched by the final query (not reached through this list's own
# items) are processed from their parent at +level+ - 1.
def self.list_add(xpath, liststyles, listtype, level)
  xpath.each do |list|
    if level == 1
      @listnumber += 1
      list["seen"] = true
    end
    list["id"] ||= UUIDTools::UUID.random_create

    own_items = list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")
    own_items.each do |item|
      style_list(item, level, liststyles[listtype], @listnumber)
      list_add1(item, liststyles, listtype, level)
    end

    detached =
      ".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
      ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]"
    list.xpath(detached).each do |sublist|
      list_add1(sublist.parent, liststyles, listtype, level - 1)
    end
  end
end

.list_add1(li, liststyles, listtype, level) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 19

# Recurses into the lists nested directly under +li+, one level deeper.
#
# When +listtype+ is one of the built-in types (:ul or :ol) each nested list
# is styled according to its own element type; for a custom list type the
# nested lists inherit +listtype+. Lists nested inside another nested list are
# excluded here; they are reached by the recursion.
def self.list_add1(li, liststyles, listtype, level)
  builtin = %i[ul ol].include?(listtype)
  list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
           liststyles, builtin ? :ul : listtype, level + 1)
  list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
           liststyles, builtin ? :ol : listtype, level + 1)
end

.lists(docxml, liststyles) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 94

# Entry point for list styling. Does nothing when +liststyles+ is nil.
# Otherwise resets the list counter, styles the lists for every configured
# style key, then the remaining unstyled lists, and finally flattens all <ul>
# (when a :ul style exists) and all <ol> (when an :ol style exists) into
# paragraphs.
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key { |style_key| lists1(docxml, liststyles, style_key) }
  lists_unstyled(docxml, liststyles)
  if liststyles.has_key?(:ul)
    docxml.xpath("//ul").each { |list| list2para(list) }
  end
  liststyles.has_key?(:ol) &&
    docxml.xpath("//ol").each { |list| list2para(list) }
end

.lists1(docxml, liststyles, k) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 67

# Styles the top-level lists that belong to style key +k+: for :ul / :ol these
# are the class-less lists of that element type, for any other key the <ol>
# and <ul> lists whose class equals the key.
def self.lists1(docxml, liststyles, k)
  query =
    if %i[ul ol].include?(k)
      "//#{k}[not(@class)]#{TOPLIST}"
    else
      "//ol[@class = '#{k}']#{TOPLIST} | //ul[@class = '#{k}']#{TOPLIST}"
    end
  list_add(docxml.xpath(query), liststyles, k, 1)
end

.lists_unstyled(docxml, liststyles) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 80

# Styles the top-level lists that the per-style passes did not reach (those
# without a "seen" attribute), using the :ul style for <ul> and the :ol style
# for <ol> when the respective style is configured, then removes the "seen"
# bookkeeping attribute from all lists.
#
# Ordered lists were previously registered with list type :ul, which applied
# the bullet-list style to them.
def self.lists_unstyled(docxml, liststyles)
  %i[ul ol].each do |tag|
    next unless liststyles.key?(tag)

    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"),
             liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.mathml_insert_rows(math, docnamespaces) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 55

# Wraps the element that follows each script/limit construct (msup, msub,
# msubsup, munder, mover, munderover) in an <mrow>, unless that element
# already is an <mrow>. Returns +math+.
#
# The element name is compared here; the previous check compared the node
# object itself with the string "mrow", which is never equal, so existing
# <mrow> elements were wrapped again.
def self.mathml_insert_rows(math, docnamespaces)
  query = %w(msup msub msubsup munder mover munderover)
    .map { |m| ".//xmlns:#{m}" }.join(" | ")
  math.xpath(query, docnamespaces).each do |x|
    following = x.next_element
    next if following.nil? || following.name == "mrow"

    following.wrap("<mrow/>")
  end
  math
end

.mathml_preserve_space(math, docnamespaces) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 65

# In every <mtext> under +math+, replaces a whitespace character at the start
# or end of a line of the serialised content with a non-breaking-space
# character reference. Returns +math+.
def self.mathml_preserve_space(math, docnamespaces)
  math.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    content = mtext.children.to_xml
    content = content.gsub(/^\s/, "&#xA0;")
    mtext.children = content.gsub(/\s$/, "&#xA0;")
  end
  math
end

.mathml_to_ooml(docxml) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 127

# Replaces every MathML <math> element in +docxml+ with its Office Math
# rendering: the element is cleaned up, transformed with @xsltemplate,
# post-processed (unitalic, esc_space), stripped of processing instructions
# and namespace declarations, prefixed with "m:" on all elements other than
# span/em, optionally wrapped for alignment (uncenter) and swapped in.
def self.mathml_to_ooml(docxml)
  docnamespaces = docxml.collect_namespaces
  formulas = docxml.xpath("//*[local-name() = 'math']")
  formulas.each_with_index do |formula, index|
    # Progress report for documents with a very large number of formulas.
    if index.positive? && (index % 100).zero? && formulas.size > 500
      warn "Math OOXML #{index} of #{formulas.size}"
    end

    element = ooxml_cleanup(formula, docnamespaces)
    standalone = Nokogiri::XML::Document.new
    standalone.root = element

    transformed = @xsltemplate.transform(standalone)
    ooxml = unitalic(esc_space(transformed)).to_s
    ooxml = ooxml.gsub(/<\?[^>]+>\s*/, "")
    ooxml = ooxml.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    ooxml = ooxml.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")

    formula.swap(uncenter(formula, ooxml))
  end
end

.mime_attachment(boundary, _filename, item, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 23

# Builds one MIME part for the file +item+ located in +dir+.
#
# The file is read as UTF-8 text when its content type starts with "text" or
# "application" and as raw bytes otherwise, Base64-encoded, and broken into
# lines of 76 characters. Returns the part, introduced by "--<boundary>" and
# carrying Content-ID, Content-Disposition, Content-Transfer-Encoding and
# Content-Type headers.
def self.mime_attachment(boundary, _filename, item, dir)
  content_type = mime_type(item)
  text_mode = %w[text application].any? { |p| content_type.start_with? p }

  path = File.join(dir, item)
  content = text_mode ? File.read(path, encoding: "utf-8") : IO.binread(path)

  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  <<~FILE
    --#{boundary}
    Content-ID: <#{File.basename(item)}>
    Content-Disposition: inline; filename="#{File.basename(item)}"
    Content-Transfer-Encoding: base64
    Content-Type: #{content_type}

    #{encoded_file}

  FILE
end

.mime_boundary ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 50

# Returns a fresh MIME boundary string: "----=_NextPart_" followed by the
# first 18 characters of a random UUID with its dashes turned into dots.
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  "----=_NextPart_#{uuid.gsub(/-/, '.')[0..17]}"
end

.mime_package(result, filename, dir) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 55

# Assembles the MIME document and writes it to "<filename>.doc" (UTF-8):
# the preamble carrying the HTML +result+, then filelist.xml, then every other
# non-hidden file in +dir+ as attachments, then the closing boundary. The
# assembled text is passed through contentid before being written.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  htm_name = "#{filename}.htm"
  mhtml = mime_preamble(boundary, htm_name, result)
  mhtml += mime_attachment(boundary, htm_name, "filelist.xml", dir)

  # filelist.xml has already been attached above; hidden entries are skipped.
  others = Dir.foreach(dir).reject do |item|
    /^\./.match(item) || item == "filelist.xml"
  end
  others.each do |item|
    mhtml += mime_attachment(boundary, htm_name, item, dir)
  end

  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |out| out.write contentid(mhtml) }
end

.mime_preamble(boundary, filename, result) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 8

# Builds the start of the MIME document: the multipart/related header that
# declares +boundary+, followed by the first part, which carries the HTML
# +result+ under the base name of +filename+.
def self.mime_preamble(boundary, filename, result)
  <<~PREAMBLE
    MIME-Version: 1.0
    Content-Type: multipart/related; boundary="#{boundary}"

    --#{boundary}
    Content-ID: <#{File.basename(filename)}>
    Content-Disposition: inline; filename="#{File.basename(filename)}"
    Content-Type: text/html; charset="utf-8"

    #{result}

  PREAMBLE
end

.mime_type(item) ⇒ `Object`

# File 'lib/html2doc/mime.rb', line 43

# Returns the content type to declare for the file name +item+.
#
# The first type reported by MIME::Types.type_for is used; text types get a
# charset="utf-8" suffix. When no type is known the plain-text default is
# returned. The lookup result is a list, which is always truthy, so the
# previous truthiness check never reached the default and unknown extensions
# produced an empty content type.
#
# NOTE(review): the charset suffix is appended without a ";" separator, unlike
# the default string. This is kept unchanged because existing output may be
# relied upon; confirm whether it should read "; charset=...".
def self.mime_type(item)
  types = MIME::Types.type_for(item)
  return 'text/plain; charset="utf-8"' if types.nil? || types.empty?

  type = types.first.to_s
  type += ' charset="utf-8"' if /^text/.match?(type)
  type
end

.mkuuid ⇒ `Object`



95
96
97

# File 'lib/html2doc/mime.rb', line 95

# Returns a newly generated random UUID as a string.
def self.mkuuid
  UUIDTools::UUID.random_create.to_s
end

.msonormal(docxml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 205

# Gives every <p> and every <li> that has no class attribute the class
# "MsoNormal". Paragraphs are processed first, list items second.
def self.msonormal(docxml)
  processed = %w(p li).map do |tag|
    query = "//*[local-name() = '#{tag}'][not(self::*[@class])]"
    docxml.xpath(query).each { |node| node["class"] = "MsoNormal" }
  end
  processed.last
end

.msword_fix(doc) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 92

def self.msword_fix(doc)
  # brain damage in MSWord parser
  doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
            '<span style="mso-special-character:footnote"></span>')
  doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
            '<div style="mso-element:footnote-list"/>')
  doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
  doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
  doc.gsub!(%r{<meta http-equiv="Content-Type"},
            "<meta http-equiv=Content-Type")
  doc.gsub!(%r{></m:jc>}, "/>")
  doc.gsub!(%r{></v:stroke>}, "/>")
  doc.gsub!(%r{></v:f>}, "/>")
  doc.gsub!(%r{></v:path>}, "/>")
  doc.gsub!(%r{></o:lock>}, "/>")
  doc.gsub!(%r{></v:imagedata>}, "/>")
  doc.gsub!(%r{></w:wrap>}, "/>")
  doc.gsub!(%r{&tab;|&amp;tab;},
            '<span style="mso-tab-count:1">&#xA0; </span>')
  doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
    a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
    a
  end.join
end

.namespace(root) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 179

# Declares the Microsoft Office namespaces (prefixes o, w, v and m) on +root+.
# Returns the prefix-to-URI table.
def self.namespace(root)
  office_namespaces = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  office_namespaces.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooxml_cleanup(math, docnamespaces) ⇒ `Object`

random fixes to MathML input that OOXML needs to render properly

# File 'lib/html2doc/math.rb', line 45

# Prepares a MathML element for conversion: inserts missing rows, preserves
# leading/trailing space in text, unwraps accents, and sets the MathML
# namespace as the element's default namespace. Returns the element.
def self.ooxml_cleanup(math, docnamespaces)
  with_rows = mathml_insert_rows(math, docnamespaces)
  with_spaces = mathml_preserve_space(with_rows, docnamespaces)
  cleaned = unwrap_accents(with_spaces)
  cleaned.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  cleaned
end

.process(result, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 8

# Top-level conversion: prepares the auxiliary directory (stored in
# hash[:dir1]), converts the HTML +result+, processes the optional header
# file, writes the file list and "<filename>.htm", packages everything into
# "<filename>.doc", and removes the temporary files unless hash[:debug] is set.
def self.process(result, hash)
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  html = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(hash[:filename], hash[:dir1])
  File.open("#{hash[:filename]}.htm", "w:UTF-8") { |out| out.write(html) }
  mime_package(html, hash[:filename], hash[:dir1])
  return if hash[:debug]

  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1])
end

.process_footnote_link(docxml, elem, idx, footnote) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 50

# Converts one footnote link.
#
# Returns false when +elem+ is not a footnote link, has no href, or when its
# target (an element whose name or id equals the href without the leading
# "#") cannot be found in +docxml+. Otherwise the link receives the Word
# footnote attributes for number +idx+, its content is replaced by the FN
# placeholder (content alongside an existing MsoFootnoteReference span is
# wrapped in such a span instead), and the transformed footnote text is
# appended to +footnote+, which is returned.
#
# A footnote-classed anchor without an href previously raised NoMethodError.
def self.process_footnote_link(docxml, elem, idx, footnote)
  return false unless footnote?(elem)

  href = elem["href"]
  return false if href.nil? # no target to look up

  href = href.gsub(/^#/, "")
  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(elem, idx)
  if elem.at("./span[@class = 'MsoFootnoteReference']")
    elem.children.each do |c|
      if c.name == "span" && c["class"] == "MsoFootnoteReference"
        c.replace(FN)
      else
        c.wrap("<span class='MsoFootnoteReference'></span>")
      end
    end
  else
    elem.children = FN
  end
  footnote << transform_footnote_text(note)
end

.process_footnote_texts(docxml, footnotes) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 15

# Appends a footnote-list <div> to the body of +docxml+ and, for each
# collected footnote text (numbered from 1), adds a footnote container to it,
# moves the text into that container and normalises it with
# footnote_div_to_p. Finishes with footnote_cleanup.
def self.process_footnote_texts(docxml, footnotes)
  body = docxml.at("//body")
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(docxml, number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 18

# Reads the header file (UTF-8), runs it through header_image_cleanup and
# writes the result to "<hash[:dir1]>/header.html".
# Does nothing when +headerfile+ is nil.
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  localdir = File.dirname(hash[:filename])
  source = File.read(headerfile, encoding: "utf-8")
  cleaned = header_image_cleanup(source, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |out| out.write(cleaned) }
end

.process_html(result, hash) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 42

# Converts the HTML string +result+ into Word-flavoured markup: AsciiMath is
# turned into MathML, the text is parsed, the document is cleaned up and given
# its head section, then serialised and passed through msword_fix.
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  msword_fix(from_xhtml(docxml))
end

.rm_temp_files(filename, dir, dir1) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 48

# Removes the intermediate files: "<filename>.htm" and, if present,
# "<dir1>/header.html". The working directory +dir1+ itself is removed only
# when no directory was supplied by the caller (+dir+ is nil/false).
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm("#{filename}.htm")
  FileUtils.rm_f("#{dir1}/header.html")
  return if dir

  FileUtils.rm_r(dir1)
end

.rootnamespace(root) ⇒ `Object`



188
189
190

# File 'lib/html2doc/base.rb', line 188

# Sets "http://www.w3.org/TR/REC-html40" as the default (unprefixed)
# namespace of +root+.
def self.rootnamespace(root)
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end

.set_footnote_link_attrs(elem, idx) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 87

# Sets the Word footnote-reference attributes for footnote number +idx+ on
# the link +elem+: style, href, name, and an empty title.
def self.set_footnote_link_attrs(elem, idx)
  {
    "style" => "mso-footnote-id:ftn#{idx}",
    "href" => "#_ftn#{idx}",
    "name" => "_ftnref#{idx}",
  }.each { |attribute, value| elem[attribute] = value }
  elem["title"] = ""
end

.style_list(li, level, liststyle, listnumber) ⇒ `Object`

# File 'lib/html2doc/lists.rb', line 8

# Appends the Word list declaration "mso-list:<liststyle> level<level>
# lfo<listnumber>;" to the style attribute of +li+, separating it from any
# existing style with ";". Does nothing when +liststyle+ is nil/false.
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  existing = li["style"] ? "#{li['style']};" : ""
  li["style"] = "#{existing}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(filename, header_filename, fn) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 150

# Returns a <style> element, as a string, whose content is the stylesheet
# file +fn+ wrapped in an XML comment. When +fn+ is nil or empty the
# wordstyle.css file next to this source file is used.
# +filename+ and +header_filename+ are not used by this method.
def self.stylesheet(filename, header_filename, fn)
  if fn.nil? || fn.empty?
    fn = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  css = File.read(fn, encoding: "UTF-8")
  doc = Nokogiri::XML("<style/>")
  doc.children.first << Nokogiri::XML::Comment.new(doc, "\n#{css}\n")
  doc.root.to_s
end

.to_plane1(xml, font) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 118

# Replaces every text node under +xml+ with the result of
# Plane1Converter.conv for its entity-decoded text and the given +font+.
# Returns +xml+.
def self.to_plane1(xml, font)
  xml.traverse do |node|
    if node.text?
      decoded = HTMLEntities.new.decode(node.text)
      node.replace(Plane1Converter.conv(decoded, font))
    end
  end
  xml
end

.to_xhtml(xml) ⇒ `Object`

# File 'lib/html2doc/base.rb', line 73

# Parses the markup string +xml+ with Nokogiri after removing any XML
# declaration and, when no DOCTYPE is present, prepending the XHTML strict
# DOCTYPE.
#
# The input string is not modified: a non-mutating gsub is used, so the
# caller's string keeps its XML declaration and frozen strings are accepted
# (the previous in-place gsub! raised FrozenError on them).
def self.to_xhtml(xml)
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match? xml
    xml = '<!DOCTYPE html SYSTEM
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end

.transform_footnote_text(note) ⇒ `Object`

# File 'lib/html2doc/notes.rb', line 72

# Prepares a footnote text node: blanks its id, unwraps every nested <div>,
# turns every nested <aside> and <p> into <p class="MsoFootnoteText">, and
# detaches the node from the document. Returns the result of the removal.
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |wrapper|
    wrapper.replace(wrapper.children)
  end
  note.xpath(".//aside | .//p").each do |block|
    block.name = "p"
    block["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(math, ooxml) ⇒ `Object`

If the OOXML has no siblings, it is centered by default; override this with left/right if the parent is so tagged

# File 'lib/html2doc/math.rb', line 158

# Wraps the Office Math markup +ooxml+ in an <m:oMathPara> with left or right
# justification when +math+ has no siblings and a styled p/div/td ancestor
# whose style contains "text-align:left" or "text-align:right".
# Otherwise +ooxml+ is returned unchanged.
def self.uncenter(math, ooxml)
  alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
                   "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless alignnode && math.next.nil? && math.previous.nil?

  %w(left right).reduce(ooxml) do |markup, dir|
    next markup unless alignnode.text.include?("text-align:#{dir}")

    "<m:oMathPara><m:oMathParaPr><m:jc "\
      "m:val='#{dir}'/></m:oMathParaPr>#{markup}</m:oMathPara>"
  end
end

.unitalic(math) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 72

# Post-processes Office Math runs according to their run properties.
#
# Runs without a script property are wrapped in HTML spans reflecting their
# style value (p, bi, i, b). Runs with a script property have their text
# converted with to_plane1, using the font chosen from the combination of
# script and style. The rules are applied in the order listed. Returns +math+.
def self.unitalic(math)
  # [style value, wrapper markup] for runs that have no script property.
  wrappers = [
    ["p", "<span style='font-style:normal;'></span>"],
    ["bi", "<span class='nostem' style='font-weight:bold;'><em></em></span>"],
    ["i", "<span class='nostem'><em></em></span>"],
    ["b", "<span style='font-style:normal;font-weight:bold;'></span>"],
  ]
  wrappers.each do |style, wrapper|
    query = ".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = '#{style}']]"
    math.xpath(query).each { |run| run.wrap(wrapper) }
  end

  # Predicates on the run properties element, by style.
  any_style = ""
  plain = "[not(xmlns:sty) or xmlns:sty/@m:val = 'p']"
  styled = ->(value) { "[xmlns:sty/@m:val = '#{value}']" }

  # [run-properties predicate, script value, target font]
  fonts = [
    [any_style, "monospace", :monospace],
    [any_style, "double-struck", :doublestruck],
    [plain, "script", :script],
    [styled.call("b"), "script", :scriptbold],
    [plain, "fraktur", :fraktur],
    [styled.call("b"), "fraktur", :frakturbold],
    [plain, "sans-serif", :sans],
    [styled.call("b"), "sans-serif", :sansbold],
    [styled.call("i"), "sans-serif", :sansitalic],
    [styled.call("bi"), "sans-serif", :sansbolditalic],
  ]
  fonts.each do |predicate, script, font|
    query = ".//xmlns:r[xmlns:rPr#{predicate}/xmlns:scr[@m:val = '#{script}']]"
    math.xpath(query).each { |run| to_plane1(run, font) }
  end
  math
end

.unwrap_accents(doc) ⇒ `Object`

# File 'lib/html2doc/math.rb', line 35

# For every element with accent="true" that has more than one child element,
# replaces the second child element by its own children when it is an <mrow>.
# Returns +doc+.
def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next unless accented.elements.length > 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end

.warnsvg(src) ⇒ `Object`



99
100
101

# File 'lib/html2doc/mime.rb', line 99

# Emits a warning when +src+ ends in ".svg" (case-insensitive).
def self.warnsvg(src)
  return unless /\.svg$/i.match?(src)

  warn "#{src}: SVG not supported"
end

Module: Html2Doc

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object

.asciimath_to_mathml(doc, delims) ⇒ Object

.asciimath_to_mathml1(expr) ⇒ Object

.bookmarks(docxml) ⇒ Object

.cleanup(docxml, hash) ⇒ Object

.clear_dir(dir) ⇒ Object

.contentid(mhtml) ⇒ Object

.create_dir(filename, dir) ⇒ Object

.define_head(docxml, hash) ⇒ Object

.define_head1(docxml, dir) ⇒ Object

.esc_space(xml) ⇒ Object

.filename_substitute(head, header_filename) ⇒ Object

.footnote?(elem) ⇒ Boolean

.footnote_cleanup(docxml) ⇒ Object

.footnote_container(docxml, idx) ⇒ Object

.footnote_div_to_p(elem) ⇒ Object

.footnotes(docxml) ⇒ Object

.from_xhtml(xml) ⇒ Object

.generate_filelist(filename, dir) ⇒ Object

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object

.image_cleanup(docxml, dir, localdir) ⇒ Object

.image_resize(i, path, maxheight, maxwidth) ⇒ Object

.list2para(u) ⇒ Object

.list_add(xpath, liststyles, listtype, level) ⇒ Object

.list_add1(li, liststyles, listtype, level) ⇒ Object

.lists(docxml, liststyles) ⇒ Object

.lists1(docxml, liststyles, k) ⇒ Object

.lists_unstyled(docxml, liststyles) ⇒ Object

.mathml_insert_rows(math, docnamespaces) ⇒ Object

.mathml_preserve_space(math, docnamespaces) ⇒ Object

.mathml_to_ooml(docxml) ⇒ Object

.mime_attachment(boundary, _filename, item, dir) ⇒ Object

.mime_boundary ⇒ Object

.mime_package(result, filename, dir) ⇒ Object

.mime_preamble(boundary, filename, result) ⇒ Object

.mime_type(item) ⇒ Object

.mkuuid ⇒ Object

.msonormal(docxml) ⇒ Object

.msword_fix(doc) ⇒ Object

.namespace(root) ⇒ Object

.ooxml_cleanup(math, docnamespaces) ⇒ Object

.process(result, hash) ⇒ Object

.process_footnote_link(docxml, elem, idx, footnote) ⇒ Object

.process_footnote_texts(docxml, footnotes) ⇒ Object

.process_header(headerfile, hash) ⇒ Object

.process_html(result, hash) ⇒ Object

.rm_temp_files(filename, dir, dir1) ⇒ Object

.rootnamespace(root) ⇒ Object

.set_footnote_link_attrs(elem, idx) ⇒ Object

.style_list(li, level, liststyle, listnumber) ⇒ Object

.stylesheet(filename, header_filename, fn) ⇒ Object

.to_plane1(xml, font) ⇒ Object

.to_xhtml(xml) ⇒ Object

.transform_footnote_text(note) ⇒ Object

.uncenter(math, ooxml) ⇒ Object

.unitalic(math) ⇒ Object

.unwrap_accents(doc) ⇒ Object

.warnsvg(src) ⇒ Object