Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

# Skeleton XHTML document: strict doctype, namespaced <html>, an empty
# <title>, a UTF-8 charset <meta>, and an empty <body>.
NOKOHEAD =
<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE
# XHTML 1.0 Strict doctype declaration (with trailing newline); from_xhtml
# strips this exact string from the serialised document.
DOCTYPE =
<<~"DOCTYPE".freeze
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
DOCTYPE
# Word-specific <head> markup: a conditional-comment XML island selecting
# Print view at 100% zoom, followed by the Content-Type <meta>.
# define_head1 interpolates this constant at the top of every <head>.
PRINT_VIEW =
<<~XML.freeze
  <!--[if gte mso 9]>
  <xml>
  <w:WordDocument>
  <w:View>Print</w:View>
  <w:Zoom>100</w:Zoom>
  <w:DoNotOptimizeForBrowser/>
  </w:WordDocument>
  </xml>
  <![endif]-->
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
XML
# XHTML namespace declaration, written as an attribute string.
HTML_NS =
'xmlns="http://www.w3.org/1999/xhtml"'.freeze
# XPath matching img and imagedata elements regardless of namespace.
IMAGE_PATH =
"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
# XPath predicate selecting nodes that have no ul or ol ancestor.
TOPLIST =
"[not(ancestor::ul) and not(ancestor::ol)]".freeze
# Markup for a Word footnote reference mark.
FN =
"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze
# Version string.
VERSION =
"1.1.3".freeze

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object



170
171
172
173
174
175
176
177
178
# File 'lib/html2doc/base.rb', line 170

# Inserts the stylesheet markup +css+ into +head+:
# * as the only child when the head is empty,
# * before the first child when there is no title,
# * otherwise immediately after +title+.
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return head.children.first.add_previous_sibling(css) if title.nil?

  title.add_next_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ Object



23
24
25
26
27
28
29
30
31
32
# File 'lib/html2doc/math.rb', line 23

# Replaces every AsciiMath expression in +doc+ (a String) with the markup
# returned by asciimath_to_mathml1.
#
# doc    - source text
# delims - two-element array: opening and closing delimiter strings
#
# Returns +doc+ itself when +delims+ is nil or has fewer than two entries;
# otherwise a new String with the delimiters dropped and each delimited
# expression converted. A trailing opening delimiter with nothing after it
# is dropped rather than raising.
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Splitting on a capturing group keeps the delimiters, so the pieces come
  # in groups of four: text, opening delimiter, expression, closing delimiter.
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  total = m.size / 4
  m.each_slice(4).map.with_index do |(text, _open, expr, _close), i|
    progress_conv(i, 500, total, 1000, "AsciiMath")
    # expr is nil for the trailing text slice and for an unterminated
    # opening delimiter; in both cases only the plain text is kept.
    expr.nil? ? text : text + asciimath_to_mathml1(expr)
  end.join
end

.asciimath_to_mathml1(expr) ⇒ Object



12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/math.rb', line 12

# Converts a single AsciiMath expression to a MathML string.
# HTML entities in +expr+ are decoded before parsing, and the bare <math>
# start tag is given the MathML namespace. On any StandardError the
# expression and error message are printed and the error is re-raised.
def self.asciimath_to_mathml1(expr)
  builder = AsciiMath::MathMLBuilder.new(msword: true)
  decoded = HTMLEntities.new.decode(expr)
  mathml = builder.append_expression(AsciiMath.parse(decoded).ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
rescue StandardError => e
  puts "parsing: #{expr}"
  puts e.message
  raise e
end

.bookmarks(docxml) ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/html2doc/base.rb', line 193

# Turns element ids into named anchors: every element with a non-empty id,
# no name attribute and not styled as a footnote container gets an
# <a name='ID'></a> as its first child and loses its id attribute.
# VML shape/shapetype elements are left alone.
def self.bookmarks(docxml)
  vml_names = %w(shapetype v:shapetype shape v:shape)
  candidates =
    docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
  candidates.each do |node|
    next if node["id"].empty? || vml_names.include?(node.name)

    anchor = "<a name='#{node['id']}'></a>"
    if node.children.empty?
      node.add_child(anchor)
    else
      node.children.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ Object



54
55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/base.rb', line 54

# Runs the clean-up passes over the parsed document, in this order:
# namespace declarations, image copying, MathML conversion, list styling,
# footnotes, bookmarks, and default paragraph classes.
#
# docxml - parsed document (mutated in place)
# hash   - options; reads :dir1, :filename and :liststyles
#
# Returns docxml.
def self.cleanup(docxml, hash)
  namespace(docxml.root)
  image_cleanup(docxml, hash[:dir1], File.dirname(hash[:filename]))
  mathml_to_ooml(docxml)
  lists(docxml, hash[:liststyles])
  footnotes(docxml)
  bookmarks(docxml)
  msonormal(docxml)
  docxml
end

.clear_dir(dir) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/html2doc/base.rb', line 27

# Deletes every entry of +dir+ (other than "." and "..") with File.delete
# and returns +dir+.
def self.clear_dir(dir)
  Dir.entries(dir).each do |entry|
    next if [".", ".."].include?(entry)

    File.delete(File.join(dir, entry))
  end
  dir
end

.contentid(mhtml) ⇒ Object



69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/mime.rb', line 69

# Rewrites double-quoted src attributes of <img> and <v:imagedata> tags in
# +mhtml+ to "cid:<basename>" references. Sources starting with data:,
# http: or https: are left unchanged.
def self.contentid(mhtml)
  external = /^data:|^https?:/
  with_imgs = mhtml.gsub(%r{(<img[^>]*?src=")([^\"']+)(['"])}m) do |whole|
    lead, src, quote = Regexp.last_match.captures
    external.match(src) ? whole : "#{lead}cid:#{File.basename(src)}#{quote}"
  end
  with_imgs.gsub(%r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m) do |whole|
    lead, src, quote = Regexp.last_match.captures
    external.match(src) ? whole : "#{lead}cid:#{File.basename(src)}#{quote}"
  end
end

.create_dir(filename, dir) ⇒ Object



35
36
37
38
39
40
# File 'lib/html2doc/base.rb', line 35

# Prepares the directory that holds the document's auxiliary files.
#
# filename - output file stem; used to derive "<filename>_files"
# dir      - explicit directory, or nil to use the derived one
#
# When +dir+ is given it is only emptied; otherwise "<filename>_files" is
# created if missing and then emptied. Returns the directory path.
def self.create_dir(filename, dir)
  return clear_dir(dir) if dir

  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  Dir.mkdir(dir) unless File.exist?(dir)
  clear_dir(dir)
end

.define_head(docxml, hash) ⇒ Object



160
161
162
163
164
165
166
167
168
# File 'lib/html2doc/base.rb', line 160

# Populates the document <head>: inserts the stylesheet, rewrites header
# file URLs in <style> elements, prepends the Word-specific head markup,
# and finally resets the root element's default namespace.
#
# docxml - parsed document (mutated in place)
# hash   - options; reads :filename, :header_file, :stylesheet and :dir1
def self.define_head(docxml, hash)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  filename_substitute(head, hash[:header_file])
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, _dir) ⇒ Object



131
132
133
134
135
136
137
138
# File 'lib/html2doc/base.rb', line 131

# Prepends the Word print-view markup (PRINT_VIEW) and the File-List link
# before the first child of every <head> element. +_dir+ is unused.
def self.define_head1(docxml, _dir)
  heads = docxml.xpath("//*[local-name() = 'head']")
  heads.each do |head|
    word_markup = <<~XML
      #{PRINT_VIEW}
        <link rel="File-List" href="cid:filelist.xml"/>
    XML
    head.children.first.add_previous_sibling(word_markup)
  end
end

.esc_space(xml) ⇒ Object

escape space as &#x32;; we are removing any spaces generated by XML indentation



164
165
166
167
168
169
170
171
# File 'lib/html2doc/math.rb', line 164

# Visits every node of +xml+ and, for text nodes, computes a copy of the
# text with each space replaced by the literal string "&#x32;".
# Returns +xml+.
#
# NOTE(review): the result is only assigned to the block-local variable +n+;
# no node is updated, so as written this method returns +xml+ unchanged.
# NOTE(review): "&#x32;" is the character reference for "2" (a space would
# be "&#x20;" or "&#32;") - confirm the intended escape before making the
# substitution take effect, since downstream output relies on the current
# behaviour.
def self.esc_space(xml)
  xml.traverse do |n|
    next unless n.text?

    n = n.text.gsub(/ /, "&#x32;")
  end
  xml
end

.filename_substitute(head, header_filename) ⇒ Object



140
141
142
143
144
145
146
147
148
149
# File 'lib/html2doc/base.rb', line 140

# Within every <style> element under +head+, replaces each url("...")
# reference containing FILENAME with url(cid:header.html).
# Does nothing when +header_filename+ is nil.
def self.filename_substitute(head, header_filename)
  return if header_filename.nil?

  head.xpath(".//*[local-name() = 'style']").each do |style|
    rewritten = style.to_xml.gsub(/url\("[^"]+"\)/) do |url|
      next url unless /FILENAME/.match?(url)

      "url(cid:header.html)"
    end
    style.replace(rewritten)
  end
end

.footnote?(elem) ⇒ Boolean

Returns:

  • (Boolean)


85
86
87
88
# File 'lib/html2doc/notes.rb', line 85

# True when the element's epub:type or class attribute equals "footnote",
# compared case-insensitively. Returns a falsy value (false or nil) otherwise.
def self.footnote?(elem)
  epub_type = elem["epub:type"]
  css_class = elem["class"]
  epub_type&.casecmp("footnote")&.zero? ||
    css_class&.casecmp("footnote")&.zero?
end

.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem



102
103
104
105
106
107
108
109
# File 'lib/html2doc/notes.rb', line 102

# For each <a> that is a direct child of a footnote container div, moves the
# link in front of the first child of the element that follows it. Links
# with no following element, or whose follower has no children, stay put.
# Returns docxml.
def self.footnote_cleanup(docxml)
  links = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  links.each do |link|
    target = link.next_element&.children&.first
    target&.add_previous_sibling(link.remove)
  end
  docxml
end

.footnote_container(docxml, idx) ⇒ Object



40
41
42
43
44
45
46
47
48
# File 'lib/html2doc/notes.rb', line 40

# Builds the markup of footnote container number +idx+: a footnote div with
# a back-reference link. The link content is copied from the document's
# <a href='#_ftnIDX'> when one is found, and is FN otherwise.
def self.footnote_container(docxml, idx)
  source_link = docxml&.at("//a[@href='#_ftn#{idx}']")
  copied = source_link&.children&.to_xml(indent: 0)&.gsub(/>\n</, "><")
  ref = copied || FN
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{idx}'>
      <a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
         name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
  DIV
end

.footnote_div_to_p(elem) ⇒ Object



26
27
28
29
30
31
32
33
34
35
# File 'lib/html2doc/notes.rb', line 26

# Normalises a footnote body wrapper. A div or aside that contains a <p> is
# replaced by its children; one without any <p> is renamed to <p> with
# class MsoFootnoteText. Other elements are left untouched.
def self.footnote_div_to_p(elem)
  return unless %w{div aside}.include? elem.name

  if elem.at(".//p")
    elem.replace(elem.children)
  else
    elem.name = "p"
    elem["class"] = "MsoFootnoteText"
  end
end

.footnotes(docxml) ⇒ Object



4
5
6
7
8
9
10
11
12
13
# File 'lib/html2doc/notes.rb', line 4

# Processes every <a> in the document as a potential footnote reference,
# numbering the accepted ones from 1, then appends the collected footnote
# texts to the document via process_footnote_texts.
def self.footnotes(docxml)
  collected = []
  counter = 1
  docxml.xpath("//a").each do |link|
    counter += 1 if process_footnote_link(docxml, link, counter, collected)
  end
  process_footnote_texts(docxml, collected)
end

.from_xhtml(xml) ⇒ Object



86
87
88
89
90
# File 'lib/html2doc/base.rb', line 86

# Serialises +xml+ and strips the first XHTML namespace declaration, the
# DOCTYPE string, and the space before every self-closing "/>".
def self.from_xhtml(xml)
  serialised = xml.to_xml
  serialised = serialised.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  serialised = serialised.sub(DOCTYPE, "")
  serialised.gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ Object



143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/html2doc/mime.rb', line 143

# Writes filelist.xml into +dir+: an <o:MainFile> entry pointing at
# "../<filename>.htm" followed by one <o:File> entry per non-hidden
# directory entry, in sorted order.
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
      <o:MainFile HRef="../#{filename}.htm"/>}
    listed = Dir.entries(dir).sort.reject { |item| /^\./.match(item) }
    listed.each { |item| out.write %{  <o:File HRef="#{item}"/>\n} }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>



125
126
127
128
129
# File 'lib/html2doc/mime.rb', line 125

# Splits the raw header markup around <img ...> and <v:imagedata ...> tags
# and passes each (text, tag) pair to header_image_cleanup1, joining the
# results back into one String.
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  cleaned = pieces.each_slice(2).map do |pair|
    header_image_cleanup1(pair, dir, filename, localdir)
  end
  cleaned.join
end

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
# File 'lib/html2doc/mime.rb', line 131

# Processes one (text, tag) pair produced by header_image_cleanup.
#
# a         - array of one or two strings; a[1], when present, is an image tag
# dir       - directory that receives the copied image
# _filename - unused
# localdir  - base directory for relative image paths
#
# When the tag has a src that is neither an http(s) URL nor a base64 data
# URI, the file is copied into +dir+ under a fresh UUID name (keeping its
# extension, if any) and the src is rewritten in place to cid:<new name>.
# Tags without a src, and remote or inline sources in either quote style,
# are left untouched. Returns the pair joined into one String.
def self.header_image_cleanup1(a, dir, _filename, localdir)
  # Extract the src once so the remote/data checks apply to single- and
  # double-quoted attributes alike.
  src = a.size == 2 && a[1][/ src=['"]([^"']+)['"]/, 1]
  if src && !/\Ahttps?:/.match?(src) &&
      !%r{\Adata:(image|application)/[^;]+;base64}.match?(src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp localname(src, localdir), File.join(dir, new_filename)
    a[1].sub!(%r{ src=['"][^"']+['"]}, " src='cid:#{new_filename}'")
  end
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images



108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/html2doc/mime.rb', line 108

# Copies locally stored images into +dir+ under fresh UUID names and points
# each img / v:imagedata element at the copy, updating width and height
# from image_resize (called with 680 and 400). Sources starting with http
# and base64 data URIs are skipped. Returns docxml.
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |node|
    next unless node.element? && %w(img v:imagedata).include?(node.name)

    src = node["src"]
    next if /^http/.match?(src) ||
      %r{^data:(image|application)/[^;]+;base64}.match?(src)

    local_filename = localname(src, localdir)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    node["width"], node["height"] =
      image_resize(node, local_filename, 680, 400)
    node["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(img, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680



80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/html2doc/mime.rb', line 80

# Computes the [width, height] to declare for an image.
#
# img       - element whose "width" and "height" attributes are read
# path      - path of the image file, measured with ImageSize
# maxheight - upper bound applied to the height
# maxwidth  - upper bound applied to the width
#
# Note the parameter order: height limit first, then width limit.
#
# Returns [nil, nil] when the real size cannot be determined; otherwise a
# two-element array [width, height].
def self.image_resize(img, path, maxheight, maxwidth)
  realsize = ImageSize.path(path).size
  # Missing or non-numeric attributes become 0 via to_i.
  s = [img["width"].to_i, img["height"].to_i]
  # With neither dimension declared, fall back to the real size.
  s = realsize if s[0].zero? && s[1].zero?
  return [nil, nil] if realsize.nil? || realsize[0].nil? || realsize[1].nil?

  # With only one dimension declared, derive the other from the real aspect
  # ratio.
  s[1] = s[0] * realsize[1] / realsize[0] if s[1].zero? && !s[0].zero?
  s[0] = s[1] * realsize[0] / realsize[1] if s[0].zero? && !s[1].zero?
  # Scale down to the height limit first, then to the width limit.
  s = [(s[0] * maxheight / s[1]).ceil, maxheight] if s[1] > maxheight
  s = [maxwidth, (s[1] * maxwidth / s[0]).ceil] if s[0] > maxwidth
  s
end

.list2para(list) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/html2doc/lists.rb', line 49

# Flattens a list into Word list paragraphs: each direct <li> becomes a <p>
# and the list element itself is replaced by its children.
# Does nothing when the list has no direct <li> children.
def self.list2para(list)
  return if list.xpath("./li").empty?

  # Classes are only set where none is present (||=). First and last are
  # assigned before the loop below gives the remaining items the "Middle"
  # class.
  list.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
  list.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
  list.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
  list.xpath("./li").each do |l|
    l.name = "p"
    l["class"] ||= "MsoListParagraphCxSpMiddle"
    # A leading <p> inside the renamed item is unwrapped so paragraphs do
    # not nest.
    l&.first_element_child&.name == "p" and
      l.first_element_child.replace(l.first_element_child.children)
  end
  list.replace(list.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/html2doc/lists.rb', line 32

# Applies Word list styling to each list in +xpath+ and recurses into
# nested lists through list_add1.
#
# xpath      - collection of list elements to process
# liststyles - hash of list style names, indexed by list type
# listtype   - key into liststyles for the lists being processed
# level      - nesting level; 1 for top-level lists
def self.list_add(xpath, liststyles, listtype, level)
  xpath.each_with_index do |l, _i|
    # Each top-level list gets its own list number and is flagged as seen
    # (lists_unstyled reads and later removes that flag).
    @listnumber += 1 if level == 1
    l["seen"] = true if level == 1
    # An id is assigned when missing; the XPath below refers to it.
    l["id"] ||= UUIDTools::UUID.random_create
    # Items of this list only: all descendant <li> minus those that sit
    # inside a nested ol/ul.
    (l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
      style_list(li, level, liststyles[listtype], @listnumber)
      list_add1(li, liststyles, listtype, level)
    end
    # NOTE(review): this appears to target nested lists that are not inside
    # an <li> of the current list; their parent is processed one level up so
    # that list_add1's level + 1 yields the current level - confirm.
    l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
            ".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
      .each do |li|
      list_add1(li.parent, liststyles, listtype, level - 1)
    end
  end
end

.list_add1(elem, liststyles, listtype, level) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/html2doc/lists.rb', line 18

# Styles the lists nested directly under +elem+ at +level+ + 1.
# For the generic :ul / :ol types, nested <ul> are styled as :ul and nested
# <ol> as :ol; for any other list type both inherit +listtype+.
def self.list_add1(elem, liststyles, listtype, level)
  generic = %i[ul ol].include?(listtype)

  # Outermost nested <ul>: all descendant <ul> minus those inside another list.
  nested_ul = elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul")
  list_add(nested_ul, liststyles, generic ? :ul : listtype, level + 1)

  # Outermost nested <ol>, queried only after the <ul> pass has run.
  nested_ol = elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol")
  list_add(nested_ol, liststyles, generic ? :ol : listtype, level + 1)
end

.lists(docxml, liststyles) ⇒ Object



91
92
93
94
95
96
97
98
99
# File 'lib/html2doc/lists.rb', line 91

# Entry point for list styling. Resets the list counter, styles the lists
# for every key of +liststyles+, handles the remaining unstyled lists, and
# finally converts <ul> / <ol> elements to paragraphs when a :ul / :ol style
# is configured. Does nothing when +liststyles+ is nil.
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key { |style| lists1(docxml, liststyles, style) }
  lists_unstyled(docxml, liststyles)
  liststyles.key?(:ul) && docxml.xpath("//ul").each { |list| list2para(list) }
  liststyles.key?(:ol) && docxml.xpath("//ol").each { |list| list2para(list) }
end

.lists1(docxml, liststyles, style) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/lists.rb', line 66

def self.lists1(docxml, liststyles, style)
  case style
  when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
                         liststyles, :ul, 1)
  when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
                         liststyles, :ol, 1)
  else
    list_add(docxml.xpath("//ol[@class = '#{style}']#{TOPLIST} | "\
                          "//ul[@class = '#{style}']#{TOPLIST}"),
    liststyles, style, 1)
  end
end

.lists_unstyled(docxml, liststyles) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
# File 'lib/html2doc/lists.rb', line 79

# Styles the top-level lists that the per-style passes did not reach (those
# without a "seen" flag), then removes the "seen" flag from every list.
#
# Unflagged <ul> are styled with the :ul style and unflagged <ol> with the
# :ol style, each only when that key is present in +liststyles+.
def self.lists_unstyled(docxml, liststyles)
  %i[ul ol].each do |tag|
    next unless liststyles.key?(tag)

    # The list type passed on matches the element, so ordered lists pick up
    # liststyles[:ol] rather than the unordered style.
    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"),
             liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.localname(src, localdir) ⇒ Object



103
104
105
# File 'lib/html2doc/mime.rb', line 103

# Resolves an image source to a local path: sources starting with "/" or
# with an upper-case drive letter followed by ":/" are returned unchanged;
# anything else is joined onto +localdir+.
def self.localname(src, localdir)
  return src if %r{^([A-Z]:)?/}.match?(src)

  File.join(localdir, src)
end

.mathml_insert_rows(math, docnamespaces) ⇒ Object



60
61
62
63
64
65
66
67
68
# File 'lib/html2doc/math.rb', line 60

# Wraps the element following each script / under-over construct (msup,
# msub, msubsup, munder, mover, munderover) in an <mrow>, unless that
# element already is an <mrow> or there is none.
#
# math          - MathML node to process (mutated in place)
# docnamespaces - namespace bindings used for the XPath query
#
# Returns math.
def self.mathml_insert_rows(math, docnamespaces)
  scripted = %w(msup msub msubsup munder mover munderover)
  query = scripted.map { |m| ".//xmlns:#{m}" }.join(" | ")
  math.xpath(query, docnamespaces).each do |x|
    follower = x.next_element
    # Compare the element *name*: comparing the node itself with a String
    # never matches, which re-wrapped existing <mrow> elements.
    next if follower.nil? || follower.name == "mrow"

    follower.wrap("<mrow/>")
  end
  math
end

.mathml_preserve_space(math, docnamespaces) ⇒ Object



70
71
72
73
74
75
# File 'lib/html2doc/math.rb', line 70

# Replaces whitespace at the start and end of each line of every <mtext>
# element's content with a non-breaking space reference (&#xA0;).
# Returns math.
def self.mathml_preserve_space(math, docnamespaces)
  math.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    inner = mtext.children.to_xml
    mtext.children = inner.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
  end
  math
end

.mathml_to_ooml(docxml) ⇒ Object



134
135
136
137
138
139
140
141
# File 'lib/html2doc/math.rb', line 134

# Converts every <math> element of the document through mathml_to_ooml1,
# reporting progress via progress_conv.
def self.mathml_to_ooml(docxml)
  namespaces = docxml.collect_namespaces
  formulas = docxml.xpath("//*[local-name() = 'math']")
  formulas.each_with_index do |formula, position|
    progress_conv(position, 100, formulas.size, 500, "Math OOXML")
    mathml_to_ooml1(formula, namespaces)
  end
end

.mathml_to_ooml1(xml, docnamespaces) ⇒ Object



154
155
156
157
158
159
160
# File 'lib/html2doc/math.rb', line 154

# Converts one MathML element: cleans it, runs it through the XSL template
# held in @xsltemplate, post-processes the result (esc_space, unitalic,
# ooml_clean, uncenter) and swaps it into the document in place of +xml+.
def self.mathml_to_ooml1(xml, docnamespaces)
  doc = Nokogiri::XML::Document.new
  doc.root = ooxml_cleanup(xml, docnamespaces)
  transformed = @xsltemplate.transform(doc)
  cleaned = ooml_clean(unitalic(esc_space(transformed)))
  xml.swap(uncenter(xml, cleaned))
end

.mime_attachment(boundary, _filename, item, dir) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/html2doc/mime.rb', line 23

# Builds one MIME part for the file +item+ in +dir+: boundary line, headers
# and the Base64-encoded content wrapped at 76 characters. Files whose
# content type starts with "text" or "application" are read as UTF-8 text;
# everything else is read as binary. +_filename+ is unused.
def self.mime_attachment(boundary, _filename, item, dir)
  content_type = mime_type(item)
  path = File.join(dir, item)
  content =
    if content_type.start_with?("text", "application")
      File.read(path, encoding: "utf-8")
    else
      IO.binread(path)
    end
  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  part_name = File.basename(item)
  <<~"FILE"
    --#{boundary}
    Content-ID: <#{part_name}>
    Content-Disposition: inline; filename="#{part_name}"
    Content-Transfer-Encoding: base64
    Content-Type: #{content_type}

    #{encoded_file}

  FILE
end

.mime_boundary ⇒ Object



50
51
52
53
# File 'lib/html2doc/mime.rb', line 50

# Generates a MIME boundary string: "----=_NextPart_" followed by the first
# 18 characters of a random UUID with its dashes turned into dots.
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  salt = uuid.tr("-", ".")[0..17]
  "----=_NextPart_#{salt}"
end

.mime_package(result, filename, dir) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/html2doc/mime.rb', line 55

# Assembles the MHTML package and writes it to "<filename>.doc": the HTML
# preamble, then filelist.xml, then every other non-hidden file in +dir+,
# then the closing boundary. Image sources are rewritten to cid: references
# by contentid just before writing.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  main_part = "#{filename}.htm"
  mhtml = mime_preamble(boundary, main_part, result)
  mhtml += mime_attachment(boundary, main_part, "filelist.xml", dir)
  Dir.foreach(dir) do |item|
    # Hidden entries (including "." and "..") and the already-attached
    # filelist.xml are skipped.
    next if /^\./.match(item) || item == "filelist.xml"

    mhtml += mime_attachment(boundary, main_part, item, dir)
  end
  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |out| out.write contentid(mhtml) }
end

.mime_preamble(boundary, filename, result) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/mime.rb', line 8

# Builds the start of the MHTML package: the multipart/related headers and
# the first part, which carries the HTML document +result+ under the
# basename of +filename+.
def self.mime_preamble(boundary, filename, result)
  part_name = File.basename(filename)
  <<~"PREAMBLE"
    MIME-Version: 1.0
    Content-Type: multipart/related; boundary="#{boundary}"

    --#{boundary}
    Content-ID: <#{part_name}>
    Content-Disposition: inline; filename="#{part_name}"
    Content-Type: text/html; charset="utf-8"

    #{result}

  PREAMBLE
end

.mime_type(item) ⇒ Object



43
44
45
46
47
48
# File 'lib/html2doc/mime.rb', line 43

# Determines the Content-Type value for the file name +item+.
#
# Returns the first type reported by MIME::Types.type_for. Text types get a
# '; charset="utf-8"' parameter. When no type is known (nil or empty result)
# the fallback 'text/plain; charset="utf-8"' is returned.
def self.mime_type(item)
  # type_for returns an empty array for unknown extensions; an empty array
  # is truthy, so the emptiness of the resulting string is checked instead.
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  # The charset is a separate header parameter and needs the ";" separator.
  type.start_with?("text") ? %(#{type}; charset="utf-8") : type
end

.mkuuid ⇒ Object



95
96
97
# File 'lib/html2doc/mime.rb', line 95

# Returns a freshly generated random UUID as a String.
def self.mkuuid
  UUIDTools::UUID.random_create.to_s
end

.msonormal(docxml) ⇒ Object



206
207
208
209
210
211
212
213
# File 'lib/html2doc/base.rb', line 206

# Gives every <p> and <li> element that has no class attribute the class
# MsoNormal.
def self.msonormal(docxml)
  mark_normal = ->(node) { node["class"] = "MsoNormal" }
  docxml.xpath("//*[local-name() = 'p'][not(self::*[@class])]")
    .each(&mark_normal)
  docxml.xpath("//*[local-name() = 'li'][not(self::*[@class])]")
    .each(&mark_normal)
end

.msword_fix(doc) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/html2doc/base.rb', line 92

# Applies textual workarounds for quirks of Word's HTML parser to the
# serialised document. The substitutions are made on +doc+ in place; the
# return value is a new String in which, additionally, whitespace between
# tags inside each <m:oMath>...</m:oMath> section has been removed.
def self.msword_fix(doc)
  # Elements that must be written in self-closing form.
  self_closed = %w[m:jc v:stroke v:f v:path o:lock v:imagedata w:wrap]
  fixes = [
    # The footnote marker span must not be self-closing ...
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    # ... whereas an empty footnote list must be.
    [%r{<div style="mso-element:footnote-list"></div>},
     '<div style="mso-element:footnote-list"/>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    # These two attributes are written unquoted.
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
  ]
  fixes += self_closed.map { |tag| [%r{></#{tag}>}, "/>"] }
  fixes += [
    # span and em must not carry the m: prefix.
    [%r{<(/)?m:(span|em)\b}, "<\\1\\2"],
    [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
  ]
  fixes.each { |pattern, replacement| doc.gsub!(pattern, replacement) }

  # Pieces come in groups of four: text, <m:oMath>, math body, </m:oMath>.
  pieces = doc.split(%r{(<m:oMath>|</m:oMath>)})
  pieces.each_slice(4).flat_map do |group|
    group[2] = group[2].gsub(/>\s+</, "><") if group.size > 2
    group
  end.join
end

.namespace(root) ⇒ Object



180
181
182
183
184
185
186
187
# File 'lib/html2doc/base.rb', line 180

# Declares the Office (o), Word (w), VML (v) and Office Math (m) namespace
# prefixes on the root element.
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooml_clean(xml) ⇒ Object

We need span and em not to be namespaced. Word can’t deal with explicit namespaces. We will end up stripping them out again under Nokogiri 1.11, which correctly insists on inheriting namespace from parent.



147
148
149
150
151
152
# File 'lib/html2doc/math.rb', line 147

# Serialises +xml+ and post-processes the text: processing instructions and
# all xmlns declarations are removed, and every tag whose name starts with a
# lower-case letter gets the "m:" prefix, except names starting with "span"
# or "em".
def self.ooml_clean(xml)
  text = xml.to_s
  text = text.gsub(/<\?[^>]+>\s*/, "")
  text = text.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
  text.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
end

.ooxml_cleanup(math, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly



50
51
52
53
54
55
56
57
58
# File 'lib/html2doc/math.rb', line 50

# Prepares a MathML element for conversion: inserts missing <mrow>
# wrappers, preserves <mtext> edge spaces, unwraps accent rows, and sets the
# MathML default namespace. Returns the processed element.
def self.ooxml_cleanup(math, docnamespaces)
  with_rows = mathml_insert_rows(math, docnamespaces)
  with_spaces = mathml_preserve_space(with_rows, docnamespaces)
  math = unwrap_accents(with_spaces)
  math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  math
end

.process(result, hash) ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/html2doc/base.rb', line 8

# Top-level conversion of the HTML string +result+ into a Word .doc file.
#
# result - HTML source
# hash   - options; reads :filename, :dir, :header_file and :debug, and
#          stores the working directory under :dir1
#
# Writes "<filename>.htm" and the MIME package, then removes the temporary
# files unless hash[:debug] is set.
def self.process(result, hash)
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  result = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(hash[:filename], hash[:dir1])
  File.open("#{hash[:filename]}.htm", "w:UTF-8") { |f| f.write(result) }
  mime_package result, hash[:filename], hash[:dir1]
  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1]) unless hash[:debug]
end


50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/notes.rb', line 50

# Processes one <a> element as a potential footnote reference.
#
# docxml   - document searched for the footnote body
# elem     - the <a> element
# idx      - number to give the footnote
# footnote - array collecting the extracted footnote bodies
#
# Returns false (leaving everything untouched) when +elem+ is not marked as
# a footnote, has no href, or its target is not found. Otherwise the link is
# rewritten as a Word footnote reference, the target is detached from the
# document and appended to +footnote+, and the (truthy) array is returned.
def self.process_footnote_link(docxml, elem, idx, footnote)
  return false unless footnote?(elem)

  # A footnote-classed link without an href cannot be resolved.
  href = elem["href"]&.gsub(/^#/, "")
  return false if href.nil?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(elem, idx)
  if elem.at("./span[@class = 'MsoFootnoteReference']")
    process_footnote_link1(elem)
  else elem.children = FN
  end
  footnote << transform_footnote_text(note)
end

.process_footnote_link1(elem) ⇒ Object



65
66
67
68
69
70
71
72
73
# File 'lib/html2doc/notes.rb', line 65

# Rewrites the children of a footnote link: each MsoFootnoteReference span
# is replaced by the standard marker FN, and every other child is wrapped in
# an MsoFootnoteReference span.
def self.process_footnote_link1(elem)
  elem.children.each do |child|
    marker = child.name == "span" && child["class"] == "MsoFootnoteReference"
    if marker
      child.replace(FN)
    else
      child.wrap("<span class='MsoFootnoteReference'></span>")
    end
  end
end

.process_footnote_texts(docxml, footnotes) ⇒ Object



15
16
17
18
19
20
21
22
23
24
# File 'lib/html2doc/notes.rb', line 15

# Appends a footnote list to the document body and moves each collected
# footnote body into its own numbered container.
#
# docxml    - parsed document (mutated in place)
# footnotes - footnote bodies, in reference order
#
# Returns the result of footnote_cleanup.
def self.process_footnote_texts(docxml, footnotes)
  body = docxml.at("//body")
  # add_child is given markup, and the result is used via #first below.
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each_with_index do |f, i|
    # Containers are numbered from 1.
    fn = list.first.add_child(footnote_container(docxml, i + 1))
    f.parent = fn.first
    footnote_div_to_p(f)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/html2doc/base.rb', line 18

# Copies the header file into the working directory as header.html, after
# passing its content through header_image_cleanup (images are resolved
# relative to the directory of hash[:filename]).
# Does nothing when +headerfile+ is nil.
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  raw = File.read(headerfile, encoding: "utf-8")
  image_base = File.dirname(hash[:filename])
  cleaned = header_image_cleanup(raw, hash[:dir1], hash[:filename], image_base)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |out| out.write(cleaned) }
end

.process_html(result, hash) ⇒ Object



42
43
44
45
46
# File 'lib/html2doc/base.rb', line 42

# Converts the HTML string +result+ into Word-flavoured HTML text:
# AsciiMath is converted first, the text is parsed, cleaned up and given its
# head content, then serialised again and passed through msword_fix.
#
# hash - options; reads :asciimathdelims here and is passed on to cleanup
#        and define_head
def self.process_html(result, hash)
  docxml = to_xhtml(asciimath_to_mathml(result, hash[:asciimathdelims]))
  # cleanup returns the same docxml it was given, so the serialisation below
  # sees all changes.
  define_head(cleanup(docxml, hash), hash)
  msword_fix(from_xhtml(docxml))
end

.progress_conv(idx, step, total, threshold, msg) ⇒ Object



34
35
36
37
38
# File 'lib/html2doc/math.rb', line 34

# Emits a progress warning "<msg> <idx> of <total>" when +idx+ is a positive
# multiple of +step+ and +total+ exceeds +threshold+. Returns nil.
def self.progress_conv(idx, step, total, threshold, msg)
  due = (idx % step).zero? && total > threshold && idx.positive?
  warn "#{msg} #{idx} of #{total}" if due
end

.rm_temp_files(filename, dir, dir1) ⇒ Object



48
49
50
51
52
# File 'lib/html2doc/base.rb', line 48

# Removes the intermediate files of a conversion: "<filename>.htm", the
# copied header.html (if present), and - only when no explicit +dir+ was
# supplied - the whole working directory +dir1+.
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm "#{filename}.htm"
  FileUtils.rm_f "#{dir1}/header.html"
  FileUtils.rm_r dir1 unless dir
end

.rootnamespace(root) ⇒ Object



189
190
191
# File 'lib/html2doc/base.rb', line 189

# Sets the default (unprefixed) namespace of the root element to
# http://www.w3.org/TR/REC-html40.
def self.rootnamespace(root)
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end


90
91
92
93
94
95
# File 'lib/html2doc/notes.rb', line 90

# Sets the attributes that make +elem+ a Word footnote reference number
# +idx+: style, href, name, and an empty title.
def self.set_footnote_link_attrs(elem, idx)
  {
    "style" => "mso-footnote-id:ftn#{idx}",
    "href" => "#_ftn#{idx}",
    "name" => "_ftnref#{idx}",
  }.each { |attribute, value| elem[attribute] = value }
  elem["title"] = ""
end

.style_list(elem, level, liststyle, listnumber) ⇒ Object



7
8
9
10
11
12
13
14
15
16
# File 'lib/html2doc/lists.rb', line 7

# Appends the Word list declaration
# "mso-list:<liststyle> level<level> lfo<listnumber>;" to the element's
# style attribute, separated from any existing style by ";".
# Does nothing when +liststyle+ is nil or false.
def self.style_list(elem, level, liststyle, listnumber)
  return unless liststyle

  existing = elem["style"] ? "#{elem['style']};" : ""
  elem["style"] =
    "#{existing}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(_filename, _header_filename, fn) ⇒ Object



151
152
153
154
155
156
157
158
# File 'lib/html2doc/base.rb', line 151

# Returns a <style> element (as a String) whose content is the stylesheet
# file +fn+ wrapped in an XML comment. When +fn+ is nil or empty, the
# wordstyle.css file next to this source file is used. The first two
# parameters are unused.
def self.stylesheet(_filename, _header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = File.read(fn, encoding: "UTF-8")
  doc = Nokogiri::XML("<style/>")
  doc.children.first << Nokogiri::XML::Comment.new(doc, "\n#{css}\n")
  doc.root.to_s
end

.to_plane1(xml, font) ⇒ Object



125
126
127
128
129
130
131
132
# File 'lib/html2doc/math.rb', line 125

# Replaces every text node under +xml+ with the result of converting its
# entity-decoded text through Plane1Converter for the given +font+.
# Returns xml.
def self.to_plane1(xml, font)
  xml.traverse do |node|
    next unless node.text?

    decoded = HTMLEntities.new.decode(node.text)
    node.replace(Plane1Converter.conv(decoded, font))
  end
  xml
end

.to_xhtml(xml) ⇒ Object



73
74
75
76
77
78
79
80
# File 'lib/html2doc/base.rb', line 73

# Parses an HTML/XHTML source string into an XML document.
#
# Any <?xml ...?> declaration is dropped and, when the text contains no
# DOCTYPE, the XHTML 1.0 Strict doctype is prepended before parsing.
# The caller's string is not modified, so frozen strings are accepted.
def self.to_xhtml(xml)
  # Non-destructive gsub: the argument may be frozen or still in use by the
  # caller.
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match? xml
    xml = '<!DOCTYPE html SYSTEM
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end

.transform_footnote_text(note) ⇒ Object



75
76
77
78
79
80
81
82
83
# File 'lib/html2doc/notes.rb', line 75

# Prepares a footnote body: blanks its id, unwraps every nested <div>,
# turns nested <aside> and <p> elements into <p class="MsoFootnoteText">,
# and detaches the body from the document. Returns the result of the
# final remove call.
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each { |wrapper| wrapper.replace(wrapper.children) }
  note.xpath(".//aside | .//p").each do |block|
    block.name = "p"
    block["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(math, ooxml) ⇒ Object

If the OOML has no siblings, it is centered by default; override this with left/right alignment if the parent is so tagged.



175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/html2doc/math.rb', line 175

def self.uncenter(math, ooxml)
  alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
                      "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless alignnode && (math.next == nil && math.previous == nil)

  %w(left right).each do |dir|
    if alignnode.text.include? ("text-align:#{dir}")
      ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
        "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
    end
  end
  ooxml
end

.unitalic(math) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/html2doc/math.rb', line 79

# Applies font styling to the runs of converted math.
#
# Runs without a script setting are wrapped in HTML spans according to
# their sty value (p, bi, i, b). Runs with a script setting (monospace,
# double-struck, script, fraktur, sans-serif - optionally combined with a
# sty value) have their text converted through to_plane1 with the matching
# font. Returns math.
def self.unitalic(math)
  # sty value => wrapper markup, for runs with no scr element.
  wrappers = [
    ["p", "<span #{HTML_NS} style='font-style:normal;'></span>"],
    ["bi", "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>"],
    ["i", "<span #{HTML_NS} class='nostem'><em></em></span>"],
    ["b", "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>"],
  ]
  wrappers.each do |sty, markup|
    math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = '#{sty}']]").each do |run|
      run.wrap(markup)
    end
  end

  # XPath selecting runs => font passed to to_plane1.
  plane1_fonts = [
    [".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]", :monospace],
    [".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]", :doublestruck],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]", :script],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]", :scriptbold],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]", :fraktur],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]", :frakturbold],
    [".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]", :sans],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]", :sansbold],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]", :sansitalic],
    [".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]", :sansbolditalic],
  ]
  plane1_fonts.each do |query, font|
    math.xpath(query).each { |run| to_plane1(run, font) }
  end
  math
end

.unwrap_accents(doc) ⇒ Object



40
41
42
43
44
45
46
47
# File 'lib/html2doc/math.rb', line 40

def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |x|
    x.elements.length > 1 or next
    x.elements[1].name == "mrow" and
      x.elements[1].replace(x.elements[1].children)
  end
  doc
end

.warnsvg(src) ⇒ Object



99
100
101
# File 'lib/html2doc/mime.rb', line 99

# Emits a warning when +src+ ends in ".svg" (case-insensitive).
def self.warnsvg(src)
  return unless /\.svg$/i.match?(src)

  warn "#{src}: SVG not supported"
end