Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

NOKOHEAD =
"<!DOCTYPE html SYSTEM\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head> <title></title> <meta charset=\"UTF-8\" /> </head>\n<body> </body> </html>\n".freeze
DOCTYPE =
"<!DOCTYPE html SYSTEM \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n".freeze
PRINT_VIEW =
"<!--[if gte mso 9]>\n<xml>\n<w:WordDocument>\n<w:View>Print</w:View>\n<w:Zoom>100</w:Zoom>\n<w:DoNotOptimizeForBrowser/>\n</w:WordDocument>\n</xml>\n<![endif]-->\n<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\"/>\n".freeze
IMAGE_PATH =
"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
TOPLIST =
"[not(ancestor::ul) and not(ancestor::ol)]".freeze
FN =
"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze
VERSION =
"1.1.1".freeze

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object



169
170
171
172
173
174
175
176
177
# File 'lib/html2doc/base.rb', line 169

# Inserts the stylesheet markup into the document head.
#
# Placement: appended when the head has no children; otherwise directly
# after the title when one is given; otherwise before the first child.
#
# @param head  head element (responds to #children and #add_child)
# @param title title element, or nil when the head has none
# @param css   stylesheet markup to insert
# @return whatever the node insertion call returns
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return title.add_next_sibling(css) unless title.nil?

  head.children.first.add_previous_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
# File 'lib/html2doc/math.rb', line 23

# Replaces every AsciiMath expression enclosed by the given delimiters with
# the markup returned by asciimath_to_mathml1. The delimiters themselves are
# dropped from the output.
#
# @param doc    [String] document text
# @param delims [Array<String>, nil] opening and closing delimiter; the text
#   is returned untouched when nil or shorter than two entries
# @return [String] the document with the expressions converted
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Splitting on a captured alternation yields repeating groups of
  # [text, opening delimiter, expression, closing delimiter].
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  m.each_slice(4).map.with_index do |a, i|
    # Progress report for very large documents only.
    if i.positive? && (i % 500).zero? && m.size > 1000
      warn "MathML #{i} of #{(m.size / 4).floor}"
    end
    # No expression in this group: either trailing plain text, or the
    # document ends right after an opening delimiter. Keep the text only
    # (previously the latter case raised TypeError on `a[0] + nil`).
    next a[0] if a[2].nil?

    a[0] + asciimath_to_mathml1(a[2])
  end.join
end

.asciimath_to_mathml1(expr) ⇒ Object



12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/math.rb', line 12

# Converts one AsciiMath expression to a MathML string.
#
# HTML entities in the expression are decoded before parsing, and the
# MathML namespace is added to each bare <math> tag of the result.
#
# @param expr [String] AsciiMath source
# @return [String] MathML markup
# @raise [StandardError] re-raises any conversion error after printing the
#   offending expression and the error message to standard output
def self.asciimath_to_mathml1(expr)
  builder = AsciiMath::MathMLBuilder.new(msword: true)
  decoded = HTMLEntities.new.decode(expr)
  mathml = builder.append_expression(AsciiMath.parse(decoded).ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
rescue StandardError => e
  puts "parsing: #{expr}"
  puts e.message
  raise e
end

.bookmarks(docxml) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/html2doc/base.rb', line 192

# Turns element ids into named anchors.
#
# Every element that has a non-empty id, no name attribute, and is not a
# footnote container or a (v:)shape/(v:)shapetype gets an
# <a name='...'></a> inserted as its first child; its id attribute is then
# removed.
#
# @param docxml document to rewrite in place (queried via #xpath)
def self.bookmarks(docxml)
  exempt = %w(shapetype v:shapetype shape v:shape)
  candidates =
    docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
  candidates.each do |node|
    next if node["id"].empty? || exempt.include?(node.name)

    anchor = "<a name='#{node['id']}'></a>"
    if node.children.empty?
      node.add_child(anchor)
    else
      node.children.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ Object



54
55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/base.rb', line 54

# Runs the clean-up passes over the parsed document, in a fixed order:
# namespaces, images, math, lists, footnotes, bookmarks, default paragraph
# class.
#
# @param docxml parsed document, modified in place
# @param hash   [Hash] options; reads :dir1, :filename and :liststyles
# @return the same docxml object
def self.cleanup(docxml, hash)
  docxml.tap do |doc|
    namespace(doc.root)
    image_cleanup(doc, hash[:dir1], File.dirname(hash[:filename]))
    mathml_to_ooml(doc)
    lists(doc, hash[:liststyles])
    footnotes(doc)
    bookmarks(doc)
    msonormal(doc)
  end
end

.clear_dir(dir) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/html2doc/base.rb', line 27

# Deletes every entry of a directory except "." and "..".
#
# @param dir [String] directory path
# @return [String] the same directory path
def self.clear_dir(dir)
  Dir.foreach(dir) do |entry|
    next if %w[. ..].include?(entry)

    File.delete(File.join(dir, entry))
  end
  dir
end

.contentid(mhtml) ⇒ Object



69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/mime.rb', line 69

# Rewrites local image references to MIME content ids.
#
# For <img> and then <v:imagedata> tags, a double-quoted src value is
# replaced by "cid:" plus the file's base name. Sources starting with
# "data:", "http:" or "https:" are left as they are.
#
# @param mhtml [String] MHTML text
# @return [String] text with the rewritten src attributes
def self.contentid(mhtml)
  tag_patterns = [
    %r{(<img[^>]*?src=")([^\"']+)(['"])}m,
    %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m,
  ]
  tag_patterns.reduce(mhtml) do |text, pattern|
    text.gsub(pattern) do |whole|
      lead, src, quote = Regexp.last_match.captures
      rewritten = "#{lead}cid:#{File.basename(src)}#{quote}"
      /^data:|^https?:/.match(src) ? whole : rewritten
    end
  end
end

.create_dir(filename, dir) ⇒ Object



35
36
37
38
39
40
# File 'lib/html2doc/base.rb', line 35

# Prepares the directory that receives the document's auxiliary files.
#
# When a directory is given it is cleared and returned. Otherwise
# "<filename>_files" is created if it does not exist yet, then cleared.
#
# @param filename [String] output file name without extension
# @param dir      [String, nil] caller-supplied directory
# @return [String] path of the cleared directory
def self.create_dir(filename, dir)
  return clear_dir(dir) if dir

  dir = "#{filename}_files"
  # File.exists? was deprecated and then removed in Ruby 3.2.
  Dir.mkdir(dir) unless File.exist?(dir)
  clear_dir(dir)
end

.define_head(docxml, hash) ⇒ Object



159
160
161
162
163
164
165
166
167
# File 'lib/html2doc/base.rb', line 159

# Completes the document head: inserts the stylesheet, substitutes the
# header file name inside style rules, adds the head preamble, and finally
# sets the default namespace on the root element.
#
# @param docxml parsed document, modified in place
# @param hash   [Hash] options; reads :filename, :header_file, :stylesheet
#   and :dir1
# @return the result of rootnamespace
def self.define_head(docxml, hash)
  head_path = "//*[local-name() = 'head']"
  title = docxml.at("#{head_path}/*[local-name() = 'title']")
  head = docxml.at(head_path)
  add_stylesheet(
    head, title,
    stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  )
  filename_substitute(head, hash[:header_file])
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, dir) ⇒ Object



130
131
132
133
134
135
136
137
# File 'lib/html2doc/base.rb', line 130

# Inserts the PRINT_VIEW preamble and the File-List link before the first
# child of every head element.
#
# The heredoc had been collapsed into one escaped string literal, which
# left the method unterminated and the interpolation disabled; it is
# restored here.
#
# @param docxml parsed document, modified in place
# @param dir    not used by this method
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    h.children.first.add_previous_sibling <<~XML
      #{PRINT_VIEW}
        <link rel="File-List" href="cid:filelist.xml"/>
    XML
  end
end

.esc_space(xml) ⇒ Object

escape space as &#x32;; we are removing any spaces generated by XML indentation



147
148
149
150
151
152
153
154
# File 'lib/html2doc/math.rb', line 147

# Walks every node of the tree and, for text nodes, computes the node text
# with each space replaced by the literal "&#x32;". Returns the tree.
#
# NOTE(review): the block only rebinds its own local variable `n`; nothing
# is written back to the node, so as written the tree appears to be
# returned unchanged. Confirm whether that is intended before relying on
# the escaping. Also note that &#x32; is the character "2"; a space would
# be &#x20;.
#
# @param xml tree to walk (responds to #traverse)
# @return the same xml object
def self.esc_space(xml)
  xml.traverse do |n|
    next unless n.text?

    # Rebinds the block-local `n` only; the node itself is not modified.
    n = n.text.gsub(/ /, "&#x32;")
  end
  xml
end

.filename_substitute(head, header_filename) ⇒ Object



139
140
141
142
143
144
145
146
147
148
# File 'lib/html2doc/base.rb', line 139

# Points FILENAME placeholders in style rules at the packaged header file.
#
# Within every style element under the head, each url("...") whose text
# contains FILENAME is replaced by url(cid:header.html); the element is
# then replaced by the rewritten markup.
#
# @param head            head element (queried via #xpath)
# @param header_filename header file name; nothing is done when nil
def self.filename_substitute(head, header_filename)
  return if header_filename.nil?

  head.xpath(".//*[local-name() = 'style']").each do |style|
    rewritten = style.to_xml.gsub(/url\("[^"]+"\)/) do |url|
      url.include?("FILENAME") ? "url(cid:header.html)" : url
    end
    style.replace(rewritten)
  end
end

.footnote?(elem) ⇒ Boolean

Returns:

  • (Boolean)


82
83
84
85
# File 'lib/html2doc/notes.rb', line 82

# Tells whether an element is marked as a footnote, i.e. whether its
# epub:type or its class attribute equals "footnote" ignoring case.
#
# @param elem element or hash-like object indexed by attribute name
# @return [Boolean, nil] true on a match; false or nil otherwise
def self.footnote?(elem)
  epub_type = elem["epub:type"]
  return true if epub_type && epub_type.casecmp("footnote")&.zero?

  elem["class"]&.casecmp("footnote")&.zero?
end

.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem



99
100
101
102
103
104
105
106
# File 'lib/html2doc/notes.rb', line 99

# Moves each footnote back-reference anchor into the text that follows it.
#
# For every <a> directly inside a footnote div, the anchor is removed and
# re-inserted before the first child of its next sibling element. Anchors
# without such a target stay where they are.
#
# @param docxml parsed document, modified in place
# @return the same docxml object
def self.footnote_cleanup(docxml)
  docxml.xpath('//div[@style="mso-element:footnote"]/a').each do |anchor|
    target = anchor.next_element&.children&.first
    # The anchor is detached only once a target is known to exist.
    target&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(docxml, idx) ⇒ Object



40
41
42
43
44
45
46
47
48
# File 'lib/html2doc/notes.rb', line 40

# Builds the markup of the container div for footnote number idx.
#
# The back-reference reuses the content of the body link pointing at
# "#_ftn<idx>" when there is one, and the FN marker otherwise.
#
# The heredoc had been collapsed into one escaped string literal, which
# left the method unterminated and the interpolation disabled; it is
# restored here.
#
# @param docxml parsed document, or nil
# @param idx    [Integer] footnote number
# @return [String] markup of the footnote div
def self.footnote_container(docxml, idx)
  ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
    &.gsub(/>\n</, "><") || FN
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{idx}'>
      <a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
         name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
  DIV
end

.footnote_div_to_p(elem) ⇒ Object



26
27
28
29
30
31
32
33
34
35
# File 'lib/html2doc/notes.rb', line 26

# Normalises a div or aside footnote body; other elements are ignored.
#
# A div/aside that contains a p is replaced by its children; one without a
# p is renamed to p and given the MsoFootnoteText class.
#
# @param elem footnote body element, modified in place
def self.footnote_div_to_p(elem)
  return unless %w{div aside}.include?(elem.name)

  if elem.at(".//p")
    elem.replace(elem.children)
  else
    elem.name = "p"
    elem["class"] = "MsoFootnoteText"
  end
end

.footnotes(docxml) ⇒ Object



4
5
6
7
8
9
10
11
12
13
# File 'lib/html2doc/notes.rb', line 4

# Processes all footnotes of the document.
#
# Every <a> is offered to process_footnote_link together with the next
# footnote number; the number advances only when the link was accepted.
# The collected footnote bodies are then handed to process_footnote_texts.
#
# @param docxml parsed document, modified in place
# @return the result of process_footnote_texts
def self.footnotes(docxml)
  collected = []
  number = 1
  docxml.xpath("//a").each do |link|
    number += 1 if process_footnote_link(docxml, link, number, collected)
  end
  process_footnote_texts(docxml, collected)
end

.from_xhtml(xml) ⇒ Object



86
87
88
89
90
# File 'lib/html2doc/base.rb', line 86

# Serialises the document and strips XHTML artefacts from the text: the
# first XHTML namespace declaration, the first DOCTYPE line, and the space
# before every self-closing "/>".
#
# @param xml document (responds to #to_xml)
# @return [String] the serialised markup
def self.from_xhtml(xml)
  serialised = xml.to_xml
  without_ns = serialised.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  without_ns.sub(DOCTYPE, "").gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/html2doc/mime.rb', line 144

# Writes filelist.xml into the given directory: an <o:MainFile> entry for
# "../<filename>.htm" followed by one <o:File> entry per directory entry,
# sorted by name. Entries whose name starts with a dot are skipped.
#
# The file is created before the directory is listed, so filelist.xml
# lists itself.
#
# @param filename [String] main document name without extension
# @param dir      [String] directory to list and to write into
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write "<xml xmlns:o=\"urn:schemas-microsoft-com:office:office\">\n" \
              "      <o:MainFile HRef=\"../#{filename}.htm\"/>"
    listed = Dir.entries(dir).sort.reject { |item| /^\./.match(item) }
    listed.each { |item| out.write %{  <o:File HRef="#{item}"/>\n} }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>



123
124
125
126
127
# File 'lib/html2doc/mime.rb', line 123

# Processes the image tags of the header text without parsing it as XML.
#
# The text is split around <img ...> and <v:imagedata ...> tags; every
# (text, tag) pair is passed to header_image_cleanup1 and the results are
# joined back together.
#
# @param doc      [String] header text
# @param dir      directory passed through to header_image_cleanup1
# @param filename passed through to header_image_cleanup1
# @param localdir passed through to header_image_cleanup1
# @return [String] the reassembled header text
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  cleaned = pieces.each_slice(2).map do |text_and_tag|
    header_image_cleanup1(text_and_tag, dir, filename, localdir)
  end
  cleaned.join
end

.header_image_cleanup1(a, dir, _filename, localdir) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/html2doc/mime.rb', line 129

# Handles one (text, image tag) pair of the header.
#
# A local image is copied into dir under a fresh UUID-based name, and the
# tag's src is rewritten in place to "cid:<new name>". The pair is joined
# unchanged when there is no tag, when the tag has no src attribute, or
# when the src is an http(s) URL or a base64 data URI.
#
# Fixes over the previous version: a tag without src, or a src without a
# file suffix, no longer fails with NoMethodError on nil; the remote/data
# check now looks at the extracted src, so single-quoted attributes are
# recognised too.
#
# @param a         [Array<String>] [text] or [text, tag]; the tag string
#   is modified in place
# @param dir       [String] directory receiving the copy
# @param _filename not used
# @param localdir  [String] base directory for relative src paths
# @return [String] the joined pair
def self.header_image_cleanup1(a, dir, _filename, localdir)
  src_attr = / src=['"](?<src>[^"']+)['"]/
  m = a.size == 2 ? src_attr.match(a[1]) : nil
  return a.join if m.nil?

  src = m[:src]
  return a.join if /\Ahttps?:/.match?(src) ||
    %r{\Adata:(image|application)/[^;]+;base64}.match?(src)

  #warnsvg(src)
  suffix = src[/\.([a-zA-Z_0-9]+)$/, 1]
  new_filename = suffix ? "#{mkuuid}.#{suffix}" : mkuuid
  # Paths that are absolute (optionally with a drive letter) are used as is.
  old_filename = %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
  FileUtils.cp old_filename, File.join(dir, new_filename)
  a[1].sub!(src_attr, " src='cid:#{new_filename}'")
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/html2doc/mime.rb', line 104

# Copies locally stored images into dir and repoints their references.
#
# For every img / v:imagedata element whose src is neither an http(s) URL
# nor a base64 data URI, the file is copied into dir under a UUID-based
# name, width and height are set from image_resize (maximum height 680,
# maximum width 400), and src is set to "<basename of dir>/<new name>".
#
# Elements with a missing or empty src are now skipped; previously they
# reached File.join / FileUtils.cp and raised.
#
# @param docxml   parsed document, modified in place
# @param dir      [String] directory receiving the copies
# @param localdir [String] base directory for relative src paths
# @return the same docxml object
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |i|
    next unless i.element? && %w(img v:imagedata).include?(i.name)

    src = i["src"]
    next if src.nil? || src.empty?
    #warnsvg(src)
    next if /^http/.match?(src)
    next if %r{^data:(image|application)/[^;]+;base64}.match?(src)

    # Paths that are absolute (optionally with a drive letter) are used as is.
    local_filename = %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
    i["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(i, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680



80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/html2doc/mime.rb', line 80

# Computes the display size of an image.
#
# Starts from the element's width/height attributes (the file's real size
# when both are absent or zero), derives a missing dimension from the real
# aspect ratio, then scales down to fit maxheight and after that maxwidth.
# All arithmetic is integer arithmetic.
#
# @param i         element or hash-like object with "width" / "height"
# @param path      [String] path of the image file
# @param maxheight [Integer] height limit
# @param maxwidth  [Integer] width limit
# @return [Array] [width, height]; [nil, nil] when the real size is unknown
def self.image_resize(i, path, maxheight, maxwidth)
  real = ImageSize.path(path).size
  dims = [i["width"].to_i, i["height"].to_i]
  dims = real if dims.all?(&:zero?)
  return [nil, nil] if real.nil? || real[0].nil? || real[1].nil?

  width, height = dims
  height = width * real[1] / real[0] if height.zero? && !width.zero?
  width = height * real[0] / real[1] if width.zero? && !height.zero?
  width, height = (width * maxheight / height).ceil, maxheight if height > maxheight
  width, height = maxwidth, (height * maxwidth / width).ceil if width > maxwidth
  [width, height]
end

.list2para(u) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/lists.rb', line 50

# Flattens a list into Word list paragraphs.
#
# Items without a class get MsoListParagraphCxSpFirst (first item),
# MsoListParagraphCxSpLast (last item) or MsoListParagraphCxSpMiddle; the
# same middle class is the default for p children of items. Each li is
# renamed to p, a leading p child is replaced by its own children, and the
# list element is finally replaced by its children.
#
# @param u list element; nothing is done when it has no direct li children
def self.list2para(u)
  items = u.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  u.xpath("./li/p").each { |para| para["class"] ||= "MsoListParagraphCxSpMiddle" }
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    lead = item.first_element_child
    lead.replace(lead.children) if lead&.name == "p"
  end
  u.replace(u.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/html2doc/lists.rb', line 33

# Applies Word list styling to each list in a node set.
#
# A level-1 list advances the @listnumber counter and is flagged with a
# "seen" attribute. Every list gets an id if it has none. The list's own
# items (those not inside a nested list) are styled and searched for
# nested lists; nested lists not reached that way are processed through
# their parent, one level lower.
#
# @param xpath      collection of list elements
# @param liststyles [Hash] list type => Word list style
# @param listtype   key into liststyles
# @param level      [Integer] nesting level, 1 for top-level lists
# @return the collection that was passed in
def self.list_add(xpath, liststyles, listtype, level)
  top_level = level == 1
  xpath.each do |list|
    if top_level
      @listnumber += 1
      list["seen"] = true
    end
    list["id"] ||= UUIDTools::UUID.random_create
    own_items = list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")
    own_items.each do |item|
      style_list(item, level, liststyles[listtype], @listnumber)
      list_add1(item, liststyles, listtype, level)
    end
    stray_lists = ".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
                  ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]"
    list.xpath(stray_lists).each do |nested|
      list_add1(nested.parent, liststyles, listtype, level - 1)
    end
  end
end

.list_add1(li, liststyles, listtype, level) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/html2doc/lists.rb', line 19

# Styles the lists nested directly under a node, one level deeper.
#
# For the generic :ul / :ol list types each nested list is styled by its
# own tag; for any other (class-based) list type the nested lists inherit
# that type.
#
# The membership test had lost its "%" and read `i[ul ol]`, which raises
# NameError; it is restored to the symbol-array literal.
#
# @param li         node whose nested ul/ol lists are processed
# @param liststyles [Hash] list type => Word list style
# @param listtype   list type of the enclosing list
# @param level      [Integer] nesting level of the enclosing list
# @return the result of the last list_add call
def self.list_add1(li, liststyles, listtype, level)
  by_tag = %i[ul ol].include?(listtype)
  %i[ul ol].map do |tag|
    # Lists of this tag that are not themselves inside another nested list.
    nested = li.xpath(".//#{tag}") - li.xpath(".//ul//#{tag} | .//ol//#{tag}")
    list_add(nested, liststyles, by_tag ? tag : listtype, level + 1)
  end.last
end

.lists(docxml, liststyles) ⇒ Object



94
95
96
97
98
99
100
101
102
# File 'lib/html2doc/lists.rb', line 94

# Applies Word list styles to the whole document.
#
# Resets the list counter, styles the lists of every configured list type,
# then the remaining unstyled lists, and finally flattens ul / ol lists
# into paragraphs for the generic types that have a style configured.
#
# @param docxml     parsed document, modified in place
# @param liststyles [Hash, nil] list type => Word list style; nothing is
#   done when nil
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key { |type| lists1(docxml, liststyles, type) }
  lists_unstyled(docxml, liststyles)
  docxml.xpath("//ul").each { |list| list2para(list) } if liststyles.key?(:ul)
  liststyles.key?(:ol) && docxml.xpath("//ol").each { |list| list2para(list) }
end

.lists1(docxml, liststyles, k) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/html2doc/lists.rb', line 67

# Styles the top-level lists belonging to one list type.
#
# :ul and :ol select the class-less lists of that tag; any other key
# selects ol and ul lists whose class attribute equals the key.
#
# @param docxml     parsed document, modified in place
# @param liststyles [Hash] list type => Word list style
# @param k          list type key
# @return the result of list_add
def self.lists1(docxml, liststyles, k)
  query =
    if k == :ul
      "//ul[not(@class)]#{TOPLIST}"
    elsif k == :ol
      "//ol[not(@class)]#{TOPLIST}"
    else
      "//ol[@class = '#{k}']#{TOPLIST} | //ul[@class = '#{k}']#{TOPLIST}"
    end
  list_add(docxml.xpath(query), liststyles, k, 1)
end

.lists_unstyled(docxml, liststyles) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/html2doc/lists.rb', line 80

# Styles the top-level lists no earlier pass has marked as seen, then
# removes the temporary "seen" attribute from all lists.
#
# Only runs for the generic :ul / :ol types that have a style configured.
# The :ol branch previously passed :ul as the list type, so remaining
# ordered lists received the unordered style; it now passes :ol, matching
# lists1.
#
# @param docxml     parsed document, modified in place
# @param liststyles [Hash] list type => Word list style
def self.lists_unstyled(docxml, liststyles)
  %i[ul ol].each do |tag|
    next unless liststyles.key?(tag)

    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"),
             liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.mathml_insert_rows(math, docnamespaces) ⇒ Object



55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/math.rb', line 55

# Wraps the element following each script/limit construct (msup, msub,
# msubsup, munder, mover, munderover) in an <mrow>, unless it already is
# an mrow.
#
# The previous check compared the node object itself with the string
# "mrow", which is never equal, so existing mrow elements were wrapped
# again; the element name is compared now.
#
# @param math          math element, modified in place
# @param docnamespaces namespace map used for the XPath query
# @return the same math object
def self.mathml_insert_rows(math, docnamespaces)
  query = %w(msup msub msubsup munder mover munderover)
    .map { |m| ".//xmlns:#{m}" }.join(" | ")
  math.xpath(query, docnamespaces).each do |x|
    following = x.next_element
    next if following.nil? || following.name == "mrow"

    following.wrap("<mrow/>")
  end
  math
end

.mathml_preserve_space(math, docnamespaces) ⇒ Object



65
66
67
68
69
70
# File 'lib/html2doc/math.rb', line 65

# Keeps leading and trailing whitespace of mtext content: a whitespace
# character at the start or end of a line of the serialised content is
# replaced by the entity &#xA0;.
#
# @param math          math element, modified in place
# @param docnamespaces namespace map used for the XPath query
# @return the same math object
def self.mathml_preserve_space(math, docnamespaces)
  math.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    markup = mtext.children.to_xml
    leading_fixed = markup.gsub(/^\s/, "&#xA0;")
    mtext.children = leading_fixed.gsub(/\s$/, "&#xA0;")
  end
  math
end

.mathml_to_ooml(docxml) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/html2doc/math.rb', line 127

# Replaces every MathML <math> element of the document with Office math
# markup.
#
# Each element is cleaned up, placed in a document of its own, run through
# the @xsltemplate transform, post-processed (esc_space, unitalic), and
# the serialised result is stripped of processing instructions and
# namespace declarations and given the "m:" element prefix (span and em
# excepted). Progress is reported for large documents.
#
# @param docxml parsed document, modified in place
def self.mathml_to_ooml(docxml)
  docnamespaces = docxml.collect_namespaces
  maths = docxml.xpath("//*[local-name() = 'math']")
  maths.each_with_index do |math, idx|
    if (idx % 100).zero? && maths.size > 500 && idx.positive?
      warn "Math OOXML #{idx} of #{maths.size}"
    end
    cleaned = ooxml_cleanup(math, docnamespaces)
    holder = Nokogiri::XML::Document::new
    holder.root = cleaned
    transformed = unitalic(esc_space(@xsltemplate.transform(holder))).to_s
    without_pi = transformed.gsub(/<\?[^>]+>\s*/, "")
    without_ns = without_pi.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    prefixed = without_ns.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
    math.swap(uncenter(math, prefixed))
  end
end

.mime_attachment(boundary, _filename, item, dir) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/html2doc/mime.rb', line 23

# Builds the MIME part for one attached file.
#
# Files whose content type starts with "text" or "application" are read as
# UTF-8 text, all others as binary. The content is Base64-encoded with a
# line break after every 76 characters.
#
# The heredoc had been collapsed into one escaped string literal, which
# left the method unterminated and the interpolation disabled; it is
# restored here.
#
# @param boundary  [String] MIME boundary
# @param _filename not used
# @param item      [String] file name inside dir
# @param dir       [String] directory holding the file
# @return [String] the MIME part, boundary line included
def self.mime_attachment(boundary, _filename, item, dir)
  content_type = mime_type(item)
  text_mode = %w[text application].any? { |p| content_type.start_with? p }

  path = File.join(dir, item)
  content = text_mode ? File.read(path, encoding: "utf-8") : IO.binread(path)

  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  <<~FILE
    --#{boundary}
    Content-ID: <#{File.basename(item)}>
    Content-Disposition: inline; filename="#{File.basename(item)}"
    Content-Transfer-Encoding: base64
    Content-Type: #{content_type}

    #{encoded_file}

  FILE
end

.mime_boundary ⇒ Object



50
51
52
53
# File 'lib/html2doc/mime.rb', line 50

# Generates a MIME boundary: the fixed "----=_NextPart_" prefix followed
# by the first 18 characters of a random UUID whose dashes are replaced by
# dots.
#
# @return [String] the boundary string
def self.mime_boundary
  dotted = UUIDTools::UUID.random_create.to_s.tr("-", ".")
  "----=_NextPart_#{dotted[0, 18]}"
end

.mime_package(result, filename, dir) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/html2doc/mime.rb', line 55

# Assembles the MHTML package and writes it to "<filename>.doc".
#
# The package consists of the preamble with the main document, the
# filelist.xml attachment, one attachment per remaining non-hidden entry
# of dir, and the closing boundary. Image references are converted to
# content ids just before writing.
#
# @param result   [String] main HTML document
# @param filename [String] output name without extension
# @param dir      [String] directory holding the attachments
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  main_name = "#{filename}.htm"
  mhtml = mime_preamble(boundary, main_name, result)
  mhtml += mime_attachment(boundary, main_name, "filelist.xml", dir)
  Dir.foreach(dir) do |entry|
    # Dot entries are skipped; filelist.xml has already been attached.
    next if /^\./.match(entry) || entry == "filelist.xml"

    mhtml += mime_attachment(boundary, main_name, entry, dir)
  end
  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |out| out.write contentid(mhtml) }
end

.mime_preamble(boundary, filename, result) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/mime.rb', line 8

# Builds the start of the MHTML package: the multipart header followed by
# the first part, which carries the main HTML document.
#
# The heredoc had been collapsed into one escaped string literal, which
# left the method unterminated and the interpolation disabled; it is
# restored here.
#
# @param boundary [String] MIME boundary
# @param filename [String] name of the main document; only its base name
#   is used
# @param result   [String] main HTML document
# @return [String] package header plus main part
def self.mime_preamble(boundary, filename, result)
  <<~PREAMBLE
    MIME-Version: 1.0
    Content-Type: multipart/related; boundary="#{boundary}"

    --#{boundary}
    Content-ID: <#{File.basename(filename)}>
    Content-Disposition: inline; filename="#{File.basename(filename)}"
    Content-Type: text/html; charset="utf-8"

    #{result}

  PREAMBLE
end

.mime_type(item) ⇒ Object



43
44
45
46
47
48
# File 'lib/html2doc/mime.rb', line 43

# Determines the Content-Type value for a file name.
#
# Takes the first type MIME::Types reports for the name; text types get a
# UTF-8 charset parameter. Unknown names fall back to UTF-8 plain text.
#
# Two fixes: the lookup result was tested for truthiness, but an empty
# result is still truthy, so unknown names produced an empty type instead
# of the fallback; and the charset parameter was appended without the ";"
# separator that the Content-Type syntax requires.
#
# @param item [String] file name
# @return [String] content type, with charset parameter for text types
def self.mime_type(item)
  detected = Array(MIME::Types.type_for(item)).first
  return 'text/plain; charset="utf-8"' if detected.nil?

  type = detected.to_s
  /^text/.match?(type) ? %(#{type}; charset="utf-8") : type
end

.mkuuid ⇒ Object



95
96
97
# File 'lib/html2doc/mime.rb', line 95

# Returns a freshly generated random UUID in its string form.
#
# @return [String] the UUID
def self.mkuuid
  uuid = UUIDTools::UUID.random_create
  uuid.to_s
end

.msonormal(docxml) ⇒ Object



205
206
207
208
209
210
211
212
# File 'lib/html2doc/base.rb', line 205

# Gives every p and every li element without a class attribute the class
# MsoNormal.
#
# @param docxml parsed document, modified in place
# @return the collection of li elements that was updated
def self.msonormal(docxml)
  %w[p li].map do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
      .each { |node| node["class"] = "MsoNormal" }
  end.last
end

.msword_fix(doc) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/html2doc/base.rb', line 92

# Adjusts the serialised HTML to what the MS Word parser accepts.
#
# The substitutions are applied to the given string in place: empty
# footnote spans are expanded, the empty footnote-list div and several
# empty VML/Office elements are collapsed to self-closing form, quotes are
# dropped from two head attributes, and "&tab;" becomes a tab-count span.
# Afterwards whitespace between tags inside <m:oMath> is removed; that
# last step only affects the returned string.
#
# @param doc [String] serialised document; modified in place
# @return [String] the fixed document
def self.msword_fix(doc)
  collapsed = %w[m:jc v:stroke v:f v:path o:lock v:imagedata w:wrap]
    .map { |tag| [%r{></#{tag}>}, "/>"] }
  substitutions = [
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    [%r{<div style="mso-element:footnote-list"></div>},
     '<div style="mso-element:footnote-list"/>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
    *collapsed,
    [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
  ]
  substitutions.each { |pattern, replacement| doc.gsub!(pattern, replacement) }

  # Pieces come in groups of [text, <m:oMath>, math content, </m:oMath>].
  groups = doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |group|
    group[2] = group[2].gsub(/>\s+</, "><") if group.size > 2
    group
  end
  groups.join
end

.namespace(root) ⇒ Object



179
180
181
182
183
184
185
186
# File 'lib/html2doc/base.rb', line 179

# Declares the Office (o), Word (w), VML (v) and Office math (m)
# namespace prefixes on the given element.
#
# @param root element receiving the namespace definitions
# @return [Hash] the prefix => URI table that was applied
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooxml_cleanup(math, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly



45
46
47
48
49
50
51
52
53
# File 'lib/html2doc/math.rb', line 45

# Prepares a MathML element for conversion: inserts missing mrow
# wrappers, preserves mtext spacing, unwraps accent rows, and sets the
# MathML namespace as the element's default namespace.
#
# @param math          math element, modified in place
# @param docnamespaces namespace map used for the XPath queries
# @return the cleaned math element
def self.ooxml_cleanup(math, docnamespaces)
  with_rows = mathml_insert_rows(math, docnamespaces)
  with_spaces = mathml_preserve_space(with_rows, docnamespaces)
  cleaned = unwrap_accents(with_spaces)
  cleaned.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  cleaned
end

.process(result, hash) ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/html2doc/base.rb', line 8

# Converts an HTML document into a Word .doc (MHTML) file.
#
# Steps: prepare the auxiliary directory (stored in hash[:dir1]), convert
# the HTML, process the optional header file, write filelist.xml and
# "<filename>.htm", build the MIME package, and remove the temporary
# files unless hash[:debug] is set.
#
# @param result [String] source HTML
# @param hash   [Hash] options; reads :filename, :dir, :header_file and
#   :debug, and writes :dir1
def self.process(result, hash)
  filename = hash[:filename]
  hash[:dir1] = create_dir(filename, hash[:dir])
  html = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(filename, hash[:dir1])
  File.open("#{filename}.htm", "w:UTF-8") { |out| out.write(html) }
  mime_package(html, filename, hash[:dir1])
  return if hash[:debug]

  rm_temp_files(filename, hash[:dir], hash[:dir1])
end


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/html2doc/notes.rb', line 50

# Turns one footnote link into a Word footnote reference and collects the
# footnote body it points to.
#
# The link is ignored (false is returned) when it is not marked as a
# footnote, when it has no href, or when no element carries the referenced
# name or id. Otherwise the link receives the Word footnote attributes,
# its content is replaced by the FN marker (content next to an existing
# MsoFootnoteReference span is kept, wrapped in such a span), and the
# transformed footnote body is appended to the footnote list.
#
# A footnote-marked anchor without href used to raise NoMethodError on
# nil; it is now skipped.
#
# @param docxml   parsed document, modified in place
# @param elem     candidate link element
# @param idx      [Integer] number the footnote gets if accepted
# @param footnote [Array] collector for footnote bodies
# @return false when the link is skipped, otherwise the collector
def self.process_footnote_link(docxml, elem, idx, footnote)
  return false unless footnote?(elem)

  href = elem["href"]&.gsub(/^#/, "")
  return false if href.nil?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(elem, idx)
  if elem.at("./span[@class = 'MsoFootnoteReference']")
    elem.children.each do |c|
      if c.name == "span" && c["class"] == "MsoFootnoteReference"
        c.replace(FN)
      else
        c.wrap("<span class='MsoFootnoteReference'></span>")
      end
    end
  else
    elem.children = FN
  end
  footnote << transform_footnote_text(note)
end

.process_footnote_texts(docxml, footnotes) ⇒ Object



15
16
17
18
19
20
21
22
23
24
# File 'lib/html2doc/notes.rb', line 15

# Appends the footnote list to the document body and fills it.
#
# A footnote-list div is added to the body; each collected footnote gets a
# numbered container (starting at 1), is moved into it, and has its
# div/aside wrapper normalised. The anchors are tidied up at the end.
#
# @param docxml    parsed document, modified in place
# @param footnotes [Array] footnote body elements, in order
# @return the result of footnote_cleanup
def self.process_footnote_texts(docxml, footnotes)
  list = docxml.at("//body").add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(docxml, number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/html2doc/base.rb', line 18

# Copies the header file into the auxiliary directory as header.html,
# after its image references have been processed.
#
# @param headerfile [String, nil] path of the header file; nothing is
#   done when nil
# @param hash       [Hash] options; reads :dir1 and :filename
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  raw = File.read(headerfile, encoding: "utf-8")
  localdir = File.dirname(hash[:filename])
  cleaned = header_image_cleanup(raw, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |out| out.write(cleaned) }
end

.process_html(result, hash) ⇒ Object



42
43
44
45
46
# File 'lib/html2doc/base.rb', line 42

# Converts the source HTML into Word-compatible HTML text: AsciiMath is
# converted, the text is parsed, the document is cleaned up and its head
# completed, and the result is serialised and fixed for Word.
#
# @param result [String] source HTML
# @param hash   [Hash] options; :asciimathdelims is read here, the rest
#   is passed on
# @return [String] the converted HTML
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  msword_fix(from_xhtml(docxml))
end

.rm_temp_files(filename, dir, dir1) ⇒ Object



48
49
50
51
52
# File 'lib/html2doc/base.rb', line 48

# Removes the intermediate files of a conversion: "<filename>.htm", the
# copied header.html (if present), and the auxiliary directory itself
# when it was not supplied by the caller.
#
# @param filename [String] output name without extension
# @param dir      caller-supplied directory, or nil/false when dir1 was
#   created by the conversion
# @param dir1     [String] auxiliary directory in use
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm("#{filename}.htm")
  FileUtils.rm_f("#{dir1}/header.html")
  FileUtils.rm_r(dir1) if !dir
end

.rootnamespace(root) ⇒ Object



188
189
190
# File 'lib/html2doc/base.rb', line 188

# Sets the HTML 4.0 recommendation URI as the default namespace of the
# given element.
#
# @param root element receiving the default namespace
# @return whatever add_namespace returns
def self.rootnamespace(root)
  html40 = "http://www.w3.org/TR/REC-html40"
  root.add_namespace(nil, html40)
end


87
88
89
90
91
92
# File 'lib/html2doc/notes.rb', line 87

# Sets the attributes that make a link a Word footnote reference: the
# mso-footnote-id style, the href to "#_ftn<idx>", the back-reference
# name "_ftnref<idx>", and an empty title.
#
# @param elem link element, modified in place
# @param idx  [Integer] footnote number
def self.set_footnote_link_attrs(elem, idx)
  {
    "style" => "mso-footnote-id:ftn#{idx}",
    "href" => "#_ftn#{idx}",
    "name" => "_ftnref#{idx}",
  }.each { |attr, value| elem[attr] = value }
  elem["title"] = ""
end

.style_list(li, level, liststyle, listnumber) ⇒ Object



8
9
10
11
12
13
14
15
16
17
# File 'lib/html2doc/lists.rb', line 8

# Appends the Word list declaration "mso-list:<style> level<n> lfo<m>;"
# to an item's style attribute, separated from any existing style by ";".
#
# @param li         list item, modified in place
# @param level      [Integer] nesting level
# @param liststyle  Word list style; nothing is done when nil or false
# @param listnumber [Integer] number of the enclosing top-level list
# @return [String, nil] the new style value, or nil when nothing was done
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  existing = li["style"]
  prefix = existing ? "#{existing};" : ""
  li["style"] = "#{prefix}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(filename, header_filename, fn) ⇒ Object



150
151
152
153
154
155
156
157
# File 'lib/html2doc/base.rb', line 150

# Builds a <style> element whose content is the stylesheet file wrapped
# in an XML comment.
#
# @param filename        not used by this method
# @param header_filename not used by this method
# @param fn              [String, nil] path of the CSS file; when nil or
#   empty, wordstyle.css next to this source file is used
# @return [String] serialised <style> element
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = File.read(fn, encoding: "UTF-8")
  holder = Nokogiri::XML("<style/>")
  comment = Nokogiri::XML::Comment.new(holder, "\n#{css}\n")
  holder.children.first << comment
  holder.root.to_s
end

.to_plane1(xml, font) ⇒ Object



118
119
120
121
122
123
124
125
# File 'lib/html2doc/math.rb', line 118

# Replaces every text node of the tree with the result of converting its
# entity-decoded text through Plane1Converter for the given font.
#
# @param xml  tree to walk, modified in place
# @param font font key passed to Plane1Converter.conv
# @return the same xml object
def self.to_plane1(xml, font)
  xml.traverse do |node|
    if node.text?
      decoded = HTMLEntities.new.decode(node.text)
      node.replace(Plane1Converter.conv(decoded, font))
    end
  end
  xml
end

.to_xhtml(xml) ⇒ Object



73
74
75
76
77
78
79
80
# File 'lib/html2doc/base.rb', line 73

# Parses HTML text as XML after removing XML declarations and adding an
# XHTML DOCTYPE when the text has none.
#
# The text is no longer modified in place: the previous gsub! changed the
# caller's string and raised FrozenError for frozen input.
#
# @param xml [String] HTML text; left untouched
# @return the document returned by Nokogiri::XML.parse
def self.to_xhtml(xml)
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match?(xml)
    xml = "<!DOCTYPE html SYSTEM\n" \
          "        \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">#{xml}"
  end
  Nokogiri::XML.parse(xml)
end

.transform_footnote_text(note) ⇒ Object



72
73
74
75
76
77
78
79
80
# File 'lib/html2doc/notes.rb', line 72

# Prepares a footnote body for the footnote list and detaches it.
#
# The id is blanked, nested divs are replaced by their children, and every
# aside or p inside becomes a p with class MsoFootnoteText.
#
# @param note footnote body element, modified in place
# @return the result of removing the note from its document
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each { |wrapper| wrapper.replace(wrapper.children) }
  note.xpath(".//aside | .//p").each do |block|
    block.name = "p"
    block["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(math, ooxml) ⇒ Object

If the OOXML math has no siblings, it is centered by default; override this with left/right alignment if the parent is so tagged.



158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/html2doc/math.rb', line 158

# Wraps converted math in an explicitly aligned paragraph when needed.
#
# Only applies when the math element has neither a next nor a previous
# sibling and a styled p/div/td ancestor is found. For each of "left" and
# "right" that appears as "text-align:<dir>" in that style, the markup is
# wrapped in an m:oMathPara carrying that justification.
#
# @param math  original math element
# @param ooxml [String] converted markup
# @return [String] the markup, wrapped where applicable
def self.uncenter(math, ooxml)
  style = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
                  "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless style && math.next.nil? && math.previous.nil?

  %w(left right).reduce(ooxml) do |markup, dir|
    next markup unless style.text.include?("text-align:#{dir}")

    "<m:oMathPara><m:oMathParaPr><m:jc "\
      "m:val='#{dir}'/></m:oMathParaPr>#{markup}</m:oMathPara>"
  end
end

.unitalic(math) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/html2doc/math.rb', line 72

# Applies font styling to the runs of converted math.
#
# Runs without a script setting are wrapped in HTML spans according to
# their sty value (p, bi, i, b). Runs with a script setting (monospace,
# double-struck, script, fraktur, sans-serif, optionally combined with a
# sty value) have their text converted through to_plane1 with the matching
# font key. The queries run in a fixed order.
#
# @param math converted math tree, modified in place
# @return the same math object
def self.unitalic(math)
  span_for_sty = {
    "p" => "<span style='font-style:normal;'></span>",
    "bi" => "<span class='nostem' style='font-weight:bold;'><em></em></span>",
    "i" => "<span class='nostem'><em></em></span>",
    "b" => "<span style='font-style:normal;font-weight:bold;'></span>",
  }
  span_for_sty.each do |sty, span|
    math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = '#{sty}']]")
      .each { |run| run.wrap(span) }
  end

  plain = "[not(xmlns:sty) or xmlns:sty/@m:val = 'p']"
  bold = "[xmlns:sty/@m:val = 'b']"
  # [filter on rPr, script value, font key]
  script_fonts = [
    ["", "monospace", :monospace],
    ["", "double-struck", :doublestruck],
    [plain, "script", :script],
    [bold, "script", :scriptbold],
    [plain, "fraktur", :fraktur],
    [bold, "fraktur", :frakturbold],
    [plain, "sans-serif", :sans],
    [bold, "sans-serif", :sansbold],
    ["[xmlns:sty/@m:val = 'i']", "sans-serif", :sansitalic],
    ["[xmlns:sty/@m:val = 'bi']", "sans-serif", :sansbolditalic],
  ]
  script_fonts.each do |sty_filter, script, font|
    math.xpath(".//xmlns:r[xmlns:rPr#{sty_filter}/xmlns:scr[@m:val = '#{script}']]")
      .each { |run| to_plane1(run, font) }
  end
  math
end

.unwrap_accents(doc) ⇒ Object



35
36
37
38
39
40
41
42
# File 'lib/html2doc/math.rb', line 35

# Removes the mrow around the accent of accented constructs: for every
# element with accent='true' and at least two child elements, a second
# child named mrow is replaced by its own children.
#
# @param doc tree to process, modified in place
# @return the same doc object
def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next unless accented.elements.length > 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end

.warnsvg(src) ⇒ Object



99
100
101
# File 'lib/html2doc/mime.rb', line 99

# Emits a warning when the given source name ends in ".svg" (any case).
#
# @param src [String, nil] image source
def self.warnsvg(src)
  return unless /\.svg$/i.match?(src)

  warn "#{src}: SVG not supported"
end