Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

NOKOHEAD =
<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE
DOCTYPE =
<<~"DOCTYPE".freeze
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
DOCTYPE
PRINT_VIEW =
<<~XML.freeze
  <!--[if gte mso 9]>
  <xml>
  <w:WordDocument>
  <w:View>Print</w:View>
  <w:Zoom>100</w:Zoom>
  <w:DoNotOptimizeForBrowser/>
  </w:WordDocument>
  </xml>
  <![endif]-->
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
XML
IMAGE_PATH =
"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
TOPLIST =
"[not(ancestor::ul) and not(ancestor::ol)]".freeze
FN =
"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze
VERSION =
"1.1.0".freeze

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object



168
169
170
171
172
173
174
175
176
# File 'lib/html2doc/base.rb', line 168

# Inserts the stylesheet markup +css+ into the document head.
#
# Placement rules, in order:
# * the head has no children: +css+ becomes its only child;
# * there is a +title+: +css+ is placed right after it;
# * otherwise: +css+ is placed before the head's first child.
#
# Returns whatever the node insertion call returns.
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return title.add_next_sibling(css) unless title.nil?

  head.children.first.add_previous_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ Object



24
25
26
27
28
29
30
31
32
33
# File 'lib/html2doc/math.rb', line 24

# Replaces every AsciiMath expression in +doc+ with MathML.
#
# @param doc    [String] source text
# @param delims [Array<String>, nil] opening and closing delimiter; when nil
#   or shorter than two entries, +doc+ is returned untouched
# @return [String] text with the delimiters dropped and each delimited
#   expression converted via asciimath_to_mathml1
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  # Splitting with a capture group yields repeating groups of
  # [text, delimiter, expression, delimiter].
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  total = m.size / 4
  m.each_slice(4).map.with_index do |a, i|
    # Progress report for very large inputs only.
    warn "MathML #{i} of #{total}" if i.positive? && (i % 500).zero? && m.size > 1000
    a[2] = asciimath_to_mathml1(a[2]) unless a[2].nil?
    # a[2] is nil when the input ends on a dangling delimiter (or there is
    # no expression at all); to_s keeps that from raising TypeError.
    a[0] + a[2].to_s
  end.join
end

.asciimath_to_mathml1(x) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
# File 'lib/html2doc/math.rb', line 12

# Converts one AsciiMath expression +x+ (possibly containing HTML entities)
# into a MathML string whose <math> element carries the MathML namespace.
#
# On failure the offending expression and the error message are printed,
# then the error is re-raised.
def self.asciimath_to_mathml1(x)
  builder = AsciiMath::MathMLBuilder.new(:msword => true)
  ast = AsciiMath.parse(HTMLEntities.new.decode(x)).ast
  mathml = builder.append_expression(ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
rescue StandardError => e
  puts "parsing: #{x}"
  puts e.message
  raise e
end

.bookmarks(docxml) ⇒ Object



191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/html2doc/base.rb', line 191

# Turns element ids into named anchors.
#
# For each element that has a non-empty id, has no name attribute and is not
# a footnote container, an <a name='ID'></a> is inserted as its first child
# and the id attribute is removed. VML shape/shapetype elements are skipped.
def self.bookmarks(docxml)
  skipped = %w(shapetype v:shapetype shape v:shape)
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]").each do |node|
    id = node["id"]
    next if id.empty? || skipped.include?(node.name)

    anchor = "<a name='#{id}'></a>"
    kids = node.children
    if kids.empty?
      node.add_child(anchor)
    else
      kids.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ Object



54
55
56
57
58
59
60
61
62
63
# File 'lib/html2doc/base.rb', line 54

# Runs the document clean-up passes over +docxml+, in a fixed order:
# namespaces, images, MathML conversion, lists, footnotes, bookmarks and
# default paragraph classes. Reads :dir1, :filename and :liststyles from
# +hash+. Returns +docxml+.
def self.cleanup(docxml, hash)
  namespace(docxml.root)
  source_dir = File.dirname(hash[:filename])
  image_cleanup(docxml, hash[:dir1], source_dir)
  mathml_to_ooml(docxml)
  lists(docxml, hash[:liststyles])
  # The remaining passes all take just the document.
  %i[footnotes bookmarks msonormal].each { |pass| send(pass, docxml) }
  docxml
end

.clear_dir(dir) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/html2doc/base.rb', line 27

# Deletes every entry directly inside +dir+ (other than "." and "..") with
# File.delete and returns +dir+.
def self.clear_dir(dir)
  Dir.foreach(dir) do |entry|
    next if entry == '.' || entry == '..'

    File.delete(File.join(dir, entry))
  end
  dir
end

.contentid(mhtml) ⇒ Object



68
69
70
71
72
73
# File 'lib/html2doc/mime.rb', line 68

# Rewrites the src of every <img> in +mhtml+ to "cid:<basename>".
# Sources starting with "data:", "http:" or "https:" are left as they are.
def self.contentid(mhtml)
  mhtml.gsub(%r{(<img[^>]*?src=")([^\"']+)(['"])}m) do |whole|
    prefix, src, quote = Regexp.last_match.captures
    next whole if /^data:|^https?:/.match(src)

    "#{prefix}cid:#{File.basename(src)}#{quote}"
  end
end

.create_dir(filename, dir) ⇒ Object



35
36
37
38
39
40
# File 'lib/html2doc/base.rb', line 35

# Prepares the directory that will hold the document's auxiliary files.
#
# @param filename [String] output base name; used to derive
#   "<filename>_files" when no directory is supplied
# @param dir [String, nil] caller-supplied directory; when given it is only
#   emptied (it is not created here)
# @return the value of clear_dir for the directory used
def self.create_dir(filename, dir)
  return clear_dir(dir) if dir

  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  Dir.mkdir(dir) unless File.exist?(dir)
  clear_dir(dir)
end

.define_head(docxml, hash) ⇒ Object



159
160
161
162
163
164
165
166
# File 'lib/html2doc/base.rb', line 159

# Populates the document head: adds the stylesheet built from
# hash[:filename], hash[:header_file] and hash[:stylesheet], then the extra
# head entries from define_head1, and finally sets the root's default
# namespace via rootnamespace.
def self.define_head(docxml, hash)
  head = docxml.at("//*[local-name() = 'head']")
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, dir) ⇒ Object



130
131
132
133
134
135
136
137
# File 'lib/html2doc/base.rb', line 130

# Prepends the PRINT_VIEW block and the File-List link to every head element
# of +docxml+. +dir+ is accepted but not used here.
def self.define_head1(docxml, dir)
  preamble = <<~XML
    #{PRINT_VIEW}
      <link rel="File-List" href="cid:filelist.xml"/>
  XML
  docxml.xpath("//*[local-name() = 'head']").each do |head|
    head.children.first.add_previous_sibling(preamble)
  end
end

.esc_space(xml) ⇒ Object

escape space as &#x32;; we are removing any spaces generated by XML indentation



142
143
144
145
146
147
148
# File 'lib/html2doc/math.rb', line 142

# Walks every node under +xml+ and returns +xml+.
#
# NOTE(review): as written this does not change the document. The gsub
# result is only assigned to the block-local +n+ and is never written back
# to the node. In addition, "&#x32;" is the character reference for "2"
# (U+0032), not for a space (U+0020). Confirm the intended escape and the
# intended write-back before relying on this pass.
def self.esc_space(xml)
  xml.traverse do |n|
    next unless n.text?
    # Rebinds the local variable only; the text node itself is untouched.
    n = n.text.gsub(/ /, "&#x32;")
  end
  xml
end

.filename_substitute(stylesheet, header_filename, filename) ⇒ Object



139
140
141
142
143
144
145
146
147
# File 'lib/html2doc/base.rb', line 139

# Points stylesheet url("...") references that mention FILENAME at the
# packaged header, i.e. replaces them with url(cid:header.html).
# When +header_filename+ is nil the stylesheet is returned unchanged.
# +filename+ is accepted but not used here.
def self.filename_substitute(stylesheet, header_filename, filename)
  return stylesheet if header_filename.nil?

  stylesheet.gsub(/url\("[^"]+"\)/) do |reference|
    next reference unless reference.include?("FILENAME")

    "url(cid:header.html)"
  end
end

.footnote?(a) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
82
# File 'lib/html2doc/notes.rb', line 79

# Tells whether anchor +a+ is marked as a footnote reference, i.e. its
# epub:type or class attribute equals "footnote" ignoring case.
# Returns true, false or nil (nil when the attribute consulted last is absent).
def self.footnote?(a)
  epub_type = a["epub:type"]
  return true if epub_type&.casecmp("footnote")&.zero?

  a["class"]&.casecmp("footnote")&.zero?
end

.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem



96
97
98
99
100
101
102
103
# File 'lib/html2doc/notes.rb', line 96

# Moves each footnote's leading <a> (a direct child of the footnote div)
# to the front of the element that follows it. Anchors with no following
# element, or whose following element has no children, are left in place.
# Returns +docxml+.
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    first_node = anchor.next_element&.children&.first
    first_node&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(docxml, i) ⇒ Object



39
40
41
42
43
44
45
46
47
# File 'lib/html2doc/notes.rb', line 39

# Builds the markup of the container div for footnote number +i+.
#
# The back-reference anchor reuses the content of the body anchor that
# links to "#_ftn<i>"; when no such anchor exists the default FN marker is
# used instead.
#
# @param docxml document searched for the referencing anchor (may be nil)
# @param i [Integer] footnote number
# @return [String] markup of the footnote container
def self.footnote_container(docxml, i)
  anchor = docxml&.at("//a[@href='#_ftn#{i}']")
  markup = anchor&.children&.to_xml(indent: 0)
  # `&.` only guards a single call, so the gsub must be guarded separately;
  # otherwise a missing anchor raises instead of falling back to FN.
  ref = markup ? markup.gsub(/>\n</, "><") : FN
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'>#{ref.strip}</a></div>
  DIV
end

.footnote_div_to_p(f) ⇒ Object



25
26
27
28
29
30
31
32
33
34
# File 'lib/html2doc/notes.rb', line 25

# Normalises a footnote text container +f+.
#
# Only div and aside elements are touched: one that already contains a p is
# replaced by its children; otherwise the element itself is renamed to p
# and given the MsoFootnoteText class.
def self.footnote_div_to_p(f)
  return unless %w{div aside}.include?(f.name)
  return f.replace(f.children) if f.at(".//p")

  f.name = "p"
  f["class"] = "MsoFootnoteText"
end

.footnotes(docxml) ⇒ Object



4
5
6
7
8
9
10
11
12
# File 'lib/html2doc/notes.rb', line 4

# Converts footnote links and their texts into Word footnotes.
#
# Every <a> is offered to process_footnote_link; the footnote number only
# advances for anchors it accepts. The collected footnote texts are then
# handed to process_footnote_texts.
def self.footnotes(docxml)
  fn = []
  number = 1
  docxml.xpath("//a").each do |anchor|
    number += 1 if process_footnote_link(docxml, anchor, number, fn)
  end
  process_footnote_texts(docxml, fn)
end

.from_xhtml(xml) ⇒ Object



86
87
88
89
90
# File 'lib/html2doc/base.rb', line 86

# Serialises +xml+ and strips the XHTML scaffolding: the first XHTML
# namespace declaration, the DOCTYPE line, and the space before "/>" in
# self-closing tags.
def self.from_xhtml(xml)
  markup = xml.to_xml
  markup = markup.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  markup = markup.sub(DOCTYPE, "")
  markup.gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
# File 'lib/html2doc/mime.rb', line 137

# Writes <dir>/filelist.xml, naming "../<filename>.htm" as the main file and
# listing every non-hidden entry of +dir+ in sorted order. The listing is
# taken after the output file has been opened, so filelist.xml lists itself.
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">\n      <o:MainFile HRef="../#{filename}.htm"/>}
    # The regex also covers "." and "..".
    visible = Dir.entries(dir).sort.reject { |item| /^\./.match(item) }
    visible.each { |item| out.write %{  <o:File HRef="#{item}"/>\n} }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>



117
118
119
120
121
# File 'lib/html2doc/mime.rb', line 117

# Processes the images of a header file given as raw text +doc+.
# The text is cut at every <img ...> / <v:imagedata ...> tag and each
# [text, tag] pair is passed to header_image_cleanup1; the results are
# concatenated.
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  rewritten = pieces.each_slice(2).map do |pair|
    header_image_cleanup1(pair, dir, filename, localdir)
  end
  rewritten.join
end

.header_image_cleanup1(a, dir, filename, localdir) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/html2doc/mime.rb', line 123

# Rewrites one [text, image tag] pair from the header file.
#
# A tag that references a local image has the image copied into +dir+ under
# a fresh UUID-based name and its src replaced by "cid:<new name>". Tags
# with no src, or with http(s) or base64 data sources, are left untouched
# whichever quote character they use.
#
# @param a [Array<String>] one or two strings: preceding text and, when
#   present, the image tag (modified in place)
# @param dir [String] directory receiving the copied image
# @param filename unused here
# @param localdir [String] base directory for relative image paths
# @return [String] the joined pair
def self.header_image_cleanup1(a, dir, filename, localdir)
  tag = a.size == 2 ? a[1] : nil
  src = tag && tag[/ src=['"]([^"']+)['"]/, 1]
  if src && src !~ /\Ahttps?:/ &&
      src !~ %r{\Adata:(image|application)/[^;]+;base64}
    #warnsvg(src)
    suffix = src[/\.([a-zA-Z_0-9]+)$/, 1]
    # Files without an extension keep a bare UUID name instead of crashing.
    new_filename = suffix ? "#{mkuuid}.#{suffix}" : mkuuid
    old_filename = %r{^([A-Z]:)?/}.match(src) ? src : File.join(localdir, src)
    FileUtils.cp old_filename, File.join(dir, new_filename)
    tag.sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
  end
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/html2doc/mime.rb', line 99

# Copies locally stored images into +dir+ and repoints the document at the
# copies.
#
# For every img / v:imagedata element whose src is a local path, the file
# is copied to +dir+ under a UUID-based name (same extension), width and
# height are set from image_resize, and src becomes
# "<basename of dir>/<new name>". Elements with no src, or with http or
# base64 data sources, are skipped.
#
# @param docxml   document to traverse
# @param dir      [String] directory receiving the copies
# @param localdir [String] base directory for relative image paths
# @return +docxml+
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |i|
    next unless i.element? && %w(img v:imagedata).include?(i.name)
    src = i["src"]
    # An image without a source has nothing to copy; skip it rather than
    # failing in File.join below.
    next if src.nil? || src.empty?
    #warnsvg(src)
    next if /^http/.match src
    next if %r{^data:(image|application)/[^;]+;base64}.match src
    local_filename = %r{^([A-Z]:)?/}.match(src) ? src : File.join(localdir, src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
    i["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(i, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680



76
77
78
79
80
81
82
83
84
85
86
# File 'lib/html2doc/mime.rb', line 76

# Computes the [width, height] to declare for image element +i+.
#
# Starts from the element's width/height attributes, falling back to the
# file's real size when neither is set, and fills a single missing
# dimension from the real aspect ratio. The result is then scaled down,
# keeping the ratio, to fit +maxheight+ and then +maxwidth+.
# Returns [nil, nil] when the real size cannot be determined.
def self.image_resize(i, path, maxheight, maxwidth)
  real_size = ImageSize.path(path).size
  width = i["width"].to_i
  height = i["height"].to_i
  return [nil, nil] if real_size.nil? || real_size[0].nil? || real_size[1].nil?

  real_width = real_size[0]
  real_height = real_size[1]
  if width.zero? && height.zero?
    width = real_width
    height = real_height
  elsif height.zero?
    height = width * real_height / real_width
  elsif width.zero?
    width = height * real_width / real_height
  end
  if height > maxheight
    width = (width * maxheight / height).ceil
    height = maxheight
  end
  if width > maxwidth
    height = (height * maxwidth / width).ceil
    width = maxwidth
  end
  [width, height]
end

.list2para(u) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/html2doc/lists.rb', line 48

# Flattens list +u+ into Word list paragraphs.
#
# Items without a class get MsoListParagraphCxSpFirst (first item),
# MsoListParagraphCxSpLast (last item) or MsoListParagraphCxSpMiddle; each
# li is renamed to p, a leading p child is unwrapped, and finally the list
# element itself is replaced by its children. Lists with no li children are
# left alone.
def self.list2para(u)
  items = u.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  u.xpath("./li/p").each { |para| para["class"] ||= "MsoListParagraphCxSpMiddle" }
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    lead = item&.first_element_child
    lead.replace(lead.children) if lead&.name == "p"
  end
  u.replace(u.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/html2doc/lists.rb', line 32

# Applies Word list styling to each list in +xpath+ and recurses into the
# lists nested inside them.
#
# xpath      - collection of list nodes to process
# liststyles - hash of list styles, looked up by +listtype+
# listtype   - key into +liststyles+ used for the items of these lists
# level      - nesting level passed on to style_list; 1 marks top-level lists
def self.list_add(xpath, liststyles, listtype, level)
  xpath.each_with_index do |list, i|
    # Every top-level list advances the shared list counter and is flagged
    # with a "seen" attribute.
    @listnumber += 1 if level == 1
    list["seen"] = true if level == 1
    # Give the list an id if it has none; the XPath further down refers to it.
    list["id"] ||= UUIDTools::UUID.random_create
    # Items belonging to this list itself: all descendant li minus those that
    # sit inside a nested ol/ul.
    (list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")).each do |li|
      style_list(li, level, liststyles[listtype], @listnumber)
      list_add1(li, liststyles, listtype, level)
    end
    # Nested ul/ol that have no li ancestor lying below an element carrying
    # this list's id — presumably lists nested directly in the list rather
    # than inside one of its items (TODO confirm). Their parent is processed
    # one level up so that the nested list ends up at +level+.
    list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
               ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]").each do |li|
      list_add1(li.parent, liststyles, listtype, level-1)
    end
  end
end

.list_add1(li, liststyles, listtype, level) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/html2doc/lists.rb', line 18

# Styles the lists nested directly under +li+, one level deeper.
#
# Only the outermost nested ul / ol are selected (those inside a further
# nested list are excluded). For the generic :ul / :ol list types each
# nested list is styled according to its own tag; for any other list type
# the nested lists inherit +listtype+.
def self.list_add1(li, liststyles, listtype, level)
  ul_type, ol_type = [:ul, :ol].include?(listtype) ? [:ul, :ol] : [listtype, listtype]
  nested_ul = li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul")
  list_add(nested_ul, liststyles, ul_type, level + 1)
  nested_ol = li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol")
  list_add(nested_ol, liststyles, ol_type, level + 1)
end

.lists(docxml, liststyles) ⇒ Object



87
88
89
90
91
92
93
94
# File 'lib/html2doc/lists.rb', line 87

# Entry point for list processing. Does nothing when +liststyles+ is nil.
#
# Resets the list counter, styles the lists for every key of +liststyles+,
# handles the remaining unstyled lists, and finally flattens all ul (when a
# :ul style exists) and all ol (when an :ol style exists) into paragraphs.
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key do |style_name|
    lists1(docxml, liststyles, style_name)
  end
  lists_unstyled(docxml, liststyles)
  flattened = %w[ul ol].map do |tag|
    liststyles.has_key?(tag.to_sym) && docxml.xpath("//#{tag}").each { |list| list2para(list) }
  end
  # The outcome of the ol pass is the method's value.
  flattened.last
end

.lists1(docxml, liststyles, k) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/html2doc/lists.rb', line 64

def self.lists1(docxml, liststyles, k)
  case k
  when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
                          liststyles, :ul, 1)
  when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
                         liststyles, :ol, 1)
  else
    list_add(docxml.xpath("//ol[@class = '#{k.to_s}']#{TOPLIST} | "\
                          "//ul[@class = '#{k.to_s}']#{TOPLIST}"),
    liststyles, k, 1)
  end
end

.lists_unstyled(docxml, liststyles) ⇒ Object



77
78
79
80
81
82
83
84
85
# File 'lib/html2doc/lists.rb', line 77

# Styles the top-level lists that no earlier pass has marked as seen, then
# removes the temporary "seen" attribute from every list.
#
# Unseen ul are styled as :ul when a :ul style exists, and unseen ol as :ol
# when an :ol style exists. (The ol branch previously passed :ul, so ordered
# lists were given the bullet style, or none when only :ol was configured.)
#
# @param docxml     document being processed
# @param liststyles [Hash] list styles keyed by list type
def self.lists_unstyled(docxml, liststyles)
  %i[ul ol].each do |tag|
    next unless liststyles.has_key?(tag)

    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"), liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.mathml_insert_rows(m, docnamespaces) ⇒ Object



52
53
54
55
56
57
58
59
# File 'lib/html2doc/math.rb', line 52

# Wraps the element following each script / limit construct (msup, msub,
# msubsup, munder, mover, munderover) in an <mrow>, unless it is already an
# mrow or there is no following element.
#
# @param m             MathML node to process
# @param docnamespaces namespace bindings used by the XPath query
# @return +m+
def self.mathml_insert_rows(m, docnamespaces)
  query = %w(msup msub msubsup munder mover munderover).
    map { |tag| ".//xmlns:#{tag}" }.join(" | ")
  m.xpath(query, docnamespaces).each do |x|
    following = x.next_element
    # Compare the element's name: comparing the node itself to a String is
    # never equal, which made every following element get wrapped.
    next if following.nil? || following.name == "mrow"

    following.wrap("<mrow/>")
  end
  m
end

.mathml_preserve_space(m, docnamespaces) ⇒ Object



61
62
63
64
65
66
# File 'lib/html2doc/math.rb', line 61

# Protects leading and trailing whitespace in every mtext element of +m+ by
# replacing it with a non-breaking space reference (&#xA0;). Returns +m+.
def self.mathml_preserve_space(m, docnamespaces)
  m.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    markup = mtext.children.to_xml
    mtext.children = markup.gsub(/^\s/, "&#xA0;").gsub(/\s$/, "&#xA0;")
  end
  m
end

.mathml_to_ooml(docxml) ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/html2doc/math.rb', line 122

# Replaces every MathML <math> element of +docxml+ with its Office Math
# rendering.
#
# Each element is cleaned up, transformed with @xsltemplate, post-processed
# (esc_space, unitalic), stripped of XML declarations and namespace
# declarations, given the "m:" prefix on its element names (span and em
# excepted), optionally wrapped for alignment by uncenter, and swapped into
# the document. Progress is reported on stderr for documents with more than
# 500 formulas.
def self.mathml_to_ooml(docxml)
  docnamespaces = docxml.collect_namespaces
  maths = docxml.xpath("//*[local-name() = 'math']")
  maths.each_with_index do |math, idx|
    if idx.positive? && (idx % 100).zero? && maths.size > 500
      warn "Math OOXML #{idx} of #{maths.size}"
    end
    element = ooxml_cleanup(math, docnamespaces)
    doc = Nokogiri::XML::Document.new
    doc.root = element
    transformed = unitalic(esc_space(@xsltemplate.transform(doc))).to_s
    ooxml = transformed.gsub(/<\?[^>]+>\s*/, "")
    ooxml = ooxml.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    ooxml = ooxml.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
    math.swap(uncenter(math, ooxml))
  end
end

.mime_attachment(boundary, filename, item, dir) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/html2doc/mime.rb', line 23

# Builds the MIME part for the file +item+ found in +dir+.
#
# The file is read as UTF-8 text when its MIME type starts with "text" or
# "application", and as binary otherwise; the content is Base64-encoded and
# broken into 76-character lines. +filename+ is accepted but not used here.
def self.mime_attachment(boundary, filename, item, dir)
  content_type = mime_type(item)
  path = File.join(dir, item)
  content =
    if content_type.start_with?("text") || content_type.start_with?("application")
      File.read(path, encoding: "utf-8")
    else
      IO.binread(path)
    end
  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  part_name = File.basename(item)
  <<~"FILE"
  --#{boundary}
  Content-ID: <#{part_name}>
  Content-Disposition: inline; filename="#{part_name}"
  Content-Transfer-Encoding: base64
  Content-Type: #{content_type}

  #{encoded_file}

  FILE
end

.mime_boundary ⇒ Object



50
51
52
53
# File 'lib/html2doc/mime.rb', line 50

# Generates a MIME boundary string: "----=_NextPart_" followed by the first
# 18 characters of a random UUID with its dashes turned into dots.
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  salt = uuid.tr("-", ".")[0, 18]
  "----=_NextPart_#{salt}"
end

.mime_package(result, filename, dir) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/html2doc/mime.rb', line 55

# Assembles the MIME (MHTML) package and writes it to "<filename>.doc".
#
# The package consists of the preamble carrying +result+, the filelist.xml
# part, one part for every other non-hidden entry of +dir+, and the closing
# boundary. Image sources are rewritten by contentid just before writing.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  main_part = "#{filename}.htm"
  mhtml = mime_preamble(boundary, main_part, result)
  mhtml += mime_attachment(boundary, main_part, "filelist.xml", dir)
  Dir.foreach(dir) do |item|
    # Hidden entries (which include "." and "..") are skipped, and the file
    # list has already been attached above.
    next if /^\./.match(item) || item == "filelist.xml"

    mhtml += mime_attachment(boundary, main_part, item, dir)
  end
  mhtml += "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |f| f.write contentid(mhtml) }
end

.mime_preamble(boundary, filename, result) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/mime.rb', line 8

# Builds the opening of the MIME package: the multipart/related header
# followed by the first part, which carries the HTML +result+ under the
# base name of +filename+.
def self.mime_preamble(boundary, filename, result)
  part_name = File.basename(filename)
  <<~"PREAMBLE"
  MIME-Version: 1.0
  Content-Type: multipart/related; boundary="#{boundary}"

  --#{boundary}
  Content-ID: <#{part_name}>
  Content-Disposition: inline; filename="#{part_name}"
  Content-Type: text/html; charset="utf-8"

  #{result}

  PREAMBLE
end

.mime_type(item) ⇒ Object



43
44
45
46
47
48
# File 'lib/html2doc/mime.rb', line 43

# Returns the Content-Type value to declare for the file +item+.
#
# The type is looked up from the file name. Unknown files fall back to
# UTF-8 plain text, and text types get a UTF-8 charset parameter.
#
# @param item [String] file name
# @return [String] e.g. 'text/html; charset="utf-8"' or "image/png"
def self.mime_type(item)
  # type_for returns a list, which is truthy even when empty, so the lookup
  # has to be checked for an empty result rather than for nil.
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  # Parameters are separated from the media type by ";".
  type.start_with?("text") ? %(#{type}; charset="utf-8") : type
end

.mkuuid ⇒ Object



90
91
92
# File 'lib/html2doc/mime.rb', line 90

# Returns a freshly generated random UUID as a String.
def self.mkuuid
  UUIDTools::UUID.random_create.to_s
end

.msonormal(docxml) ⇒ Object



204
205
206
207
208
209
210
211
# File 'lib/html2doc/base.rb', line 204

# Gives the MsoNormal class to every p, and then every li, element that has
# no class attribute.
def self.msonormal(docxml)
  unclassed = lambda do |tag|
    docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
  end
  unclassed.call("p").each { |node| node["class"] = "MsoNormal" }
  unclassed.call("li").each { |node| node["class"] = "MsoNormal" }
end

.msword_fix(r) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/html2doc/base.rb', line 92

# Applies textual adjustments to the serialised document +r+: specific
# elements are forced into open/close or self-closed form, a few attribute
# values lose their quotes, tab entities become tab-count spans, and
# whitespace between tags inside <m:oMath> is removed.
#
# The substitutions modify +r+ in place; the returned string is a new one.
def self.msword_fix(r)
  # Work-arounds for quirks of the MS Word parser, applied in this order.
  substitutions = [
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    [%r{<div style="mso-element:footnote-list"></div>},
     '<div style="mso-element:footnote-list"/>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
    [%r{></m:jc>}, "/>"],
    [%r{></v:stroke>}, "/>"],
    [%r{></v:f>}, "/>"],
    [%r{></v:path>}, "/>"],
    [%r{></o:lock>}, "/>"],
    [%r{></v:imagedata>}, "/>"],
    [%r{></w:wrap>}, "/>"],
    [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
  ]
  substitutions.each { |pattern, replacement| r.gsub!(pattern, replacement) }
  # Pieces come in groups of [before, open tag, math content, close tag].
  chunks = r.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |chunk|
    chunk[2] = chunk[2].gsub(/>\s+</, "><") if chunk.size > 2
    chunk
  end
  chunks.join
end

.namespace(root) ⇒ Object



178
179
180
181
182
183
184
185
# File 'lib/html2doc/base.rb', line 178

# Declares the Office (o), Word (w), VML (v) and Office Math (m) namespace
# prefixes on +root+. Returns the prefix-to-URI hash.
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooxml_cleanup(m, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly



45
46
47
48
49
50
# File 'lib/html2doc/math.rb', line 45

# Prepares a MathML node for conversion: inserts the missing mrow wrappers,
# preserves mtext spacing, unwraps accents, and sets the MathML default
# namespace on the result. Returns the prepared node.
def self.ooxml_cleanup(m, docnamespaces)
  m = mathml_insert_rows(m, docnamespaces)
  m = mathml_preserve_space(m, docnamespaces)
  m = unwrap_accents(m)
  m.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  m
end

.process(result, hash) ⇒ Object



10
11
12
13
14
15
16
17
18
# File 'lib/html2doc/base.rb', line 10

# Converts the HTML +result+ into a Word document.
#
# Steps: prepare the auxiliary directory (stored in hash[:dir1]), process
# the HTML and the optional header file, write the file list and
# "<filename>.htm", build the MIME package, and finally remove the
# temporary files unless hash[:debug] is set.
def self.process(result, hash)
  filename = hash[:filename]
  dir1 = hash[:dir1] = create_dir(filename, hash[:dir])
  result = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(filename, dir1)
  File.open("#{filename}.htm", "w:UTF-8") { |f| f.write(result) }
  mime_package(result, filename, dir1)
  rm_temp_files(filename, hash[:dir], dir1) unless hash[:debug]
end


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/html2doc/notes.rb', line 49

# Converts one anchor into a Word footnote reference, if it is one.
#
# @param docxml document searched for the footnote text
# @param a      anchor element to examine
# @param i      [Integer] footnote number to assign
# @param fn     [Array] collector; the transformed footnote text is appended
# @return false when +a+ is not a footnote link, has no href, or its target
#   cannot be found; otherwise +fn+ with the footnote text appended
def self.process_footnote_link(docxml, a, i, fn)
  return false unless footnote?(a)
  # A footnote-flagged anchor without an href points at nothing; skip it
  # instead of failing on nil.
  return false if a["href"].nil?

  href = a["href"].gsub(/^#/, "")
  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?
  set_footnote_link_attrs(a, i)
  if a.at("./span[@class = 'MsoFootnoteReference']")
    # Keep the surrounding content: the existing reference span becomes the
    # standard marker and everything else is wrapped as reference text.
    a.children.each do |c|
      if c.name == "span" and c["class"] == "MsoFootnoteReference"
        c.replace(FN)
      else
        c.wrap("<span class='MsoFootnoteReference'></span>")
      end
    end
  else
    a.children = FN
  end
  fn << transform_footnote_text(note)
end

.process_footnote_texts(docxml, footnotes) ⇒ Object



14
15
16
17
18
19
20
21
22
23
# File 'lib/html2doc/notes.rb', line 14

# Appends the footnote list to the document body and moves each collected
# footnote text into its own numbered container (numbering starts at 1),
# normalising it with footnote_div_to_p. Finishes with footnote_cleanup.
def self.process_footnote_texts(docxml, footnotes)
  body = docxml.at("//body")
  list = body.add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(docxml, number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ Object



20
21
22
23
24
25
# File 'lib/html2doc/base.rb', line 20

# Packages the optional header file: reads it as UTF-8, rewrites its image
# references via header_image_cleanup, and writes the result to
# "<hash[:dir1]>/header.html". Does nothing when +headerfile+ is nil.
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  localdir = File.dirname(hash[:filename])
  source = File.read(headerfile, encoding: "utf-8")
  doc = header_image_cleanup(source, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
end

.process_html(result, hash) ⇒ Object



42
43
44
45
46
# File 'lib/html2doc/base.rb', line 42

# Turns the input HTML into Word-ready markup: AsciiMath is converted to
# MathML, the text is parsed as XHTML, the document is cleaned up and its
# head populated, and the serialised result is passed through msword_fix.
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  msword_fix(from_xhtml(docxml))
end

.rm_temp_files(filename, dir, dir1) ⇒ Object



48
49
50
51
52
# File 'lib/html2doc/base.rb', line 48

# Removes the intermediate files of a conversion: "<filename>.htm" and
# "<dir1>/header.html" always, and the whole +dir1+ directory only when the
# caller did not supply a directory (+dir+ is nil or false).
def self.rm_temp_files(filename, dir, dir1)
  FileUtils.rm("#{filename}.htm")
  FileUtils.rm_f("#{dir1}/header.html")
  return if dir

  FileUtils.rm_r(dir1)
end

.rootnamespace(root) ⇒ Object



187
188
189
# File 'lib/html2doc/base.rb', line 187

# Sets the default (unprefixed) namespace of +root+ to the HTML 4.0 URI
# "http://www.w3.org/TR/REC-html40".
def self.rootnamespace(root)
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end


84
85
86
87
88
89
# File 'lib/html2doc/notes.rb', line 84

# Stamps anchor +a+ with the attributes of footnote reference number +i+:
# the mso-footnote-id style, the href to the footnote, the back-reference
# name, and an empty title.
def self.set_footnote_link_attrs(a, i)
  {
    "style" => "mso-footnote-id:ftn#{i}",
    "href" => "#_ftn#{i}",
    "name" => "_ftnref#{i}",
  }.each { |attribute, value| a[attribute] = value }
  a["title"] = ""
end

.style_list(li, level, liststyle, listnumber) ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/html2doc/lists.rb', line 8

# Appends the Word list declaration
# "mso-list:<liststyle> level<level> lfo<listnumber>;" to the style
# attribute of +li+, keeping any existing style (separated by ";").
# Does nothing when +liststyle+ is nil or false.
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  existing = li["style"]
  prefix = existing ? "#{existing};" : ""
  li["style"] = "#{prefix}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(filename, header_filename, fn) ⇒ Object



149
150
151
152
153
154
155
156
157
# File 'lib/html2doc/base.rb', line 149

# Builds the <style> element markup for the document.
#
# Reads the stylesheet +fn+ (the bundled wordstyle.css next to this source
# file when +fn+ is nil or empty), applies filename_substitute, and returns
# a <style> element whose content is the stylesheet wrapped in an XML
# comment.
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = File.read(fn, encoding: "UTF-8")
  css = filename_substitute(css, header_filename, filename)
  xml = Nokogiri::XML("<style/>")
  comment = Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.children.first << comment
  xml.root.to_s
end

.to_xhtml(xml) ⇒ Object



73
74
75
76
77
78
79
80
# File 'lib/html2doc/base.rb', line 73

# Parses the HTML text +xml+ as an XML document.
#
# Any XML declaration is removed first, and an XHTML strict DOCTYPE is
# prepended when the text has no DOCTYPE of its own. The input string is
# not modified, so frozen strings are accepted.
#
# @param xml [String] HTML/XHTML source
# @return the document produced by Nokogiri::XML.parse
def self.to_xhtml(xml)
  # Non-destructive gsub: the caller's string must not be altered.
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match xml
    xml = "<!DOCTYPE html SYSTEM\n        " \
          '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end

.toPlane1(x, font) ⇒ Object



114
115
116
117
118
119
120
# File 'lib/html2doc/math.rb', line 114

# Replaces every text node under +x+ with the result of
# Plane1Converter.conv for the requested +font+, after decoding HTML
# entities in the text. Returns +x+.
def self.toPlane1(x, font)
  x.traverse do |node|
    if node.text?
      decoded = HTMLEntities.new.decode(node.text)
      node.replace(Plane1Converter.conv(decoded, font))
    end
  end
  x
end

.transform_footnote_text(note) ⇒ Object



69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/notes.rb', line 69

# Prepares a footnote text node for Word and detaches it from the document.
#
# The node's id is blanked, nested divs are replaced by their children, and
# every nested aside / p becomes a p with the MsoFootnoteText class.
# Returns the result of removing +note+ from its document.
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |wrapper|
    wrapper.replace(wrapper.children)
  end
  note.xpath(".//aside | .//p").each do |block|
    block.name = "p"
    block["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(m, ooxml) ⇒ Object

If the OOXML math has no siblings, it is centered by default; override this with left or right alignment if an enclosing p, div or td is so styled.



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/html2doc/math.rb', line 152

# Adds explicit alignment to a formula that stands alone in its parent.
#
# When +m+ has no previous and no next sibling and an enclosing p, div or
# td carries a style with text-align:left or text-align:right, +ooxml+ is
# wrapped in an m:oMathPara with the matching justification. In every other
# case +ooxml+ is returned as given.
def self.uncenter(m, ooxml)
  return ooxml unless m.next.nil? && m.previous.nil?

  alignnode = m.at(".//ancestor::*[@style][local-name() = 'p' or "\
                   "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless alignnode

  # Left takes precedence when both declarations are present.
  side = %w[left right].find { |s| alignnode.text.include?("text-align:#{s}") }
  return ooxml unless side

  "<m:oMathPara><m:oMathParaPr><m:jc "\
    "m:val='#{side}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
end

.unitalic(m) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/html2doc/math.rb', line 68

# Translates Office Math run properties into markup and characters that
# render as intended.
#
# First, runs that have a style (m:sty) but no script (m:scr) are wrapped in
# HTML spans carrying the matching font style. Then runs with a script are
# converted with toPlane1 to the corresponding font variant, taking the
# style into account where a bold / italic variant exists. The passes run
# in the order listed. Returns +m+.
def self.unitalic(m)
  [
    ["p", "<span style='font-style:normal;'></span>"],
    ["bi", "<span class='nostem' style='font-weight:bold;'><em></em></span>"],
    ["i", "<span class='nostem'><em></em></span>"],
    ["b", "<span style='font-style:normal;font-weight:bold;'></span>"],
  ].each do |sty, wrapper|
    m.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = '#{sty}']]").each do |run|
      run.wrap(wrapper)
    end
  end

  [
    ["xmlns:rPr/xmlns:scr[@m:val = 'monospace']", :monospace],
    ["xmlns:rPr/xmlns:scr[@m:val = 'double-struck']", :doublestruck],
    ["xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']", :script],
    ["xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']", :scriptbold],
    ["xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']", :fraktur],
    ["xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']", :frakturbold],
    ["xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']", :sans],
    ["xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']", :sansbold],
    ["xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']", :sansitalic],
    ["xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']", :sansbolditalic],
  ].each do |predicate, font|
    m.xpath(".//xmlns:r[#{predicate}]").each { |run| toPlane1(run, font) }
  end
  m
end

.unwrap_accents(doc) ⇒ Object



35
36
37
38
39
40
41
42
# File 'lib/html2doc/math.rb', line 35

# For every element with accent='true' that has at least two child
# elements, replaces the second child by its own children when that child
# is an mrow. Returns +doc+.
def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next unless accented.elements.length > 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end

.warnsvg(src) ⇒ Object



94
95
96
# File 'lib/html2doc/mime.rb', line 94

# Emits a warning on stderr when +src+ names an SVG file (".svg" suffix,
# case-insensitive); does nothing otherwise.
def self.warnsvg(src)
  return unless /\.svg$/i.match(src)

  warn "#{src}: SVG not supported"
end