Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

# Skeleton XHTML document: a strict-DTD doctype, an empty <head> (blank title
# plus a UTF-8 charset meta tag) and an empty <body>.
# NOTE(review): no method visible in this listing references this constant.
NOKOHEAD =
<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE
# Single-line XHTML strict doctype declaration (with trailing newline).
# from_xhtml removes the first occurrence of this text from serialised output.
DOCTYPE =
<<~"DOCTYPE".freeze
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
DOCTYPE
# Markup that define_head1 interpolates at the top of each <head>: a
# conditional-comment block setting the Word view to Print at 100% zoom,
# followed by a Content-Type meta tag.
PRINT_VIEW =
<<~XML.freeze
  <!--[if gte mso 9]>
  <xml>
  <w:WordDocument>
  <w:View>Print</w:View>
  <w:Zoom>100</w:Zoom>
  <w:DoNotOptimizeForBrowser/>
  </w:WordDocument>
  </xml>
  <![endif]-->
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
XML
# XPath matching any element whose local name is img or imagedata.
# NOTE(review): no method visible in this listing references this constant.
IMAGE_PATH =
"//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
# XPath predicate that keeps only lists with no ul/ol ancestor; lists1 and
# lists_unstyled append it to their list selectors.
TOPLIST =
"[not(ancestor::ul) and not(ancestor::ol)]".freeze
# Footnote reference marker markup; used by process_footnote_link as the link
# content and by footnote_container as the fallback reference text.
FN =
"<span class='MsoFootnoteReference'>"\
"<span style='mso-special-character:footnote'/></span>".freeze
# Version string of this module.
VERSION =
"1.0.6".freeze

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object



159
160
161
162
163
164
165
166
167
# File 'lib/html2doc/base.rb', line 159

# Inserts the stylesheet markup +css+ into +head+.
#
# * +head+ has no children: +css+ is appended as a child of +head+.
# * +title+ is nil: +css+ is placed before the first child of +head+.
# * otherwise: +css+ is placed directly after +title+.
#
# Returns whatever the insertion call on the node returns.
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return head.children.first.add_previous_sibling(css) if title.nil?

  title.add_next_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ Object



18
19
20
21
22
23
24
25
26
27
# File 'lib/html2doc/math.rb', line 18

# Replaces every AsciiMath span in +doc+ with the markup returned by
# asciimath_to_mathml1.
#
# doc    - String to scan.
# delims - two-element collection: opening and closing delimiter. When nil or
#          shorter than two elements, +doc+ is returned untouched.
#
# The document is split on either delimiter (delimiters are captured), so every
# group of four pieces is [text, delimiter, expression, delimiter]. The
# delimiters themselves are dropped from the output. A progress line is written
# with +warn+ every 500 groups once the split yields more than 1000 pieces.
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2
  m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
  total = m.size / 4
  m.each_slice(4).map.with_index do |(*a), i|
    warn "MathML #{i} of #{total}" if m.size > 1000 && i.positive? && (i % 500).zero?
    # A trailing group may hold only text, or text plus a dangling delimiter
    # with no expression after it; in both cases only the text is kept.
    next a[0] if a[2].nil?
    a[0] + asciimath_to_mathml1(a[2])
  end.join
end

.asciimath_to_mathml1(x) ⇒ Object



12
13
14
15
16
# File 'lib/html2doc/math.rb', line 12

# Converts one AsciiMath expression +x+ to a MathML string: HTML entities in
# +x+ are decoded, the text is parsed by AsciiMath, rendered through a
# MathMLBuilder created with the :msword option, and every bare <math> tag is
# given the MathML namespace declaration.
def self.asciimath_to_mathml1(x)
  builder = AsciiMath::MathMLBuilder.new(:msword => true)
  ast = AsciiMath.parse(HTMLEntities.new.decode(x)).ast
  mathml = builder.append_expression(ast).to_s
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
end

.bookmarks(docxml) ⇒ Object



182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/html2doc/base.rb', line 182

# Turns element ids into named anchors: for every element that has an @id, no
# @name, and is not styled as a footnote container, an <a name='ID'></a> is
# inserted as the element's first child and the id attribute is deleted.
# Elements with an empty id and the shape/shapetype elements are left alone.
def self.bookmarks(docxml)
  skipped = %w(shapetype v:shapetype shape v:shape)
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]").each do |node|
    next if node["id"].empty? || skipped.include?(node.name)
    anchor = "<a name='#{node["id"]}'></a>"
    if node.children.empty?
      node.add_child(anchor)
    else
      node.children.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ Object



46
47
48
49
50
51
52
53
54
55
# File 'lib/html2doc/base.rb', line 46

# Runs the document clean-up passes in order (namespaces, images, maths, lists,
# footnotes, bookmarks, default paragraph class) and returns +docxml+.
#
# hash - options; this method reads :dir1, :filename and :liststyles.
def self.cleanup(docxml, hash)
  docxml.tap do |doc|
    namespace(doc.root)
    # images are resolved relative to the directory of the output file name
    image_cleanup(doc, hash[:dir1], File.dirname(hash[:filename]))
    mathml_to_ooml(doc)
    lists(doc, hash[:liststyles])
    footnotes(doc)
    bookmarks(doc)
    msonormal(doc)
  end
end

.create_dir(filename, dir) ⇒ Object



27
28
29
30
31
32
# File 'lib/html2doc/base.rb', line 27

# Returns the directory used for auxiliary files.
#
# filename - output file stem; the default directory is "<filename>_files".
# dir      - caller-supplied directory; returned unchanged when truthy.
#
# The default directory is created when nothing exists at that path yet.
def self.create_dir(filename, dir)
  return dir if dir
  dir = "#{filename}_files"
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
  Dir.mkdir(dir) unless File.exist?(dir)
  dir
end

.define_head(docxml, hash) ⇒ Object



150
151
152
153
154
155
156
157
# File 'lib/html2doc/base.rb', line 150

# Populates the document head: inserts the stylesheet built from the options
# in +hash+ (:filename, :header_file, :stylesheet), then hands over to
# define_head1 with hash[:dir1], and finally to rootnamespace with the
# document root.
def self.define_head(docxml, hash)
  head_path = "//*[local-name() = 'head']"
  title = docxml.at("#{head_path}/*[local-name() = 'title']")
  head = docxml.at(head_path)
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  define_head1(docxml, hash[:dir1])
  rootnamespace(docxml.root)
end

.define_head1(docxml, dir) ⇒ Object



122
123
124
125
126
127
128
129
# File 'lib/html2doc/base.rb', line 122

# Inserts, before the first child of every head element, the PRINT_VIEW markup
# followed by a File-List link pointing at "<basename of dir>/filelist.xml".
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |head|
    first_child = head.children.first
    first_child.add_previous_sibling <<~XML
    #{PRINT_VIEW}
      <link rel="File-List" href="#{File.basename(dir)}/filelist.xml"/>
    XML
  end
end

.esc_space(xml) ⇒ Object

Intended to escape each space in text nodes as &#x32; (sic: the character reference for a space is &#x20;), because any spaces generated by XML indentation are being removed.



136
137
138
139
140
141
142
# File 'lib/html2doc/math.rb', line 136

# Walks every node under +xml+ and, for each text node, computes the node text
# with every space replaced by the literal string "&#x32;". Returns +xml+.
#
# NOTE(review): the substituted string is only assigned to the block-local
# variable +n+, so the document itself does not appear to be modified; confirm
# whether a write-back to the node was intended. Also, "&#x32;" is the
# character reference for "2"; a space would be "&#x20;".
def self.esc_space(xml)
  xml.traverse do |n|
    next unless n.text?
    n = n.text.gsub(/ /, "&#x32;")
  end
  xml
end

.filename_substitute(stylesheet, header_filename, filename) ⇒ Object



131
132
133
134
135
136
137
138
# File 'lib/html2doc/base.rb', line 131

# Rewrites FILENAME placeholders in +stylesheet+ in place and returns it.
#
# * +header_filename+ is nil: lines matching the removal pattern below are
#   collapsed to a single newline.
# * otherwise: every FILENAME is replaced by the basename of +filename+.
#
# NOTE(review): the removal pattern only matches a line that contains FILENAME
# and whose last character is a literal "i"; that trailing "i" looks like a
# misplaced regexp flag. Confirm against the stylesheets in use before
# changing it, since removing more lines could alter the CSS structure.
def self.filename_substitute(stylesheet, header_filename, filename)
  pattern, replacement =
    if header_filename.nil?
      [/\n[^\n]*FILENAME[^\n]*i\n/, "\n"]
    else
      [/FILENAME/, File.basename(filename)]
    end
  stylesheet.gsub!(pattern, replacement)
  stylesheet
end

.footnote?(a) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
82
# File 'lib/html2doc/notes.rb', line 79

# True when the "epub:type" or, failing that, the "class" attribute of +a+
# equals "footnote", compared case-insensitively. Falsy otherwise.
def self.footnote?(a)
  by_epub_type = a["epub:type"]&.casecmp("footnote")&.zero?
  return by_epub_type if by_epub_type

  a["class"]&.casecmp("footnote")&.zero?
end

.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem



96
97
98
99
100
101
102
103
# File 'lib/html2doc/notes.rb', line 96

# For every <a> that is a direct child of a footnote container div, moves the
# anchor so that it becomes the first child of the element following it.
# Anchors with no following element, or whose following element has no
# children, stay where they are. Returns +docxml+.
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    first_child = anchor.next_element&.children&.first
    first_child&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(docxml, i) ⇒ Object



39
40
41
42
43
44
45
46
47
# File 'lib/html2doc/notes.rb', line 39

# Builds the markup of the container div for footnote number +i+.
#
# The reference text is taken from the children of the link whose href is
# "#_ftn<i>" in +docxml+, serialised without indentation and with newlines
# between tags removed. When +docxml+ is nil or no such link exists, the FN
# marker is used instead.
def self.footnote_container(docxml, i)
  link = docxml&.at("//a[@href='#_ftn#{i}']")
  # Every step is nil-safe: a missing link must reach the FN fallback rather
  # than raise on nil.
  ref = link&.children&.to_xml(indent: 0)&.gsub(/>\n</, "><") || FN
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'>#{ref.strip}</a></div>
  DIV
end

.footnote_div_to_p(f) ⇒ Object



25
26
27
28
29
30
31
32
33
34
# File 'lib/html2doc/notes.rb', line 25

# Normalises a footnote body element +f+ named div or aside: when it contains
# a p descendant it is replaced by its own children; otherwise it is renamed
# to p and given the class MsoFootnoteText. Other elements are ignored.
def self.footnote_div_to_p(f)
  return unless %w{div aside}.include? f.name

  if f.at(".//p")
    f.replace(f.children)
  else
    f.name = "p"
    f["class"] = "MsoFootnoteText"
  end
end

.footnotes(docxml) ⇒ Object



4
5
6
7
8
9
10
11
12
# File 'lib/html2doc/notes.rb', line 4

# Processes every <a> in +docxml+ as a potential footnote link, numbering the
# accepted ones from 1 upwards and collecting their footnote bodies, then
# passes the collected bodies to process_footnote_texts.
def self.footnotes(docxml)
  fn = []
  docxml.xpath("//a").inject(1) do |number, a|
    # the counter only advances for links that were actually processed
    process_footnote_link(docxml, a, number, fn) ? number + 1 : number
  end
  process_footnote_texts(docxml, fn)
end

.from_xhtml(xml) ⇒ Object



78
79
80
81
82
# File 'lib/html2doc/base.rb', line 78

# Serialises +xml+ and strips XHTML artefacts from the text: the first XHTML
# default-namespace declaration, the first DOCTYPE declaration, and the space
# before every "/>" of a self-closing tag.
def self.from_xhtml(xml)
  serialized = xml.to_xml
  serialized = serialized.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  serialized = serialized.sub(DOCTYPE, "")
  serialized.gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
# File 'lib/html2doc/mime.rb', line 128

# Writes "filelist.xml" into +dir+: an <xml> wrapper holding an o:MainFile
# entry for "../<filename>.htm" and one o:File entry per directory entry of
# +dir+ whose name does not start with a dot, in sorted order. The directory
# is listed after the output file has been opened, so filelist.xml lists
# itself.
def self.generate_filelist(filename, dir)
  header = %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">\n} +
    %{      <o:MainFile HRef="../#{filename}.htm"/>}
  File.open(File.join(dir, "filelist.xml"), "w") do |out|
    out.write header
    visible = Dir.entries(dir).sort.reject { |item| /^\./.match(item) }
    visible.each { |item| out.write %{  <o:File HRef="#{item}"/>\n} }
    out.write("</xml>\n")
  end
end

.header_image_cleanup(doc, dir, filename, localdir) ⇒ Object

do not parse the header through Nokogiri, since it will contain non-XML like <![if !supportFootnotes]>



108
109
110
111
112
# File 'lib/html2doc/mime.rb', line 108

# Rewrites image references in the header text +doc+ without parsing it as
# XML: the text is split on <img ...> and <v:imagedata ...> tags, and each
# [preceding text, tag] pair is passed to header_image_cleanup1; the results
# are concatenated.
def self.header_image_cleanup(doc, dir, filename, localdir)
  pieces = doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)})
  rewritten = pieces.each_slice(2).map do |pair|
    header_image_cleanup1(pair, dir, filename, localdir)
  end
  rewritten.join
end

.header_image_cleanup1(a, dir, filename, localdir) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/html2doc/mime.rb', line 114

# Handles one [text, image tag] pair from header_image_cleanup and returns the
# pair joined into a string.
#
# When the pair has a tag whose src is a local file, the file is copied into
# +dir+ under a fresh UUID-based name and the tag's src (a[1] is modified in
# place) is pointed at "file:///C:/Doc/<basename of filename>_files/<name>".
# Pairs without a tag, tags without a src, and tags whose src is an
# http(s) URL or a base64 data URI are returned unchanged.
#
# localdir - directory that relative src paths are resolved against.
def self.header_image_cleanup1(a, dir, filename, localdir)
  return a.join unless a.size == 2
  m = / src=['"](?<src>[^"']+)['"]/.match a[1]
  return a.join if m.nil?
  src = m[:src]
  # Checked on the extracted value so single- and double-quoted attributes
  # are treated alike.
  return a.join if %r{\A(https?:|data:image/[^;]+;base64)}.match(src)
  #warnsvg(src)
  m2 = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match src
  new_filename = m2 ? "#{mkuuid}.#{m2[:suffix]}" : mkuuid
  old_filename = %r{^([A-Z]:)?/}.match(src) ? src : File.join(localdir, src)
  FileUtils.cp old_filename, File.join(dir, new_filename)
  # basename keeps this in step with the Content-Location mime_attachment
  # emits for the copied file
  a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]},
            " src='file:///C:/Doc/#{File.basename(filename)}_files/#{new_filename}'")
  a.join
end

.image_cleanup(docxml, dir, localdir) ⇒ Object

only processes locally stored images



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/html2doc/mime.rb', line 90

# Copies locally stored images referenced by img / v:imagedata elements into
# +dir+ under UUID-based names, sets each element's width and height from
# image_resize (limits 680 and 400), and points its src at
# "<basename of dir>/<new name>". Elements whose src starts with "http" or is
# a base64 data URI are skipped. Returns +docxml+.
#
# localdir - directory that relative src paths are resolved against.
def self.image_cleanup(docxml, dir, localdir)
  docxml.traverse do |node|
    next unless node.element? && %w(img v:imagedata).include?(node.name)
    src = node["src"]
    next if /^http/.match(src) || %r{^data:image/[^;]+;base64}.match(src)
    absolute = %r{^([A-Z]:)?/}.match(src)
    local_filename = absolute ? src : File.join(localdir, src)
    new_filename = "#{mkuuid}#{File.extname(src)}"
    FileUtils.cp local_filename, File.join(dir, new_filename)
    node["width"], node["height"] = image_resize(node, local_filename, 680, 400)
    node["src"] = File.join(File.basename(dir), new_filename)
  end
  docxml
end

.image_resize(i, path, maxheight, maxwidth) ⇒ Object

max width for Word document is 400, max height is 680



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/mime.rb', line 67

# Computes the [width, height] to use for image element +i+.
#
# The starting size is the element's width/height attributes (as integers);
# when both are zero the real size of the file at +path+ is used, and when
# only one is zero it is derived from the other using the real aspect ratio.
# The size is then scaled down, with integer arithmetic, to fit +maxheight+
# and afterwards +maxwidth+. Returns [nil, nil] when the real size is unknown.
def self.image_resize(i, path, maxheight, maxwidth)
  actual = ImageSize.path(path).size
  dims = [i["width"].to_i, i["height"].to_i]
  dims = actual if dims[0].zero? && dims[1].zero?
  return [nil, nil] if actual[0].nil? || actual[1].nil?

  width, height = dims
  height = width * actual[1] / actual[0] if height.zero? && !width.zero?
  width = height * actual[0] / actual[1] if width.zero? && !height.zero?
  width, height = (width * maxheight / height).ceil, maxheight if height > maxheight
  width, height = maxwidth, (height * maxwidth / width).ceil if width > maxwidth
  [width, height]
end

.list2para(u) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/html2doc/lists.rb', line 48

# Flattens the list +u+ into paragraphs: each direct li becomes a p with a
# MsoListParagraphCxSp{First,Middle,Last} class (existing classes are kept),
# a leading p inside an item is replaced by its children, and finally the
# list element is replaced by its children. Lists without li children are
# left untouched.
def self.list2para(u)
  items = u.xpath("./li")
  return if items.empty?

  items.first["class"] ||= "MsoListParagraphCxSpFirst"
  items.last["class"] ||= "MsoListParagraphCxSpLast"
  u.xpath("./li/p").each { |para| para["class"] ||= "MsoListParagraphCxSpMiddle" }
  items.each do |item|
    item.name = "p"
    item["class"] ||= "MsoListParagraphCxSpMiddle"
    leading = item.first_element_child
    leading.replace(leading.children) if leading&.name == "p"
  end
  u.replace(u.children)
end

.list_add(xpath, liststyles, listtype, level) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/html2doc/lists.rb', line 32

# Applies list styling to every list in the node set +xpath+.
#
# At level 1 the shared list counter is advanced and the list is flagged with
# a "seen" attribute. Every list gets an id if it lacks one. Items that belong
# to the list itself (not to a nested list) are styled with style_list and
# then scanned for nested lists via list_add1. Finally, the lists selected by
# the second XPath below have their parent passed to list_add1 one level up.
def self.list_add(xpath, liststyles, listtype, level)
  xpath.each do |list|
    if level == 1
      @listnumber += 1
      list["seen"] = true
    end
    list["id"] ||= UUIDTools::UUID.random_create
    own_items = list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")
    own_items.each do |li|
      style_list(li, level, liststyles[listtype], @listnumber)
      list_add1(li, liststyles, listtype, level)
    end
    stray_lists = list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
               ".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]")
    stray_lists.each do |nested|
      list_add1(nested.parent, liststyles, listtype, level-1)
    end
  end
end

.list_add1(li, liststyles, listtype, level) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/html2doc/lists.rb', line 18

# Styles the outermost ul and ol lists nested under +li+ at level + 1.
# For the generic list types :ul and :ol each nested list is styled according
# to its own element kind; for any other list type the nested lists inherit
# +listtype+.
def self.list_add1(li, liststyles, listtype, level)
  ul_type, ol_type =
    [:ul, :ol].include?(listtype) ? [:ul, :ol] : [listtype, listtype]
  list_add(li.xpath(".//ul") - li.xpath(".//ul//ul | .//ol//ul"),
           liststyles, ul_type, level + 1)
  list_add(li.xpath(".//ol") - li.xpath(".//ul//ol | .//ol//ol"),
           liststyles, ol_type, level + 1)
end

.lists(docxml, liststyles) ⇒ Object



87
88
89
90
91
92
93
94
# File 'lib/html2doc/lists.rb', line 87

# Entry point for list processing. Does nothing when +liststyles+ is nil.
# Otherwise resets the list counter, styles lists for every key of
# +liststyles+, styles the remaining lists, and converts all ul (resp. ol)
# lists to paragraphs when +liststyles+ has a :ul (resp. :ol) entry.
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  @listnumber = 0
  liststyles.each_key { |key| lists1(docxml, liststyles, key) }
  lists_unstyled(docxml, liststyles)
  docxml.xpath("//ul").each { |ul| list2para(ul) } if liststyles.key?(:ul)
  liststyles.key?(:ol) && docxml.xpath("//ol").each { |ol| list2para(ol) }
end

.lists1(docxml, liststyles, k) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/html2doc/lists.rb', line 64

# Styles the top-level lists governed by the liststyles key +k+: for :ul and
# :ol these are the lists of that kind without a class attribute; for any
# other key they are the ol and ul lists whose class equals the key.
def self.lists1(docxml, liststyles, k)
  selector =
    if [:ul, :ol].include?(k)
      "//#{k}[not(@class)]#{TOPLIST}"
    else
      "//ol[@class = '#{k.to_s}']#{TOPLIST} | //ul[@class = '#{k.to_s}']#{TOPLIST}"
    end
  list_add(docxml.xpath(selector), liststyles, k, 1)
end

.lists_unstyled(docxml, liststyles) ⇒ Object



77
78
79
80
81
82
83
84
85
# File 'lib/html2doc/lists.rb', line 77

# Styles the top-level lists that no earlier pass has flagged as seen, using
# the generic :ul style for ul lists and the generic :ol style for ol lists
# (each only when +liststyles+ has that key), then removes the temporary
# "seen" attribute from every list.
def self.lists_unstyled(docxml, liststyles)
  [:ul, :ol].each do |tag|
    next unless liststyles.has_key?(tag)
    # each list kind is styled with its own list type
    list_add(docxml.xpath("//#{tag}#{TOPLIST}[not(@seen)]"), liststyles, tag, 1)
  end
  docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
    l.delete("seen")
  end
end

.mathml_insert_rows(m, docnamespaces) ⇒ Object



46
47
48
49
50
51
52
53
# File 'lib/html2doc/math.rb', line 46

# For every msup, msub, msubsup, munder, mover and munderover element under
# +m+, wraps the element that follows it in an <mrow/>, unless there is no
# following element or it already is an mrow. Returns +m+.
def self.mathml_insert_rows(m, docnamespaces)
  scripted = %w(msup msub msubsup munder mover munderover).
    map { |tag| ".//xmlns:#{tag}" }.join(" | ")
  m.xpath(scripted, docnamespaces).each do |x|
    following = x.next_element
    # compare the element's name; comparing the node itself with a String is
    # never equal, so every sibling would be wrapped
    next if following.nil? || following.name == "mrow"
    following.wrap("<mrow/>")
  end
  m
end

.mathml_preserve_space(m, docnamespaces) ⇒ Object



55
56
57
58
59
60
# File 'lib/html2doc/math.rb', line 55

# In every mtext element under +m+, replaces a whitespace character at the
# start or end of a line of the serialised content with "&#xA0;" and
# re-assigns the result as the element's children. Returns +m+.
def self.mathml_preserve_space(m, docnamespaces)
  m.xpath(".//xmlns:mtext", docnamespaces).each do |mtext|
    leading_fixed = mtext.children.to_xml.gsub(/^\s/, "&#xA0;")
    mtext.children = leading_fixed.gsub(/\s$/, "&#xA0;")
  end
  m
end

.mathml_to_ooml(docxml) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/html2doc/math.rb', line 116

# Replaces every math element in +docxml+ with the markup produced by running
# it through the @xsltemplate stylesheet.
#
# Each element is cleaned by ooxml_cleanup, placed in a document of its own,
# transformed, and post-processed by esc_space and unitalic. The serialised
# result then has processing instructions and namespace declarations
# stripped and every lowercase tag other than span/em prefixed with "m:".
# uncenter may wrap the result before it is swapped in for the original
# element. Progress is reported with +warn+ every 100 elements when there are
# more than 500.
def self.mathml_to_ooml(docxml)
  docnamespaces = docxml.collect_namespaces
  m = docxml.xpath("//*[local-name() = 'math']")
  m.each_with_index do |x, i|
    warn "Math OOXML #{i} of #{m.size}" if i % 100 == 0 && m.size > 500 && i > 0
    element = ooxml_cleanup(x, docnamespaces)
    doc = Nokogiri::XML::Document.new
    doc.root = element
    transformed = unitalic(esc_space(@xsltemplate.transform(doc)))
    ooxml = transformed.to_s
    ooxml = ooxml.gsub(/<\?[^>]+>\s*/, "")
    ooxml = ooxml.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    ooxml = ooxml.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
    x.swap(uncenter(x, ooxml))
  end
end

.mime_attachment(boundary, filename, item, dir) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/html2doc/mime.rb', line 22

# Builds one MIME part for the file +item+ in +dir+: the boundary line, a
# Content-Location under "file:///C:/Doc/<basename of filename>_files/", the
# content type from mime_type, and the file content Base64-encoded in lines
# of 76 characters. Files whose content type starts with "text" or
# "application" are read as UTF-8 text, all others as binary.
def self.mime_attachment(boundary, filename, item, dir)
  content_type = mime_type(item)
  path = File.join(dir, item)
  content =
    if content_type.start_with?("text", "application")
      File.read(path, encoding: "utf-8")
    else
      IO.binread(path)
    end
  encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
  [
    "--#{boundary}",
    "Content-Location: file:///C:/Doc/#{File.basename(filename)}_files/#{item}",
    "Content-Transfer-Encoding: base64",
    "Content-Type: #{content_type}",
    "",
    encoded_file,
    "",
    "",
  ].join("\n")
end

.mime_boundary ⇒ Object



48
49
50
51
# File 'lib/html2doc/mime.rb', line 48

# Returns a MIME boundary string: "----=_NextPart_" followed by the first 18
# characters of a random UUID whose hyphens are replaced by dots.
def self.mime_boundary
  uuid = UUIDTools::UUID.random_create.to_s
  salt = uuid.gsub(/-/, ".")[0..17]
  "----=_NextPart_#{salt}"
end

.mime_package(result, filename, dir) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/html2doc/mime.rb', line 53

# Assembles the MIME document and writes it to "<filename>.doc" as UTF-8:
# the preamble carrying +result+, the filelist.xml attachment, one attachment
# for every other entry of +dir+ whose name does not start with a dot, and
# the closing boundary.
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  parts = [
    mime_preamble(boundary, filename, result),
    mime_attachment(boundary, filename, "filelist.xml", dir),
  ]
  Dir.foreach(dir) do |item|
    next if /^\./.match(item) || item == "filelist.xml"
    parts << mime_attachment(boundary, filename, item, dir)
  end
  parts << "--#{boundary}--"
  File.open("#{filename}.doc", "w:UTF-8") { |f| f.write parts.join }
end

.mime_preamble(boundary, filename, result) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/html2doc/mime.rb', line 8

# Returns the opening of the MIME document: the multipart/related headers for
# +boundary+, followed by the first part, which carries +result+ as text/html
# located at "file:///C:/Doc/<basename of filename>.htm".
def self.mime_preamble(boundary, filename, result)
  location = "file:///C:/Doc/#{File.basename(filename)}.htm"
  [
    "MIME-Version: 1.0",
    %{Content-Type: multipart/related; boundary="#{boundary}"},
    "",
    "--#{boundary}",
    "Content-Location: #{location}",
    %{Content-Type: text/html; charset="utf-8"},
    "",
    "#{result}",
    "",
    "",
  ].join("\n")
end

.mime_type(item) ⇒ Object



41
42
43
44
45
46
# File 'lib/html2doc/mime.rb', line 41

# Returns the Content-Type value for the file name +item+.
#
# The first type MIME::Types reports for the name is used; text types get
# ' charset="utf-8"' appended. When no type is known for the name, the result
# is 'text/plain; charset="utf-8"'.
#
# NOTE(review): the charset parameter of known text types is appended without
# a ";" separator, as before; confirm whether consumers rely on that form.
def self.mime_type(item)
  types = MIME::Types.type_for(item)
  # an empty lookup result is truthy, so it has to be tested explicitly to
  # reach the fallback
  return 'text/plain; charset="utf-8"' if types.nil? || types.empty?

  type = types.first.to_s
  type += ' charset="utf-8"' if type.start_with?("text")
  type
end

.mkuuid ⇒ Object



81
82
83
# File 'lib/html2doc/mime.rb', line 81

# Returns a freshly generated random UUID as a String.
def self.mkuuid
  uuid = UUIDTools::UUID.random_create
  uuid.to_s
end

.msonormal(docxml) ⇒ Object



195
196
197
198
199
200
201
202
# File 'lib/html2doc/base.rb', line 195

# Gives the class "MsoNormal" to every p element, and then every li element,
# that has no class attribute.
def self.msonormal(docxml)
  unclassed = ->(tag) { "//*[local-name() = '#{tag}'][not(self::*[@class])]" }
  docxml.xpath(unclassed.call("p")).each { |para| para["class"] = "MsoNormal" }
  docxml.xpath(unclassed.call("li")).each { |item| item["class"] = "MsoNormal" }
end

.msword_fix(r) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/html2doc/base.rb', line 84

# Applies textual workarounds for the MS Word HTML parser to the serialised
# document +r+ and returns the result.
#
# The substitutions are made on +r+ in place: empty footnote marker spans and
# comment-reference anchors are given explicit end tags, an empty
# footnote-list div is collapsed, the quotes around the File-List and
# Content-Type attribute values are dropped, a fixed set of m:/v:/o:/w:
# elements with an immediate end tag is collapsed to self-closing form, and
# "&tab;" entities become tab-count spans. Finally, whitespace between tags
# inside each m:oMath element is removed; that last step builds the new
# string that is returned.
def self.msword_fix(r)
  # brain damage in MSWord parser
  [
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    [%r{<div style="mso-element:footnote-list"></div>},
     '<div style="mso-element:footnote-list"/>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
  ].each { |pattern, replacement| r.gsub!(pattern, replacement) }
  %w[m:jc v:stroke v:f v:path o:lock v:imagedata w:wrap].each do |tag|
    r.gsub!(%r{></#{tag}>}, "/>")
  end
  r.gsub!(%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>')
  # pieces come in groups of [text, <m:oMath>, math content, </m:oMath>]
  groups = r.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
    a[2] = a[2].gsub(/>\s+</, "><") if a.size > 2
    a
  end
  groups.join
end

.namespace(root) ⇒ Object



169
170
171
172
173
174
175
176
# File 'lib/html2doc/base.rb', line 169

# Declares the o, w, v and m namespace prefixes (Office, Word, VML and OMML
# URIs) on the element +root+.
def self.namespace(root)
  prefixes = {
    o: "urn:schemas-microsoft-com:office:office",
    w: "urn:schemas-microsoft-com:office:word",
    v: "urn:schemas-microsoft-com:vml",
    m: "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix.to_s, uri)
  end
end

.ooxml_cleanup(m, docnamespaces) ⇒ Object

random fixes to MathML input that OOXML needs to render properly



39
40
41
42
43
44
# File 'lib/html2doc/math.rb', line 39

# Prepares the MathML element +m+ for conversion: runs mathml_insert_rows,
# mathml_preserve_space and unwrap_accents over it in that order, then sets
# the MathML namespace as the default namespace of the result and returns it.
def self.ooxml_cleanup(m, docnamespaces)
  with_rows = mathml_insert_rows(m, docnamespaces)
  with_spaces = mathml_preserve_space(with_rows, docnamespaces)
  unwrap_accents(with_spaces).tap do |math|
    math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  end
end

.process(result, hash) ⇒ Object



10
11
12
13
14
15
16
17
18
# File 'lib/html2doc/base.rb', line 10

# Top-level conversion of the HTML text +result+ into "<filename>.doc".
#
# hash - options; reads :filename, :dir, :header_file and :debug, and stores
#        the working directory under :dir1.
#
# Steps: set up the working directory, convert the HTML, process the header
# file, write filelist.xml and "<filename>.htm", build the MIME package, and
# remove the temporary files unless :debug is set.
def self.process(result, hash)
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  html = process_html(result, hash)
  process_header(hash[:header_file], hash)
  generate_filelist(hash[:filename], hash[:dir1])
  File.open("#{hash[:filename]}.htm", "w:UTF-8") { |f| f.write(html) }
  mime_package html, hash[:filename], hash[:dir1]
  return if hash[:debug]

  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1])
end


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/html2doc/notes.rb', line 49

# Converts the anchor +a+ into footnote reference number +i+.
#
# Returns false without changing anything when +a+ is not a footnote link,
# has no href, or its href (leading "#" removed) matches no element by name
# or id. Otherwise the link attributes are rewritten, the link content is
# replaced by the FN marker (an existing MsoFootnoteReference span is
# replaced, other children are wrapped in such a span), and the transformed
# footnote body is appended to +fn+.
def self.process_footnote_link(docxml, a, i, fn)
  return false unless footnote?(a)
  # a footnote-classed anchor without href cannot point at a footnote body
  href = a["href"]&.gsub(/^#/, "")
  return false if href.nil?
  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?
  set_footnote_link_attrs(a, i)
  if a.at("./span[@class = 'MsoFootnoteReference']")
    a.children.each do |c|
      if c.name == "span" && c["class"] == "MsoFootnoteReference"
        c.replace(FN)
      else
        c.wrap("<span class='MsoFootnoteReference'></span>")
      end
    end
  else
    a.children = FN
  end
  fn << transform_footnote_text(note)
end

.process_footnote_texts(docxml, footnotes) ⇒ Object



14
15
16
17
18
19
20
21
22
23
# File 'lib/html2doc/notes.rb', line 14

# Appends a footnote-list div to the body of +docxml+ and, for each collected
# footnote body (numbered from 1), adds a footnote container to that list,
# re-parents the body into the container and normalises it with
# footnote_div_to_p. Ends by running footnote_cleanup over the document.
def self.process_footnote_texts(docxml, footnotes)
  list = docxml.at("//body").add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(docxml, number))
    note.parent = container.first
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_header(headerfile, hash) ⇒ Object



20
21
22
23
24
25
# File 'lib/html2doc/base.rb', line 20

# Reads the header file (UTF-8), rewrites its image references with
# header_image_cleanup, and writes the result to "<dir1>/header.html".
# Does nothing when +headerfile+ is nil.
#
# hash - options; reads :dir1 and :filename.
def self.process_header(headerfile, hash)
  return if headerfile.nil?

  raw = File.read(headerfile, encoding: "utf-8")
  localdir = File.dirname(hash[:filename])
  cleaned = header_image_cleanup(raw, hash[:dir1], hash[:filename], localdir)
  File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(cleaned) }
end

.process_html(result, hash) ⇒ Object



34
35
36
37
38
# File 'lib/html2doc/base.rb', line 34

# Converts the HTML text +result+ into Word-flavoured HTML text: AsciiMath is
# converted using hash[:asciimathdelims], the text is parsed, cleaned up and
# given its head content, then serialised again and passed through
# msword_fix.
def self.process_html(result, hash)
  with_mathml = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_mathml)
  define_head(cleanup(docxml, hash), hash)
  serialized = from_xhtml(docxml)
  msword_fix(serialized)
end

.rm_temp_files(filename, dir, dir1) ⇒ Object



40
41
42
43
44
# File 'lib/html2doc/base.rb', line 40

# Removes the intermediate files: "<filename>.htm" (must exist),
# "<dir1>/header.html" (if present) and, only when no directory was supplied
# by the caller (+dir+ is falsy), the whole +dir1+ directory.
def self.rm_temp_files(filename, dir, dir1)
  html_path = "#{filename}.htm"
  header_path = "#{dir1}/header.html"
  FileUtils.rm html_path
  FileUtils.rm_f header_path
  return if dir

  FileUtils.rm_r dir1
end

.rootnamespace(root) ⇒ Object



178
179
180
# File 'lib/html2doc/base.rb', line 178

# Sets "http://www.w3.org/TR/REC-html40" as the default namespace of +root+.
def self.rootnamespace(root)
  html40 = "http://www.w3.org/TR/REC-html40"
  root.add_namespace(nil, html40)
end


84
85
86
87
88
89
# File 'lib/html2doc/notes.rb', line 84

# Sets the style, href, name and title attributes that mark the anchor +a+ as
# the reference to footnote number +i+.
def self.set_footnote_link_attrs(a, i)
  a["style"], a["href"], a["name"] =
    "mso-footnote-id:ftn#{i}", "#_ftn#{i}", "_ftnref#{i}"
  a["title"] = ""
end

.style_list(li, level, liststyle, listnumber) ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/html2doc/lists.rb', line 8

# Appends the Word list declaration
# "mso-list:<liststyle> level<level> lfo<listnumber>;" to the style attribute
# of +li+, separated from any existing style by ";". Does nothing when
# +liststyle+ is nil or false.
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  existing = li["style"] ? "#{li["style"]};" : ""
  li["style"] = "#{existing}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(filename, header_filename, fn) ⇒ Object



140
141
142
143
144
145
146
147
148
# File 'lib/html2doc/base.rb', line 140

# Returns a <style> element, as a string, whose content is the stylesheet
# file +fn+ wrapped in an XML comment.
#
# fn - path of the CSS file; when nil or empty, "wordstyle.css" next to this
#      source file is used.
#
# The CSS text is passed through filename_substitute with +header_filename+
# and +filename+ before being embedded.
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = File.read(fn, encoding: "UTF-8")
  css = filename_substitute(css, header_filename, filename)
  xml = Nokogiri::XML("<style/>")
  comment = Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.children.first << comment
  xml.root.to_s
end

.to_xhtml(xml) ⇒ Object



65
66
67
68
69
70
71
72
# File 'lib/html2doc/base.rb', line 65

# Parses the text +xml+ with Nokogiri::XML after removing any XML declaration
# and, when the text has no DOCTYPE, prepending the XHTML strict doctype.
# The caller's string is left unmodified, so frozen input is accepted.
def self.to_xhtml(xml)
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match xml
    xml = '<!DOCTYPE html SYSTEM
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end

.toPlane1(x, font) ⇒ Object



108
109
110
111
112
113
114
# File 'lib/html2doc/math.rb', line 108

# Replaces every text node under +x+ with the result of
# Plane1Converter.conv applied to its entity-decoded text and +font+.
# Returns +x+.
def self.toPlane1(x, font)
  x.traverse do |node|
    if node.text?
      decoded = HTMLEntities.new.decode(node.text)
      node.replace(Plane1Converter.conv(decoded, font))
    end
  end
  x
end

.transform_footnote_text(note) ⇒ Object



69
70
71
72
73
74
75
76
77
# File 'lib/html2doc/notes.rb', line 69

# Prepares the footnote body +note+ for relocation: blanks its id, replaces
# every div descendant with that div's children, turns every aside and p
# descendant into a p of class MsoFootnoteText, and detaches the note from
# the document, returning the result of that removal.
def self.transform_footnote_text(note)
  note["id"] = ""
  note.xpath(".//div").each do |div|
    div.replace(div.children)
  end
  note.xpath(".//aside | .//p").each do |para|
    para.name = "p"
    para["class"] = "MsoFootnoteText"
  end
  note.remove
end

.uncenter(m, ooxml) ⇒ Object

If the OOXML math element has no siblings, it is centered by default; override this with left or right alignment if an enclosing p, div or td is so tagged.



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/html2doc/math.rb', line 146

# Returns +ooxml+, wrapped in an m:oMathPara with left or right justification
# when the math element +m+ has no siblings and the first style attribute
# found among its p/div/td ancestors contains "text-align:left" or
# "text-align:right" (left is checked first). In every other case +ooxml+ is
# returned unchanged.
def self.uncenter(m, ooxml)
  return ooxml unless m.next.nil? && m.previous.nil?

  alignnode = m.at(".//ancestor::*[@style][local-name() = 'p' or "\
                   "local-name() = 'div' or local-name() = 'td']/@style")
  return ooxml unless alignnode

  side = %w[left right].find { |dir| alignnode.text.include?("text-align:#{dir}") }
  return ooxml unless side

  "<m:oMathPara><m:oMathParaPr><m:jc "\
    "m:val='#{side}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
end

.unitalic(m) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/html2doc/math.rb', line 62

# Post-processes the runs (r elements) of the converted maths +m+ according
# to their run properties and returns +m+.
#
# Runs without a script property are wrapped in HTML spans by style value:
# p (upright), bi (bold, emphasised), i (emphasised) and b (upright bold).
# Runs with a script property have their text converted by toPlane1 to the
# font named in the second table, chosen from the script value and, where
# listed, the style value.
def self.unitalic(m)
  wrappers = [
    ["p", "<span style='font-style:normal;'></span>"],
    ["bi", "<span class='nostem' style='font-weight:bold;'><em></em></span>"],
    ["i", "<span class='nostem'><em></em></span>"],
    ["b", "<span style='font-style:normal;font-weight:bold;'></span>"],
  ]
  wrappers.each do |sty, wrapper|
    m.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = '#{sty}']]").each do |x|
      x.wrap(wrapper)
    end
  end

  plain = "[not(xmlns:sty) or xmlns:sty/@m:val = 'p']"
  styled = ->(sty) { "[xmlns:sty/@m:val = '#{sty}']" }
  # [predicate on rPr, script value, target font], in processing order
  fonts = [
    ["", "monospace", :monospace],
    ["", "double-struck", :doublestruck],
    [plain, "script", :script],
    [styled.call("b"), "script", :scriptbold],
    [plain, "fraktur", :fraktur],
    [styled.call("b"), "fraktur", :frakturbold],
    [plain, "sans-serif", :sans],
    [styled.call("b"), "sans-serif", :sansbold],
    [styled.call("i"), "sans-serif", :sansitalic],
    [styled.call("bi"), "sans-serif", :sansbolditalic],
  ]
  fonts.each do |rpr_filter, script, font|
    m.xpath(".//xmlns:r[xmlns:rPr#{rpr_filter}/xmlns:scr[@m:val = '#{script}']]").each do |x|
      toPlane1(x, font)
    end
  end
  m
end

.unwrap_accents(doc) ⇒ Object



29
30
31
32
33
34
35
36
# File 'lib/html2doc/math.rb', line 29

# For every element with accent='true' that has more than one child element,
# replaces its second child element by that element's children when it is an
# mrow. Returns +doc+.
def self.unwrap_accents(doc)
  doc.xpath("//*[@accent = 'true']").each do |accented|
    next unless accented.elements.length > 1

    second = accented.elements[1]
    second.replace(second.children) if second.name == "mrow"
  end
  doc
end

.warnsvg(src) ⇒ Object



85
86
87
# File 'lib/html2doc/mime.rb', line 85

# Emits "<src>: SVG not supported" via +warn+ when +src+ ends in ".svg"
# (case-insensitive); otherwise does nothing.
def self.warnsvg(src)
  return unless /\.svg$/i.match(src)

  warn "#{src}: SVG not supported"
end