Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/math.rb,
lib/html2doc/mime.rb,
lib/html2doc/lists.rb,
lib/html2doc/notes.rb,
lib/html2doc/version.rb

Constant Summary collapse

# Skeleton XHTML document: a doctype pointing at the XHTML 1.0 Strict DTD, an
# <html> root in the XHTML namespace, an empty <title>, a UTF-8 <meta charset>
# and an empty <body>.
NOKOHEAD =
<<~HERE.freeze
  <!DOCTYPE html SYSTEM
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head> <title></title> <meta charset="UTF-8" /> </head>
  <body> </body> </html>
HERE
# Single-line doctype declaration (XHTML 1.0 Strict DTD), newline-terminated.
# from_xhtml strips this exact text from the serialized document.
DOCTYPE =
<<~"DOCTYPE".freeze
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
DOCTYPE
# Conditional-comment block asking Word (mso 9 and later) to open the document
# in Print view at 100% zoom, followed by a Content-Type meta element.
# define_head1 interpolates this constant into the document head.
PRINT_VIEW =
<<~XML.freeze
  <!--[if gte mso 9]>
  <xml>
  <w:WordDocument>
  <w:View>Print</w:View>
  <w:Zoom>100</w:Zoom>
  <w:DoNotOptimizeForBrowser/>
  </w:WordDocument>
  </xml>
  <![endif]-->
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
XML
# Version string for this library.
VERSION =
"0.6.8".freeze

Class Method Summary collapse

Class Method Details

.add_stylesheet(head, title, css) ⇒ Object



142
143
144
145
146
147
148
149
150
# File 'lib/html2doc/base.rb', line 142

# Inserts the stylesheet markup +css+ into +head+.
#
# Placement rules:
# * +head+ has no children: +css+ is appended as its child;
# * +title+ is nil: +css+ goes before the first child of +head+;
# * otherwise: +css+ goes directly after +title+.
#
# @param head  [#children, #add_child] the head element
# @param title [#add_next_sibling, nil] the title element, if any
# @param css   [String] markup to insert
# @return whatever the underlying insertion call returns
def self.add_stylesheet(head, title, css)
  return head.add_child(css) if head.children.empty?
  return head.children.first.add_previous_sibling(css) if title.nil?

  title.add_next_sibling(css)
end

.asciimath_to_mathml(doc, delims) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/html2doc/math.rb', line 18

# Replaces delimited AsciiMath spans in +doc+ with the output of
# asciimath_to_mathml1.
#
# @param doc    [String] document text
# @param delims [Array<String>, nil] opening and closing delimiter; when nil
#   or shorter than two entries, +doc+ is returned untouched
# @return [String] +doc+ with each delimited span converted and its
#   delimiters dropped
def self.asciimath_to_mathml(doc, delims)
  return doc if delims.nil? || delims.size < 2

  delimiter = /(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/
  # Splitting on a captured delimiter yields repeating groups of four:
  # [text, opening delimiter, math source, closing delimiter].
  doc.split(delimiter).each_slice(4).map do |text, opener, math, _closer|
    # A trailing delimiter with nothing after it has no math to convert;
    # keep it as literal text instead of failing on String + nil.
    next text + opener.to_s if math.nil?

    text + asciimath_to_mathml1(math)
  end.join
end

.asciimath_to_mathml1(x) ⇒ Object



13
14
15
16
# File 'lib/html2doc/math.rb', line 13

# Converts one AsciiMath expression to MathML.
#
# HTML entities in +x+ are decoded first; every bare <math> start tag in the
# generated markup is then given the MathML namespace declaration.
#
# @param x [String] AsciiMath source, possibly containing HTML entities
# @return [String] MathML markup
def self.asciimath_to_mathml1(x)
  source = HTMLEntities.new.decode(x)
  mathml = AsciiMath.parse(source).to_mathml
  mathml.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
end

.bookmarks(docxml) ⇒ Object



161
162
163
164
165
166
167
168
169
170
171
# File 'lib/html2doc/base.rb', line 161

# Turns element ids into named anchors.
#
# Every element that has an id, has no name attribute and is not styled
# 'mso-element:footnote' gets an <a name='ID'></a> inserted as its first
# child, after which its id attribute is deleted. Elements whose id is the
# empty string are left alone.
#
# @param docxml [#xpath] the document being converted
# @return the node collection produced by the XPath query
def self.bookmarks(docxml)
  targets = docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
  targets.each do |node|
    next if node["id"].empty?

    anchor = "<a name='#{node["id"]}'></a>"
    kids = node.children
    if kids.empty?
      node.add_child(anchor)
    else
      kids.first.previous = anchor
    end
    node.delete("id")
  end
end

.cleanup(docxml, hash) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/html2doc/base.rb', line 41

# Runs the clean-up passes over +docxml+ in a fixed order and returns
# +docxml+ itself.
#
# @param docxml the parsed document; it is passed to every pass
# @param hash [Hash] options; :dir1 is handed to image_cleanup and
#   :liststyles to lists
# @return the same +docxml+ object
def self.cleanup(docxml, hash)
  image_cleanup(docxml, hash[:dir1])
  mathml_to_ooml(docxml)
  lists(docxml, hash[:liststyles])
  # footnotes runs before bookmarks: the bookmarks XPath skips elements
  # styled 'mso-element:footnote', which the footnote containers carry.
  footnotes(docxml)
  bookmarks(docxml)
  msonormal(docxml)
  docxml
end

.create_dir(filename, dir) ⇒ Object



23
24
25
26
27
28
# File 'lib/html2doc/base.rb', line 23

# Returns the directory used for the document's auxiliary files.
#
# A caller-supplied +dir+ is returned as is. Otherwise the directory
# "<filename>_files" is created if nothing exists at that path, and its name
# is returned.
#
# @param filename [String] output file stem
# @param dir [String, nil] caller-supplied directory, or nil
# @return [String] the directory path
def self.create_dir(filename, dir)
  return dir if dir

  dir = "#{filename}_files"
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  Dir.mkdir(dir) unless File.exist?(dir)
  dir
end

.define_head(docxml, hash) ⇒ Object



133
134
135
136
137
138
139
140
# File 'lib/html2doc/base.rb', line 133

# Populates the document head: inserts the stylesheet, delegates to
# define_head1, then declares namespaces on the root element.
#
# @param docxml the parsed document (responds to +at+ and +root+)
# @param hash [Hash] options; reads :filename, :header_file, :stylesheet
#   and :dir1
# @return the result of namespace(docxml.root)
def self.define_head(docxml, hash)
  head_path = "//*[local-name() = 'head']"
  title = docxml.at("#{head_path}/*[local-name() = 'title']")
  head = docxml.at(head_path)
  css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
  add_stylesheet(head, title, css)
  define_head1(docxml, hash[:dir1])
  namespace(docxml.root)
end

.define_head1(docxml, dir) ⇒ Object



105
106
107
108
109
110
111
112
# File 'lib/html2doc/base.rb', line 105

# Inserts the PRINT_VIEW block and a File-List link to "<dir>/filelist.xml"
# before the first child of every head element in +docxml+.
#
# Each head is expected to already have at least one child.
#
# @param docxml the parsed document (responds to +xpath+)
# @param dir [String] directory holding filelist.xml
# @return the node collection produced by the XPath query
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |head|
    first_child = head.children.first
    first_child.add_previous_sibling <<~XML
    #{PRINT_VIEW}
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
  end
end

.filename_substitute(stylesheet, header_filename, filename) ⇒ Object



114
115
116
117
118
119
120
121
# File 'lib/html2doc/base.rb', line 114

# Resolves the FILENAME placeholder in +stylesheet+ (modified in place).
#
# Without a header file, every line mentioning FILENAME is removed. With
# one, each FILENAME occurrence is replaced by +filename+.
#
# @param stylesheet [String] CSS text; mutated
# @param header_filename [String, nil] header file, or nil when there is none
# @param filename [String] replacement for the placeholder
# @return [String] the same +stylesheet+ object
def self.filename_substitute(stylesheet, header_filename, filename)
  if header_filename.nil?
    # Line-anchored so adjacent FILENAME lines are each removed. The previous
    # pattern required a literal "i" before the newline, so ordinary
    # declarations never matched.
    stylesheet.gsub!(/^[^\n]*FILENAME[^\n]*(?:\n|\z)/, "")
  else
    # Block form: backslashes in +filename+ are not read as back-references.
    stylesheet.gsub!(/FILENAME/) { filename }
  end
  stylesheet
end

.footnote?(a) ⇒ Boolean

Returns:

  • (Boolean)


67
68
69
70
# File 'lib/html2doc/notes.rb', line 67

# Tells whether the anchor +a+ is marked as a footnote reference: its
# "epub:type" or its "class" attribute equals "footnote", ignoring case.
#
# @param a [#[]] element whose attributes are read
# @return [Boolean, nil] true on a match; false or nil otherwise (nil when
#   the last attribute consulted is absent)
def self.footnote?(a)
  marked = ->(attr) { a[attr]&.casecmp("footnote")&.zero? }
  marked.call("epub:type") || marked.call("class")
end

.footnote_cleanup(docxml) ⇒ Object

We expect that the content of the footnote text received is one or more text containers, p or aside or div (which we have already converted to p). We do not expect any <a name> or links back to text; if they are present in the HTML, they need to have been cleaned out before passing to this gem



84
85
86
87
88
89
90
91
# File 'lib/html2doc/notes.rb', line 84

# Moves each anchor that is a direct child of a footnote div
# (style "mso-element:footnote") to the front of the element following it.
#
# The anchor is detached and re-inserted before the first child of its next
# element sibling. Anchors with no next element, or whose next element has
# no children, stay where they are.
#
# @param docxml the parsed document (responds to +xpath+)
# @return the same +docxml+ object
def self.footnote_cleanup(docxml)
  anchors = docxml.xpath('//div[@style="mso-element:footnote"]/a')
  anchors.each do |anchor|
    first_child = anchor.next_element&.children&.first
    first_child&.add_previous_sibling(anchor.remove)
  end
  docxml
end

.footnote_container(i) ⇒ Object



36
37
38
39
40
41
42
43
44
# File 'lib/html2doc/notes.rb', line 36

# Builds the markup for footnote number +i+: a div styled
# 'mso-element:footnote' with id "ftn<i>", holding an anchor that wraps the
# footnote reference mark.
#
# The anchor is closed explicitly so the fragment is well-formed and does
# not depend on parser error recovery.
#
# @param i [Integer] footnote number
# @return [String] markup fragment
def self.footnote_container(i)
  <<~DIV
    <div style='mso-element:footnote' id='ftn#{i}'>
      <a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
         name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
         class='MsoFootnoteReference'><span
         style='mso-special-character:footnote'></span></span></a></div>
  DIV
end

.footnote_div_to_p(f) ⇒ Object



25
26
27
28
29
30
31
32
33
34
# File 'lib/html2doc/notes.rb', line 25

# Normalizes a footnote body whose element name is div or aside.
#
# If it contains a p descendant, the element is replaced by its children.
# Otherwise it is renamed to p and given the class MsoFootnoteText. Elements
# with any other name are left untouched.
#
# @param f the footnote element
# @return nil for other element names; otherwise the result of the last
#   operation performed
def self.footnote_div_to_p(f)
  return unless %w{div aside}.include?(f.name)
  return f.replace(f.children) if f.at(".//p")

  f.name = "p"
  f["class"] = "MsoFootnoteText"
end

.footnotes(docxml) ⇒ Object



4
5
6
7
8
9
10
11
12
# File 'lib/html2doc/notes.rb', line 4

# Converts footnote links and collects their texts.
#
# Every anchor in +docxml+ is offered to process_footnote_link together with
# the next footnote number (starting at 1); the number advances only when
# that call reports success. The collected footnote texts are then handed to
# process_footnote_texts.
#
# @param docxml the parsed document (responds to +xpath+)
# @return the result of process_footnote_texts
def self.footnotes(docxml)
  collected = []
  counter = 1
  docxml.xpath("//a").each do |link|
    counter += 1 if process_footnote_link(docxml, link, counter, collected)
  end
  process_footnote_texts(docxml, collected)
end

.from_xhtml(xml) ⇒ Object



72
73
74
75
76
# File 'lib/html2doc/base.rb', line 72

# Serializes +xml+ and strips the XHTML scaffolding from the text: the first
# XHTML default-namespace declaration, the first occurrence of DOCTYPE, and
# the space before every self-closing "/>".
#
# @param xml [#to_xml] the parsed document
# @return [String] serialized markup
def self.from_xhtml(xml)
  serialized = xml.to_xml
  without_ns = serialized.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  without_doctype = without_ns.sub(DOCTYPE, "")
  without_doctype.gsub(%{ />}, "/>")
end

.generate_filelist(filename, dir) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
# File 'lib/html2doc/mime.rb', line 87

# Writes "<dir>/filelist.xml", listing the main .htm file followed by one
# entry per non-hidden item of +dir+, in sorted order.
#
# The directory is listed after filelist.xml has been opened for writing, so
# the file list includes itself.
#
# @param filename [String] stem of the main document
# @param dir [String] directory to list and to write into
# @return the result of the final write
def self.generate_filelist(filename, dir)
  target = File.join(dir, "filelist.xml")
  File.open(target, "w") do |out|
    out.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
      <o:MainFile HRef="../#{filename}.htm"/>}
    visible = Dir.entries(dir).sort.reject do |entry|
      entry == "." || entry == ".." || /^\./.match(entry)
    end
    visible.each { |entry| out.write %{  <o:File HRef="#{entry}"/>\n} }
    out.write("</xml>\n")
  end
end

.image_cleanup(docxml, dir) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/html2doc/mime.rb', line 74

# Copies every image referenced by an img element into +dir+ under a random
# UUID-based name, rewrites its src to the copy, and sets its width and
# height from image_resize.
#
# The copy keeps only the extension of the original file name, so dots in
# directory names or in the file stem do not leak into the new path. Images
# without a src are skipped.
#
# @param docxml the parsed document (responds to +xpath+)
# @param dir [String] destination directory for the copies
# @return the same +docxml+ object
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |i|
    src = i["src"]
    next if src.nil? || src.empty?

    uuid = UUIDTools::UUID.random_create.to_s
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(src)}")
    # presupposes that the image source is local
    # Argument-list form: no shell is involved, so spaces and shell
    # metacharacters in either path are passed through literally.
    system("cp", src, new_full_filename)
    i["width"], i["height"] = image_resize(i, 400, 680)
    i["src"] = new_full_filename
  end
  docxml
end

.image_resize(i, maxheight, maxwidth) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/html2doc/mime.rb', line 61

# Computes the [width, height] to use for the image element +i+.
#
# Dimensions come from the element's width/height attributes. When both are
# missing or zero the real size of the file at i["src"] is used; when only
# one is missing it is derived from the real aspect ratio, so the result
# never carries a spurious zero. The pair is then scaled down
# proportionally, rounding up, to fit the limits.
#
# NOTE: despite their names, +maxheight+ caps the first component (width)
# and +maxwidth+ caps the second (height). Callers rely on this order.
#
# @param i [#[]] image element; reads "width", "height" and "src"
# @param maxheight [Integer] upper bound for the width component
# @param maxwidth [Integer] upper bound for the height component
# @return [Array(Integer, Integer)] width and height
def self.image_resize(i, maxheight, maxwidth)
  # Ceiling division on integers; plain "/" would truncate.
  ceil_div = ->(num, den) { (num + den - 1) / den }
  size = [i["width"].to_i, i["height"].to_i]
  if size.all?(&:zero?)
    size = ImageSize.path(i["src"]).size
  elsif size.any?(&:zero?)
    real = ImageSize.path(i["src"]).size
    if real.all? { |dim| dim.to_i.positive? }
      size[1] = ceil_div.call(size[0] * real[1], real[0]) if size[1].zero?
      size[0] = ceil_div.call(size[1] * real[0], real[1]) if size[0].zero?
    end
  end
  # max height for Word document is 400, max width is 680
  if size[0] > maxheight
    size = [maxheight, ceil_div.call(size[1] * maxheight, size[0])]
  end
  if size[1] > maxwidth
    size = [ceil_div.call(size[0] * maxwidth, size[1]), maxwidth]
  end
  size
end

.list_add(xpath, liststyles, listtype, level, listnumber) ⇒ Object



19
20
21
22
23
24
25
26
27
28
# File 'lib/html2doc/lists.rb', line 19

# Applies Word list styling to each list in +xpath+ and recurses into the
# lists nested inside its items.
#
# At level 1 each list gets its own number (its 1-based position in
# +xpath+); deeper levels keep the number they were called with. Only items
# belonging directly to a list are styled at that level; items of nested
# lists are handled by the recursive calls, one level deeper.
#
# @param xpath [Enumerable] the lists to process
# @param liststyles [Hash] style names keyed by :ul / :ol
# @param listtype [Symbol] :ul or :ol
# @param level [Integer] nesting level, starting at 1
# @param listnumber [Integer, nil] number of the enclosing top-level list
# @return the +xpath+ collection
def self.list_add(xpath, liststyles, listtype, level, listnumber)
  xpath.each_with_index do |list, i|
    listnumber = i + 1 if level == 1
    own_items = list.xpath(".//li") - list.xpath(".//ol//li | .//ul//li")
    own_items.each do |li|
      style_list(li, level, liststyles[listtype], listnumber)
      %i[ul ol].each do |kind|
        nested = li.xpath(".//#{kind}") - li.xpath(".//ul//#{kind} | .//ol//#{kind}")
        list_add(nested, liststyles, kind, level + 1, listnumber)
      end
    end
  end
end

.lists(docxml, liststyles) ⇒ Object



30
31
32
33
34
35
36
37
38
# File 'lib/html2doc/lists.rb', line 30

# Styles all top-level lists (those with no ul/ol ancestor) in +docxml+.
#
# Unordered lists are processed only when +liststyles+ has a :ul key, and
# ordered lists only when it has an :ol key. Does nothing when +liststyles+
# is nil.
#
# @param docxml the parsed document (responds to +xpath+)
# @param liststyles [Hash, nil] style names keyed by :ul / :ol
# @return nil, or the result of the ordered-list pass when it ran
def self.lists(docxml, liststyles)
  return if liststyles.nil?

  top_level = ->(tag) { docxml.xpath("//#{tag}[not(ancestor::ul) and not(ancestor::ol)]") }
  list_add(top_level.call("ul"), liststyles, :ul, 1, nil) if liststyles.has_key?(:ul)
  list_add(top_level.call("ol"), liststyles, :ol, 1, nil) if liststyles.has_key?(:ol)
end

.mathml_to_ooml(docxml) ⇒ Object



38
39
40
41
42
43
44
45
46
# File 'lib/html2doc/math.rb', line 38

# Replaces every math element in +docxml+ with the markup produced by the
# @xslt transformer.
#
# For each element: the output of ooxml_cleanup is assigned to @xslt.xml and
# the transformer is served; processing instructions and xmlns declarations
# are stripped from the result; every tag whose name starts with a lowercase
# letter gets an "m:" prefix; the element is swapped for that markup.
#
# @param docxml the parsed document (responds to +xpath+)
# @return the node collection produced by the XPath query
def self.mathml_to_ooml(docxml)
  docxml.xpath("//*[local-name() = 'math']").each do |math|
    @xslt.xml = ooxml_cleanup(math)
    transformed = @xslt.serve
    without_pi = transformed.gsub(/<\?[^>]+>\s*/, "")
    without_ns = without_pi.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
    math.swap(without_ns.gsub(%r{<(/)?([a-z])}, "<\\1m:\\2"))
  end
end

.mime_attachment(boundary, filename, item, dir) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/html2doc/mime.rb', line 21

# Builds one MIME part holding the file "<dir>/<item>" as base64.
#
# The file is read in binary mode so its bytes reach the encoder unchanged
# on every platform. The encoded text is wrapped at 76 characters per line.
#
# @param boundary [String] MIME boundary
# @param filename [String] document stem, used in the Content-Location
# @param item [String] file name inside +dir+
# @param dir [String] directory containing the file
# @return [String] boundary line, part headers, blank line, encoded body
def self.mime_attachment(boundary, filename, item, dir)
  encoded_file = Base64.strict_encode64(
    File.binread("#{dir}/#{item}"),
  ).gsub(/(.{76})/, "\\1\n")
  <<~"FILE"
  --#{boundary}
  Content-Location: file:///C:/Doc/#{filename}_files/#{item}
  Content-Transfer-Encoding: base64
  Content-Type: #{mime_type(item)}

  #{encoded_file}

  FILE
end

.mime_boundary ⇒ Object



43
44
45
46
# File 'lib/html2doc/mime.rb', line 43

# Generates a MIME boundary string: "----=_NextPart_" followed by the first
# 18 characters of a random UUID whose dashes are replaced by dots.
#
# @return [String]
def self.mime_boundary
  token = UUIDTools::UUID.random_create.to_s
  salt = token.tr("-", ".")[0, 18]
  "----=_NextPart_#{salt}"
end

.mime_package(result, filename, dir) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/html2doc/mime.rb', line 48

# Assembles the MIME document and writes it to "<filename>.doc".
#
# Parts, in order: the preamble carrying +result+, the filelist.xml
# attachment, one attachment per remaining non-hidden entry of +dir+ (in
# directory iteration order), and the closing boundary.
#
# @param result [String] the main HTML text
# @param filename [String] output file stem
# @param dir [String] directory holding the attachments
# @return the result of the file write
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  parts = [
    mime_preamble(boundary, filename, result),
    mime_attachment(boundary, filename, "filelist.xml", dir),
  ]
  Dir.foreach(dir) do |entry|
    skipped = entry == "." || entry == ".." || /^\./.match(entry) ||
      entry == "filelist.xml"
    next if skipped

    parts << mime_attachment(boundary, filename, entry, dir)
  end
  parts << "--#{boundary}--"
  File.open("#{filename}.doc", "w") { |out| out.write parts.join }
end

.mime_preamble(boundary, filename, result) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/html2doc/mime.rb', line 7

# Builds the opening of the MIME document: the top-level multipart/related
# headers, then the first part (boundary line, Content-Location and
# Content-Type headers) carrying +result+ as its body, ending with a blank
# line.
#
# @param boundary [String] MIME boundary
# @param filename [String] document stem, used in the Content-Location
# @param result [String] the main HTML text
# @return [String]
def self.mime_preamble(boundary, filename, result)
  lines = [
    "MIME-Version: 1.0",
    %{Content-Type: multipart/related; boundary="#{boundary}"},
    "",
    "--#{boundary}",
    "Content-Location: file:///C:/Doc/#{filename}.htm",
    %{Content-Type: text/html; charset="utf-8"},
    "",
    "#{result}",
    "",
  ]
  lines.map { |line| "#{line}\n" }.join
end

.mime_type(item) ⇒ Object



36
37
38
39
40
41
# File 'lib/html2doc/mime.rb', line 36

# Returns the Content-Type value for the file name +item+.
#
# Uses the first type MIME::Types reports for the name. Text types get a
# charset="utf-8" parameter, separated by "; ". Names with no known type
# fall back to text/plain in UTF-8 (an empty lookup result is truthy, so
# emptiness has to be checked explicitly).
#
# @param item [String] file name
# @return [String] Content-Type header value
def self.mime_type(item)
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  type.start_with?("text") ? %{#{type}; charset="utf-8"} : type
end

.msonormal(docxml) ⇒ Object



173
174
175
176
177
178
179
180
# File 'lib/html2doc/base.rb', line 173

# Gives the class MsoNormal to every p element and every li element that
# has no class attribute.
#
# @param docxml the parsed document (responds to +xpath+)
# @return the li node collection produced by the second query
def self.msonormal(docxml)
  unclassed = ->(tag) { docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]") }
  unclassed.call("p").each { |node| node["class"] = "MsoNormal" }
  unclassed.call("li").each { |node| node["class"] = "MsoNormal" }
end

.msword_fix(r) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/html2doc/base.rb', line 78

def self.msword_fix(r)
  # brain damage in MSWord parser
  r.gsub!(%r{<span style="mso-special-character:footnote"/>},
          '<span style="mso-special-character:footnote"></span>')
  r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
          '<div style="mso-element:footnote-list"/>')
  r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
  r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
  r.gsub!(%r{<meta http-equiv="Content-Type"},
          "<meta http-equiv=Content-Type")
  r.gsub!(%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>')
  r
end

.namespace(root) ⇒ Object



152
153
154
155
156
157
158
159
# File 'lib/html2doc/base.rb', line 152

# Declares the Office namespaces (prefixes o, w and m) on +root+ and then
# adds http://www.w3.org/TR/REC-html40 as its prefix-less namespace.
#
# @param root the document's root element
# @return the result of the final add_namespace call
def self.namespace(root)
  prefixed = [
    ["o", "urn:schemas-microsoft-com:office:office"],
    ["w", "urn:schemas-microsoft-com:office:word"],
    ["m", "http://schemas.microsoft.com/office/2004/12/omml"],
  ]
  prefixed.each { |prefix, uri| root.add_namespace_definition(prefix, uri) }
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end

.ooxml_cleanup(m) ⇒ Object

random fixes to MathML input that OOXML needs to render properly



28
29
30
31
32
33
34
35
36
# File 'lib/html2doc/math.rb', line 28

# Adjusts a MathML element before conversion and returns it serialized.
#
# Every msup whose immediately preceding sibling is a munderover is wrapped
# in a new mrow; the element then gets the MathML namespace as its
# prefix-less namespace.
#
# @param m the math element
# @return [String] the element serialized with +to_s+
def self.ooxml_cleanup(m)
  namespaces = m.document.collect_namespaces
  m.xpath(".//xmlns:msup[name(preceding-sibling::*[1])='munderover']", namespaces).each do |msup|
    # replace returns the inserted nodes; the first one is the new mrow.
    wrapper = msup.replace("<mrow></mrow>").first
    wrapper.children = msup
  end
  m.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
  m.to_s
end

.process(result, hash) ⇒ Object



12
13
14
15
16
17
18
19
20
21
# File 'lib/html2doc/base.rb', line 12

# Runs the whole conversion: prepares the auxiliary directory, converts the
# HTML, copies the optional header file, writes the file list and the .htm
# file, packages everything as MIME, then removes temporary files.
#
# @param result [String] source HTML
# @param hash [Hash] options; reads :filename, :dir and :header_file, and
#   stores the working directory under :dir1
# @return the result of rm_temp_files
def self.process(result, hash)
  hash[:dir1] = create_dir(hash[:filename], hash[:dir])
  result = process_html(result, hash)
  unless hash[:header_file].nil?
    # Argument-list form: no shell is involved, so paths containing spaces
    # or shell metacharacters are copied correctly.
    system("cp", hash[:header_file].to_s, "#{hash[:dir1]}/header.html")
  end
  generate_filelist(hash[:filename], hash[:dir1])
  File.open("#{hash[:filename]}.htm", "w") { |f| f.write(result) }
  mime_package result, hash[:filename], hash[:dir1]
  rm_temp_files(hash[:filename], hash[:dir], hash[:dir1])
end


46
47
48
49
50
51
52
53
54
55
# File 'lib/html2doc/notes.rb', line 46

# Converts one anchor into a Word footnote reference, if it is one.
#
# The anchor must satisfy footnote? and its href (minus a leading "#") must
# name an element by name or id. On success the anchor's attributes and
# content are rewritten and the transformed footnote text is appended to
# +fn+.
#
# @param docxml the parsed document (responds to +at+)
# @param a the anchor element
# @param i [Integer] footnote number to assign
# @param fn [Array] accumulator for footnote texts; appended to on success
# @return false when the anchor is not a usable footnote link; otherwise
#   +fn+ (truthy)
def self.process_footnote_link(docxml, a, i, fn)
  return false unless footnote?(a)

  # An anchor marked as a footnote but lacking a target cannot be resolved;
  # skip it rather than fail on a nil href.
  href = a["href"]&.gsub(/^#/, "")
  return false if href.nil? || href.empty?

  note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
  return false if note.nil?

  set_footnote_link_attrs(a, i)
  a.children = "<span class='MsoFootnoteReference'>"\
    "<span style='mso-special-character:footnote'/></span>"
  fn << transform_footnote_text(note)
end

.process_footnote_texts(docxml, footnotes) ⇒ Object



14
15
16
17
18
19
20
21
22
23
# File 'lib/html2doc/notes.rb', line 14

# Appends a footnote-list div to the body and moves each collected footnote
# into its own numbered container inside it.
#
# Footnotes are numbered from 1 in the order given. Each one is reparented
# into a fresh footnote_container, then passed to footnote_div_to_p.
# footnote_cleanup runs once at the end.
#
# @param docxml the parsed document (responds to +at+)
# @param footnotes [Array] footnote elements, in order
# @return the result of footnote_cleanup
def self.process_footnote_texts(docxml, footnotes)
  list = docxml.at("//body").add_child("<div style='mso-element:footnote-list'/>")
  footnotes.each.with_index(1) do |note, number|
    container = list.first.add_child(footnote_container(number)).first
    note.parent = container
    footnote_div_to_p(note)
  end
  footnote_cleanup(docxml)
end

.process_html(result, hash) ⇒ Object



30
31
32
33
34
# File 'lib/html2doc/base.rb', line 30

# Converts source HTML into the markup that goes into the Word document.
#
# Steps: AsciiMath spans to MathML, parse as XHTML, clean up, populate the
# head, serialize, and apply the Word-specific text fixes.
#
# @param result [String] source HTML
# @param hash [Hash] options; :asciimathdelims is read here and the whole
#   hash is passed on to cleanup and define_head
# @return the output of msword_fix
def self.process_html(result, hash)
  with_math = asciimath_to_mathml(result, hash[:asciimathdelims])
  docxml = to_xhtml(with_math)
  cleaned = cleanup(docxml, hash)
  define_head(cleaned, hash)
  msword_fix(from_xhtml(docxml))
end

.rm_temp_files(filename, dir, dir1) ⇒ Object



36
37
38
39
# File 'lib/html2doc/base.rb', line 36

# Removes the intermediate "<filename>.htm" file and, unless the caller
# supplied its own directory (+dir+), the working directory +dir1+.
#
# The commands are run with argument lists, not through a shell: a path
# containing spaces is removed as one path instead of being split into
# several deletion targets.
#
# @param filename [String] output file stem
# @param dir [String, nil] caller-supplied directory, or nil
# @param dir1 [String] working directory actually used
# @return the result of the last command run, or nil
def self.rm_temp_files(filename, dir, dir1)
  system("rm", "--", "#{filename}.htm")
  system("rm", "-r", "--", dir1.to_s) unless dir
end


72
73
74
75
76
77
# File 'lib/html2doc/notes.rb', line 72

# Sets the attributes that mark anchor +a+ as reference number +i+: the
# mso-footnote-id style, an href to "#_ftn<i>", the name "_ftnref<i>" and
# an empty title.
#
# @param a [#[]=] the anchor element
# @param i [Integer] footnote number
# @return [String] the empty title string
def self.set_footnote_link_attrs(a, i)
  {
    "style" => "mso-footnote-id:ftn#{i}",
    "href" => "#_ftn#{i}",
    "name" => "_ftnref#{i}",
  }.each { |attr, value| a[attr] = value }
  a["title"] = ""
end

.style_list(li, level, liststyle, listnumber) ⇒ Object



9
10
11
12
13
14
15
16
17
# File 'lib/html2doc/lists.rb', line 9

# Appends the Word list declaration
# "mso-list:<liststyle> level<level> lfo<listnumber>;" to the style
# attribute of +li+. An existing style is kept and separated with ";".
# Does nothing when +liststyle+ is nil or false.
#
# @param li [#[], #[]=] the list item
# @param level [Integer] nesting level
# @param liststyle [String, nil] Word list style name
# @param listnumber [Integer, nil] list number
# @return [String, nil] the new style value, or nil when nothing was done
def self.style_list(li, level, liststyle, listnumber)
  return unless liststyle

  prefix = li["style"] ? "#{li["style"]};" : ""
  li["style"] = "#{prefix}mso-list:#{liststyle} level#{level} lfo#{listnumber};"
end

.stylesheet(filename, header_filename, fn) ⇒ Object



123
124
125
126
127
128
129
130
131
# File 'lib/html2doc/base.rb', line 123

# Builds the <style> element that carries the Word stylesheet.
#
# The CSS is read as UTF-8 from +fn+, or from wordstyle.css next to this
# source file when +fn+ is nil or empty. It goes through filename_substitute
# and is wrapped in an XML comment inside a style element.
#
# @param filename [String] document stem passed to filename_substitute
# @param header_filename [String, nil] header file passed to
#   filename_substitute
# @param fn [String, nil] path of the stylesheet to use
# @return [String] the serialized style element
def self.stylesheet(filename, header_filename, fn)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css") if fn.nil? || fn.empty?
  css = filename_substitute(File.read(fn, encoding: "UTF-8"), header_filename, filename)
  doc = Nokogiri::XML("<style/>")
  doc.children.first << Nokogiri::XML::Comment.new(doc, "\n#{css}\n")
  doc.root.to_s
end

.to_xhtml(xml) ⇒ Object



59
60
61
62
63
64
65
66
# File 'lib/html2doc/base.rb', line 59

# Parses HTML text as XML.
#
# XML declarations are stripped, and the XHTML 1.0 Strict doctype is
# prepended when the text has no doctype. The caller's string is not
# modified, so frozen input is accepted.
#
# @param xml [String] source markup
# @return the document returned by Nokogiri::XML.parse
def self.to_xhtml(xml)
  # Non-mutating gsub: the argument may be frozen or still in use by the
  # caller.
  xml = xml.gsub(/<\?xml[^>]*>/, "")
  unless /<!DOCTYPE /.match? xml
    xml = '<!DOCTYPE html SYSTEM
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
  end
  Nokogiri::XML.parse(xml)
end

.transform_footnote_text(note) ⇒ Object



57
58
59
60
61
62
63
64
65
# File 'lib/html2doc/notes.rb', line 57

# Prepares a footnote body for reuse and detaches it from the document.
#
# Its id is blanked, every descendant div is replaced by its own children,
# and every descendant aside or p becomes a p with class MsoFootnoteText.
#
# @param note the footnote element
# @return the result of removing +note+ from its document
def self.transform_footnote_text(note)
  note["id"] = ""
  wrappers = note.xpath(".//div")
  wrappers.each { |wrapper| wrapper.replace(wrapper.children) }
  paragraphs = note.xpath(".//aside | .//p")
  paragraphs.each do |para|
    para.name = "p"
    para["class"] = "MsoFootnoteText"
  end
  note.remove
end