Module: Html2Doc

Defined in:
lib/html2doc/base.rb,
lib/html2doc/mime.rb,
lib/html2doc/version.rb

Constant Summary

VERSION =
"0.0.1".freeze

Class Method Summary

Class Method Details

.cleanup(docxml, dir) ⇒ Object



16
17
18
19
# File 'lib/html2doc/mime.rb', line 16

# Runs the document clean-up passes in order: image handling first, then
# default paragraph/list-item classes.
#
# @param docxml the parsed document, handed to both passes
# @param dir directory handed to image_cleanup
# @return whatever msonormal returns
def self.cleanup(docxml, dir)
  image_cleanup(docxml, dir)
  classed = msonormal(docxml)
  classed
end

.define_head(docxml, dir, filename, header_file) ⇒ Object



95
96
97
98
99
100
101
102
103
104
# File 'lib/html2doc/base.rb', line 95

# Inserts the Word stylesheet into the document head, then adds the
# Word-specific head preamble via define_head1.
#
# The stylesheet goes right after <title> when there is one, otherwise in
# front of the head's first child. An empty head (no children at all) gets the
# stylesheet appended as its only child; previously that case raised
# NoMethodError because `children.first` was nil.
#
# @param docxml the parsed document (queried with #at)
# @param dir passed through to define_head1
# @param filename passed through to stylesheet
# @param header_file passed through to stylesheet
# @return whatever define_head1 returns
def self.define_head(docxml, dir, filename, header_file)
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
  head = docxml.at("//*[local-name() = 'head']")
  css = stylesheet(filename, header_file)
  if title
    title.add_next_sibling css
  elsif (first_child = head.children.first)
    first_child.add_previous_sibling css
  else
    head.add_child css
  end
  define_head1(docxml, dir)
end

.define_head1(docxml, dir) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/html2doc/base.rb', line 64

# Prepends the Word-specific preamble to every <head> element: a conditional
# <w:WordDocument> settings block, a Content-Type <meta>, and a File-List
# <link> pointing at "#{dir}/filelist.xml".
#
# The listing had been flattened into a single escaped string literal that
# swallowed the closing `end`s; this restores the heredoc form.
#
# @param docxml the parsed document (queried with #xpath)
# @param dir directory name used in the File-List href
# @return the result of iterating the matched head elements
def self.define_head1(docxml, dir)
  docxml.xpath("//*[local-name() = 'head']").each do |h|
    preamble = <<~XML
      <!--[if gte mso 9]>
      <xml>
      <w:WordDocument>
      <w:View>Print</w:View>
      <w:Zoom>100</w:Zoom>
      <w:DoNotOptimizeForBrowser/>
      </w:WordDocument>
      </xml>
      <![endif]-->
      <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
      <link rel="File-List" href="#{dir}/filelist.xml"/>
    XML
    first_child = h.children.first
    # An empty head has no first child to insert in front of.
    if first_child
      first_child.add_previous_sibling preamble
    else
      h.add_child preamble
    end
  end
end

.generate_filelist(filename, dir) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/html2doc/base.rb', line 158

# Writes "filelist.xml" into +dir+: an <o:MainFile> entry for
# "../#{filename}.htm" followed by one <o:File> entry per non-hidden entry
# of +dir+.
#
# The listing had been flattened into a single escaped string literal that
# swallowed the rest of the method; this restores the heredoc form. Entries
# are now sorted so the generated file is deterministic (Dir.foreach order
# is platform-dependent).
#
# @param filename [String] base name of the main document, without extension
# @param dir [String] directory to scan and to write filelist.xml into
def self.generate_filelist(filename, dir)
  File.open(File.join(dir, "filelist.xml"), "w") do |f|
    f.write(<<~XML)
      <xml xmlns:o="urn:schemas-microsoft-com:office:office">
       <o:MainFile HRef="../#{filename}.htm"/>
    XML
    # filelist.xml already exists at this point, so it is listed as well.
    Dir.entries(dir).sort.each do |item|
      next if item.start_with?(".")
      f.write %{  <o:File HRef="#{item}"/>\n}
    end
    f.write("</xml>\n")
  end
end

.image_cleanup(docxml, dir) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/html2doc/base.rb', line 48

# Copies every <img> source into +dir+ under a fresh UUID-based name, rewrites
# the element's src to the copy, and sets width/height from image_resize.
#
# Changes from the previous version:
# * the extension comes from File.extname, so sources without an extension no
#   longer raise, and paths with earlier dots ("./img/a.b.png") no longer
#   produce a "suffix" containing path separators;
# * the copy no longer goes through a shell, so paths containing spaces or
#   shell metacharacters work; a missing source now raises instead of being
#   ignored silently by `cp`;
# * <img> elements without a src are skipped.
#
# Presupposes that the image source is a local file path.
#
# @param docxml the parsed document (queried with #xpath)
# @param dir [String] directory that receives the image copies
# @return docxml
def self.image_cleanup(docxml, dir)
  docxml.xpath("//*[local-name() = 'img']").each do |i|
    src = i["src"]
    next if src.nil? || src.empty?

    uuid = UUIDTools::UUID.random_create.to_s
    new_full_filename = File.join(dir, "#{uuid}#{File.extname(src)}")
    IO.copy_stream(src, new_full_filename)
    # Dimensions are measured on the original file, before src is rewritten.
    i["width"], i["height"] = image_resize(src)
    i["src"] = new_full_filename
  end
  docxml
end

.image_resize(orig_filename) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/html2doc/base.rb', line 34

# Computes display dimensions for an image so that it fits the Word page:
# at most 400 wide and 680 high, scaling the other dimension proportionally.
#
# The scaled dimension is rounded up. The previous code called `.ceil` on the
# result of integer division, which had already truncated, so very thin images
# could end up with a dimension of 0.
#
# @param orig_filename [String] path handed to ImageSize.path
# @return [Array(Integer, Integer)] width and height
def self.image_resize(orig_filename)
  width, height = ImageSize.path(orig_filename).size
  # max width for Word document is 400, max height is 680
  if width > 400
    height = (height * 400).fdiv(width).ceil
    width = 400
  end
  if height > 680
    width = (width * 680).fdiv(height).ceil
    height = 680
  end
  [width, height]
end

.mime_attachment(boundary, filename, item, dir) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/html2doc/base.rb', line 120

# Builds one MIME part for a file in +dir+: the boundary line, the part
# headers, and the file content base64-encoded in lines of at most 76
# characters.
#
# The listing had been flattened into a single escaped string literal that
# swallowed the closing `end`; this restores the heredoc form. The file is
# now read in binary mode so attachments are not altered by text-mode newline
# or encoding conversion.
#
# @param boundary [String] MIME boundary, without the leading "--"
# @param filename [String] base name of the main document
# @param item [String] name of the file inside +dir+
# @param dir [String] directory containing +item+
# @return [String] the MIME part
def self.mime_attachment(boundary, filename, item, dir)
  encoded_file = Base64.strict_encode64(
    File.binread("#{dir}/#{item}"),
  ).gsub(/(.{76})/, "\\1\n")
  <<~FILE
    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}_files/#{item}
    Content-Transfer-Encoding: base64
    Content-Type: #{mime_type(item)}

    #{encoded_file}

  FILE
end

.mime_boundary ⇒ Object



142
143
144
145
# File 'lib/html2doc/base.rb', line 142

# Builds a MIME boundary string: the fixed "----=_NextPart_" prefix followed
# by the first 18 characters of a random UUID whose dashes are replaced by
# dots.
#
# @return [String]
def self.mime_boundary
  dotted = UUIDTools::UUID.random_create.to_s.tr("-", ".")
  "----=_NextPart_" + dotted[0, 18]
end

.mime_package(result, filename, dir) ⇒ Object



147
148
149
150
151
152
153
154
155
156
# File 'lib/html2doc/base.rb', line 147

# Assembles the MIME document and writes it to "#{filename}.doc": the
# preamble carrying the HTML, one attachment part per regular, non-hidden
# file in +dir+, and the closing boundary.
#
# Entries are processed in sorted order so the output is deterministic, and
# subdirectories are skipped (they previously raised when read as a file).
#
# @param result [String] the HTML to embed
# @param filename [String] output path without the ".doc" extension
# @param dir [String] directory whose files become attachments
# @return the value of the final write
def self.mime_package(result, filename, dir)
  boundary = mime_boundary
  parts = [mime_preamble(boundary, filename, result)]
  Dir.entries(dir).sort.each do |item|
    next if item.start_with?(".")
    next unless File.file?(File.join(dir, item))

    parts << mime_attachment(boundary, filename, item, dir)
  end
  parts << "--#{boundary}--"
  File.open("#{filename}.doc", "w") { |f| f.write parts.join }
end

.mime_preamble(boundary, filename, result) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/html2doc/base.rb', line 106

# Builds the start of the MIME document: the multipart/related headers and
# the first part, which carries the HTML in +result+.
#
# The listing had been flattened into a single escaped string literal that
# swallowed the closing `end`; this restores the heredoc form.
#
# @param boundary [String] MIME boundary, without the leading "--"
# @param filename [String] base name used in the Content-Location
# @param result [String] the HTML to embed
# @return [String]
def self.mime_preamble(boundary, filename, result)
  <<~PREAMBLE
    MIME-Version: 1.0
    Content-Type: multipart/related; boundary="#{boundary}"

    --#{boundary}
    Content-Location: file:///C:/Doc/#{filename}.htm
    Content-Type: text/html; charset="utf-8"

    #{result}

  PREAMBLE
end

.mime_type(item) ⇒ Object



135
136
137
138
139
140
# File 'lib/html2doc/base.rb', line 135

# Returns the Content-Type value for a file name.
#
# Uses the first match from MIME::Types.type_for. When there is no match the
# fallback 'text/plain; charset="utf-8"' is returned; the previous truthiness
# test on the lookup result never selected it for an empty result, yielding
# an empty Content-Type. Text types get a charset parameter, now separated by
# the ";" that MIME parameter syntax requires.
#
# @param item [String] file name
# @return [String]
def self.mime_type(item)
  type = Array(MIME::Types.type_for(item)).first.to_s
  return 'text/plain; charset="utf-8"' if type.empty?

  type.start_with?("text") ? %(#{type}; charset="utf-8") : type
end

.msonormal(docxml) ⇒ Object



189
190
191
192
193
194
195
196
# File 'lib/html2doc/mime.rb', line 189

# Gives every <p> and then every <li> element that has no class attribute
# the class "MsoNormal".
#
# @param docxml the parsed document (queried with #xpath)
# @return the result of iterating the <li> matches
def self.msonormal(docxml)
  %w[p li].map do |tag|
    unclassed = docxml.xpath("//*[local-name() = '#{tag}'][not(self::*[@class])]")
    unclassed.each { |node| node["class"] = "MsoNormal" }
  end.last
end

.msword_fix(r) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
# File 'lib/html2doc/base.rb', line 22

# Rewrites markup that the MS Word parser does not handle: expands
# self-closing footnote spans and comment-reference anchors into open/close
# pairs, removes the quotes around the File-List and Content-Type attribute
# values, and turns "&tab;" / "&amp;tab;" into a Word tab span.
#
# @param r [String] serialized document
# @return [String] a new string with the substitutions applied in order
def self.msword_fix(r)
  substitutions = [
    [%r{<span style="mso-special-character:footnote"/>},
     '<span style="mso-special-character:footnote"></span>'],
    [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
    [%r{<link rel="File-List"}, "<link rel=File-List"],
    [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
    [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
  ]
  substitutions.reduce(r) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end

.namespace(root) ⇒ Object



114
115
116
117
118
119
120
121
# File 'lib/html2doc/mime.rb', line 114

# Declares the Office ("o"), Word ("w") and OMML ("m") namespace prefixes on
# +root+, then sets the HTML 4.0 URI as its default namespace.
#
# @param root the element receiving the namespace declarations
# @return whatever root.add_namespace returns
def self.namespace(root)
  prefixes = {
    "o" => "urn:schemas-microsoft-com:office:office",
    "w" => "urn:schemas-microsoft-com:office:word",
    "m" => "http://schemas.microsoft.com/office/2004/12/omml",
  }
  prefixes.each_pair do |prefix, uri|
    root.add_namespace_definition(prefix, uri)
  end
  root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
end

.process(result, filename, header_file, dir) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/html2doc/base.rb', line 5

# Converts an HTML string into Word output: writes "#{filename}.htm", fills
# +dir+ with the copied images, filelist.xml and (optionally) header.html,
# and finally packages everything through mime_package.
#
# Changes from the previous version:
# * the caller's +result+ string is no longer modified in place (frozen
#   strings are accepted);
# * the header file is copied without going through a shell, so paths with
#   spaces or shell metacharacters work; a missing header file now raises
#   instead of being ignored silently by `cp`.
#
# @param result [String] source HTML
# @param filename [String] output path without extension
# @param header_file [String, nil] optional header file copied to
#   "#{dir}/header.html"
# @param dir [String] directory for the document's auxiliary files
# @return whatever mime_package returns
def self.process(result, filename, header_file, dir)
  source = result
  # preserve HTML escapes
  unless /<!DOCTYPE html/.match?(source)
    source = "<!DOCTYPE html SYSTEM " \
      "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" +
      source.gsub(/<\?xml version="1.0"\?>/, "")
  end
  docxml = Nokogiri::XML(source)
  image_cleanup(docxml, dir)
  define_head(docxml, dir, filename, header_file)
  output = msword_fix(docxml.to_xml)
  unless header_file.nil?
    IO.copy_stream(header_file, File.join(dir, "header.html"))
  end
  generate_filelist(filename, dir)
  File.open("#{filename}.htm", "w") { |f| f.write(output) }
  mime_package output, filename, dir
end

.stylesheet(filename, header_filename) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/html2doc/base.rb', line 82

# Loads wordstyle.css from this file's directory and returns it as a <style>
# element whose content is an XML comment wrapping the CSS.
#
# Without a header file, every CSS line mentioning FILENAME is dropped;
# otherwise FILENAME is replaced by +filename+. The previous removal pattern
# contained a stray "i" before its final newline (apparently a misplaced
# regexp flag), so it only matched lines ending in "i" and left the
# placeholder lines in place.
#
# @param filename [String] value substituted for FILENAME
# @param header_filename [String, nil] nil selects removal of FILENAME lines
# @return [String] serialized <style> element
def self.stylesheet(filename, header_filename)
  fn = File.join(File.dirname(__FILE__), "wordstyle.css")
  css = File.read(fn, encoding: "UTF-8")
  css =
    if header_filename.nil?
      css.lines.reject { |line| line.include?("FILENAME") }.join
    else
      # Block form: backslashes in the file name are inserted literally.
      css.gsub("FILENAME") { filename }
    end
  xml = Nokogiri::XML("<style/>")
  xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{css}\n")
  xml.root.to_s
end

.xhtml(result) ⇒ Object

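Preserves HTML escapes: prepends an XHTML DOCTYPE (after removing any XML declaration) unless the document already declares one.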



22
23
24
25
26
27
28
29
# File 'lib/html2doc/mime.rb', line 22

# Ensures the document text starts with an HTML DOCTYPE.
#
# Text that already contains "<!DOCTYPE html" is returned unchanged.
# Otherwise any `<?xml version="1.0"?>` declaration is removed and the XHTML
# 1.0 Strict DOCTYPE is prepended. The argument is no longer modified in
# place, so frozen strings are accepted and the caller's copy stays intact.
#
# @param result [String] document text
# @return [String]
def self.xhtml(result)
  return result if /<!DOCTYPE html/.match?(result)

  "<!DOCTYPE html SYSTEM " \
    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" +
    result.gsub(/<\?xml version="1.0"\?>/, "")
end