Class: HeadlessHtmlEditor

Inherits:
Object
  • Object
show all
Defined in:
lib/headless_html_editor.rb

Overview

Headless HTML Editor. Edit HTML files programmatically.

Constant Summary collapse

# CSS class names produced by MS Word. #remove_word_artifacts strips the
# class attribute from every element carrying one of these classes.
UNWANTED_CLASSES =
  %w[
    MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp
    TOCEntry Indent1 MsoCaption MsoListParagraph
    MsoNormalTable MsoTableGrid MsoTableClassic1
    MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast
    MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText
  ]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input_file_name, input_encoding = 'utf-8') ⇒ HeadlessHtmlEditor

Create a new Headless HTML Editor.



18
19
20
21
22
23
24
25
26
27
# File 'lib/headless_html_editor.rb', line 18

# Create a new Headless HTML Editor.
#
# The file is parsed only when +input_file_name+ is an existing regular file
# whose name matches the glob <tt>**.html</tt>. In every other case no DOM is
# built and #dom returns nil.
#
# @param input_file_name [String] path of the HTML file to load; it is also
#   remembered as the target used by #save!
# @param input_encoding [String] encoding name used when opening the file
def initialize(input_file_name, input_encoding = 'utf-8')
  @input_file_name = input_file_name
  return unless File.file?(input_file_name) && File.fnmatch?('**.html', input_file_name)

  # read html file
  puts "R: #{input_file_name}"
  # The block form of File.open closes the handle as soon as parsing is done;
  # a bare open(...) left the descriptor open until garbage collection.
  @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |file|
    Nokogiri::HTML(file)
  end
end

Instance Attribute Details

#domObject (readonly)

Returns the value of attribute dom.



15
16
17
# File 'lib/headless_html_editor.rb', line 15

# Returns the value of attribute dom.
#
# In the code shown here @dom is assigned only by #initialize, and only when
# the input file exists and its name matches <tt>**.html</tt>; otherwise the
# instance variable is never set and this returns nil.
#
# @return [Object, nil] the parsed document, or nil when nothing was loaded
def dom
  @dom
end

Class Method Details

.bulk_edit(file_list_file_name, &block) ⇒ Object

Edit files listed in a text file. File names are absolute. If the first character on a line is # the line is ignored.



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/headless_html_editor.rb', line 169

# Edit every file named in a text file, one file name per line.
#
# Lines whose first character is # are skipped. Every other line is stripped
# of surrounding whitespace and skipped unless it names an existing regular
# file. For each remaining file an editor is created; when the editor has a
# DOM it is yielded to the block and then saved in place, otherwise a
# "No DOM found" message is printed.
#
# @param file_list_file_name [String] path of the list file (expanded with
#   File.expand_path before reading)
# @yield [editor] the editor for each file that could be parsed
def self.bulk_edit(file_list_file_name, &block)
  list_path = File.expand_path(file_list_file_name)
  File.readlines(list_path).each do |line|
    next if line.start_with?('#')

    # Strip added to remove trailing newline characters.
    line.strip!
    next unless File.file?(line)

    editor = new(line)
    if editor.dom.nil?
      puts "No DOM found in #{line}."
      next
    end

    yield editor
    editor.save!
  end
end

.edit_folder(folder, &block) ⇒ Object

Edit all HTML files in a folder.



152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/headless_html_editor.rb', line 152

# Edit all HTML files in a folder.
#
# Backslashes in +folder+ are converted to forward slashes before the
# directory is opened. An editor is created for every regular file in the
# directory (sub-directories are not entered); editors without a DOM are
# skipped silently, the others are yielded to the block and then saved in
# place.
#
# @param folder [String] path of the directory to process
# @yield [editor] the editor for each file that could be parsed
def self.edit_folder(folder, &block)
  normalized = folder.gsub(/\\/, '/')
  Dir.open(normalized) do |dir|
    dir.each do |entry|
      path = File.join(dir.path, entry)
      next unless File.file?(path)

      editor = new(path)
      next if editor.dom.nil?

      yield editor
      editor.save!
    end
  end
end

Instance Method Details

#accept_word_changes_trackedObject

Change tracking in MS Word adds a lot of ins and del tags. The del elements are removed together with their content; the ins tags are removed, but their content is kept.



116
117
118
119
120
121
# File 'lib/headless_html_editor.rb', line 116

# Accept the changes recorded by MS Word's change tracking.
#
# Every del element is removed from the document, and every ins element is
# replaced by its own inner HTML so the inserted content stays.
def accept_word_changes_tracked
  deletions = @dom.css('del')
  deletions.remove
  insertions = @dom.css('ins')
  insertions.each { |node| node.replace(node.inner_html) }
end

#demote_headingsObject

Change h1 to h2 and so on. h6 is not changed, so it's a potential mess: former h5 headings end up at the same level as existing h6 headings.



124
125
126
127
128
# File 'lib/headless_html_editor.rb', line 124

# Push every h1-h5 heading one level down (h1 becomes h2, ... h5 becomes h6).
#
# h6 elements are not selected and keep their tag name, so after the call
# former h5 headings and original h6 headings share the same level.
def demote_headings
  @dom.css('h1, h2, h3, h4, h5').each do |node|
    # The tag name is "h" followed by a single digit; index 1 is that digit.
    level = node.name[1].to_i
    node.name = "h#{level + 1}"
  end
end

#remove_header_scriptsObject

Remove script tags from the document. Despite the method name, every script element is removed, not only those in the header.



111
112
113
# File 'lib/headless_html_editor.rb', line 111

# Remove script tags.
#
# The selector is a plain 'script', so every script element in the document
# is removed, not only those inside the head.
def remove_header_scripts
  script_nodes = @dom.css('script')
  script_nodes.remove
end

#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ Object

Cleanup after MS Word.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/headless_html_editor.rb', line 36

# Cleanup after MS Word. Works on @dom in place and performs, in order:
#
# 1. removes the Generator meta tag;
# 2. unwraps named anchors that no link in the document points to;
# 3. moves heading anchor names onto the heading as an id and strips the
#    leading section number from the heading;
# 4. removes the class attribute from elements using one of UNWANTED_CLASSES;
# 5. unwraps the WordSection1..WordSection8 containers;
# 6. optionally rewrites the Word table of contents as a nested ordered list;
# 7. removes paragraphs that hold neither text nor an image;
# 8. removes br elements that directly follow a table.
#
# @param options [Hash] only +:rebuild_toc+ is read; when it is truthy the
#   table of contents is rebuilt. Note that passing a hash without that key
#   disables the rebuild, because the default hash is replaced as a whole.
# @return [Object] whatever the final <tt>css('table + br').remove</tt> returns
def remove_word_artifacts(options = { rebuild_toc: true })
  @dom.css('meta[name="Generator"]').remove
  # Remove abandoned anchors that are not linked to, keeping their content.
  @dom.css('a[name]').each do |a|
    next unless @dom.css('a[href="#' + a['name'] + '"]').empty?

    puts "<a name=\"#{a['name']}\"> was removed, because it had no links to it."
    a.replace(a.inner_html)
  end
  # Clean up h1-h6 tags
  @dom.css('h1, h2, h3, h4, h5, h6').each do |heading|
    a = heading.at_css('a[name]')
    if a
      heading['id'] = a['name'].sub(/_Toc/, 'Toc')
      a.replace(a.inner_html)
    end
    # Drop the leading section number ("1.2.") and the non-breaking spaces
    # after it. The non-breaking space is U+00A0 (the previous \uC2A0 was the
    # UTF-8 byte pair C2 A0 misread as a code point). The entity form is
    # matched as well because the serializer may emit either one.
    heading.inner_html = heading.inner_html.sub(/\A(\s*\d+\.?)+(?:\u00A0|&nbsp;)*/, '').strip
  end
  # Remove Word's "normal" classes.
  UNWANTED_CLASSES.each do |class_name|
    @dom.css(".#{class_name}").each do |node|
      node.remove_attribute('class')
    end
  end
  # Remove unwanted section tags
  @dom.css('.WordSection1, .WordSection2, .WordSection3, .WordSection4, .WordSection5, .WordSection6, .WordSection7, .WordSection8').each do |section|
    puts "Removing #{section.name}.#{section['class']}"
    section.replace(section.inner_html)
  end
  if options[:rebuild_toc]
    # Remove page numbers from TOC
    @dom.css('.MsoToc1 a, .MsoToc2 a, .MsoToc3 a, .MsoToc4 a').each do |item|
      item.inner_html = item.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end
    # Rewrite Toc as ordered list.
    toc_item = @dom.at_css('.MsoToc1')
    previous_toc_level = 0
    new_toc = []
    while toc_item
      # Join the entry onto a single line (every newline, not just the first).
      toc_item.inner_html = toc_item.inner_html.tr("\n", ' ')
      # The class is expected to start with "MsoToc<n>"; index 6 is the digit.
      current_toc_level = toc_item.attr('class')[6].to_i
      # NOTE(review): a drop of more than one level still closes only one
      # list, and the final list is never closed explicitly; the markup
      # presumably relies on the HTML parser to repair it -- confirm.
      new_toc << "</li>\n" if previous_toc_level == current_toc_level
      new_toc << "</ol>\n</li>\n" if previous_toc_level > current_toc_level
      new_toc << "\n<ol#{' id="toc"' if previous_toc_level == 0}>\n" if previous_toc_level < current_toc_level
      link = toc_item.at_css('a')
      if link.nil?
        # Entries without a link are only reported, not added to the list.
        puts toc_item.to_s
      else
        link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
        new_toc << "<li>#{toc_item.inner_html.sub(/#_Toc/, '#Toc')}"
      end
      previous_toc_level = current_toc_level
      # next_element already skips text nodes, so no extra loop is needed.
      toc_item = toc_item.next_element
      toc_class = toc_item && toc_item.attr('class')
      # Stop at the first sibling that is not a TOC entry.
      toc_item = nil unless toc_class && toc_class.start_with?('MsoToc')
    end
    first_toc_entry = @dom.at_css('.MsoToc1')
    first_toc_entry.replace(new_toc.join('')) if first_toc_entry
    # Remove old Table of Contents
    @dom.css('.MsoToc1, .MsoToc2, .MsoToc3, .MsoToc4').each(&:remove)
  end
  # Remove empty paragraphs
  @dom.css('p').each do |paragraph|
    # String#strip leaves non-breaking spaces (U+00A0) in place, so they are
    # deleted first.
    visible_text = paragraph.content.delete("\u00A0").strip
    next unless visible_text.empty?
    next if paragraph.at_css('img')

    puts 'Removing empty paragraph.'
    paragraph.remove
  end
  @dom.css('table + br').remove
  # Unused pattern kept from the original source; it looks like a draft for
  # stripping conditional comments -- TODO confirm intent:
  #  /<!--\[if[.\n\r]+\[endif\]\s*-->/
end

#save!(output_encoding = 'utf-8') ⇒ Object

Save the file with the same file name.



131
132
133
# File 'lib/headless_html_editor.rb', line 131

# Save the document back to the file it was loaded from.
#
# Delegates to #save_as! with the file name given to the constructor.
#
# @param output_encoding [String] encoding passed on to #save_as!
def save!(output_encoding = 'utf-8')
  target = @input_file_name
  save_as!(target, output_encoding)
end

#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ Object

Save file with a new file name.



136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/headless_html_editor.rb', line 136

# Save file with a new file name.
#
# The document is serialized with an indent of 2 and written to
# +output_file_name+ when that file is writable or does not exist yet. A
# read-only target is reported on stderr and left untouched. Any
# StandardError raised while writing is reported on stderr instead of being
# propagated.
#
# @param output_file_name [String] path of the file to write
# @param output_encoding [String] encoding used both for serializing the
#   document and for opening the output file
def save_as!(output_file_name, output_encoding = 'utf-8')
  puts "W: #{output_file_name}"
  begin
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2; there
    # the old call raised NoMethodError for every new target, the rescue below
    # swallowed it, and the file was never written.
    if File.writable?(output_file_name) || !File.exist?(output_file_name)
      File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |f|
        f.write @dom.to_html({ encoding: output_encoding, indent: 2 })
      end
    else
      $stderr.puts 'Failed: Read only!'
    end
  rescue StandardError => e
    $stderr.puts "\nFailed!\n#{e.message}"
  end
end