Class: HeadlessHtmlEditor
- Inherits:
-
Object
- Object
- HeadlessHtmlEditor
- Defined in:
- lib/headless_html_editor.rb
Overview
Headless HTML Editor. Edit HTML files programmatically.
Constant Summary collapse
- UNWANTED_CLASSES =
%w{MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp TOCEntry Indent1 MsoCaption MsoListParagraph MsoNormalTable MsoTableGrid MsoTableClassic1 MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText}
Instance Attribute Summary collapse
-
#dom ⇒ Object
readonly
Returns the value of attribute dom.
Class Method Summary collapse
-
.bulk_edit(file_list_file_name, &block) ⇒ Object
Edit files listed in a text file.
-
.edit_folder(folder, &block) ⇒ Object
Edit all HTML files in a folder.
Instance Method Summary collapse
-
#accept_word_changes_tracked ⇒ Object
Change tracking in MS Word adds a lot of ins and del tags.
-
#demote_headings ⇒ Object
Change h1 to h2 and so on.
-
#initialize(input_file_name, input_encoding = 'utf-8') ⇒ HeadlessHtmlEditor
constructor
Create a new Headless HTML Editor.
-
#remove_header_scripts ⇒ Object
Remove script tags from the header.
-
#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ Object
Cleanup after MS Word.
-
#save!(output_encoding = 'utf-8') ⇒ Object
Save the file with the same file name.
-
#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ Object
Save file with a new file name.
Constructor Details
#initialize(input_file_name, input_encoding = 'utf-8') ⇒ HeadlessHtmlEditor
Create a new Headless HTML Editor.
18 19 20 21 22 23 24 25 26 27 |
# File 'lib/headless_html_editor.rb', line 18

# Create a new Headless HTML Editor.
#
# The file is parsed only when +input_file_name+ is an existing regular file
# whose name matches the pattern '**.html'. In every other case @dom is left
# unset, so #dom returns nil.
#
# @param input_file_name [String] path of the HTML file to load.
# @param input_encoding [String] encoding used when reading the file.
def initialize(input_file_name, input_encoding = 'utf-8')
  @input_file_name = input_file_name
  return unless File.file?(input_file_name) && File.fnmatch?('**.html', input_file_name)

  # read html file
  puts "R: #{input_file_name}"
  # Block-form File.open closes the handle as soon as parsing has finished,
  # and, unlike Kernel#open, never interprets a leading "|" as a command.
  @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |file|
    Nokogiri::HTML(file)
  end
end
Instance Attribute Details
#dom ⇒ Object (readonly)
Returns the value of attribute dom.
15 16 17 |
# File 'lib/headless_html_editor.rb', line 15

# Reader for the document stored in @dom.
#
# @return [Object, nil] the value held in @dom; nil when nothing was assigned.
def dom
  @dom
end
Class Method Details
.bulk_edit(file_list_file_name, &block) ⇒ Object
Edit files listed in a text file. File names are absolute. If the first character on a line is # the line is ignored.
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# File 'lib/headless_html_editor.rb', line 169

# Edit files listed in a text file, one file name per line.
#
# Lines whose first character is '#' are ignored. Every other line is
# stripped of surrounding whitespace and, when it names an existing regular
# file, an editor is created for it. Editors without a DOM are reported on
# stdout; all others are yielded to the block and then saved in place.
#
# @param file_list_file_name [String] path of the text file holding the list.
# @yield [editor] the editor for each listed file that has a DOM.
def self.bulk_edit(file_list_file_name, &block)
  txt_file_name = File.expand_path(file_list_file_name)
  File.readlines(txt_file_name).each do |line|
    next if line.start_with?('#')

    # Strip added to remove trailing newline characters.
    file_name = line.strip
    next unless File.file?(file_name)

    editor = new(file_name)
    if editor.dom.nil?
      puts "No DOM found in #{file_name}."
    else
      yield editor
      editor.save!
    end
  end
end
.edit_folder(folder, &block) ⇒ Object
Edit all HTML files in a folder.
152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/headless_html_editor.rb', line 152

# Edit all HTML files in a folder.
#
# Backslashes in +folder+ are converted to forward slashes before the
# directory is opened. An editor is created for every regular file directly
# inside the folder; entries whose editor has no DOM are skipped silently,
# the others are yielded to the block and then saved in place.
#
# @param folder [String] path of the directory to process.
# @yield [editor] the editor for each file that has a DOM.
def self.edit_folder(folder, &block)
  Dir.open(folder.gsub(/\\/, '/')) do |dir|
    dir.each do |entry|
      path = File.join(dir.path, entry)
      next unless File.file?(path)

      editor = new(path)
      next if editor.dom.nil?

      yield editor
      editor.save!
    end
  end
end
Instance Method Details
#accept_word_changes_tracked ⇒ Object
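Change tracking in MS Word adds a lot of ins and del tags. This method removes them: del elements are deleted together with their content, and ins elements are replaced by their content.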
116 117 118 119 120 121 |
# File 'lib/headless_html_editor.rb', line 116

# Accept the changes tracked by MS Word.
#
# Every del element is removed from @dom, and every ins element is replaced
# by its own inner HTML, so the inserted content stays and the wrapper goes.
def accept_word_changes_tracked
  @dom.css('del').remove
  insertions = @dom.css('ins')
  insertions.each { |ins| ins.replace(ins.inner_html) }
end
#demote_headings ⇒ Object
Change h1 to h2 and so on. h6 is not changed, so it's a potential mess: demoted h5 headings become indistinguishable from the existing h6 headings.
124 125 126 127 128 |
# File 'lib/headless_html_editor.rb', line 124

# Push every h1-h5 heading one level down (h1 becomes h2, ..., h5 becomes h6).
#
# h6 elements are not selected and therefore keep their level.
def demote_headings
  @dom.css('h1, h2, h3, h4, h5').each do |heading|
    level = heading.name[1].to_i
    heading.name = "h#{level + 1}"
  end
end
#remove_header_scripts ⇒ Object
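Remove script tags from the header. Note: the implementation removes every script element in the document, not only those inside the head.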
111 112 113 |
# File 'lib/headless_html_editor.rb', line 111

# Remove script tags.
#
# The selector is a bare 'script', so every script element found in @dom is
# removed, wherever it sits in the document.
def remove_header_scripts
  scripts = @dom.css('script')
  scripts.remove
end
#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ Object
Cleanup after MS Word.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/headless_html_editor.rb', line 36

# Cleanup after MS Word.
#
# Steps, in order:
# 1. remove the Generator meta tag;
# 2. unwrap named anchors that no link points to;
# 3. move the anchor name of each heading to the heading's id and strip the
#    leading section number from the heading;
# 4. drop the class attribute from nodes carrying one of UNWANTED_CLASSES;
# 5. unwrap the WordSection1..8 containers;
# 6. when options[:rebuild_toc] is truthy, rewrite the MsoToc1..4 paragraphs
#    as a nested ordered list with id "toc";
# 7. remove paragraphs that hold nothing but whitespace / non-breaking
#    spaces and no image, and remove a br that directly follows a table.
#
# Progress messages are written to stdout.
#
# @param options [Hash] only :rebuild_toc is read; the TOC is rebuilt when it is truthy.
def remove_word_artifacts(options = { rebuild_toc: true })
  @dom.css('meta[name="Generator"]').remove
  # Remove abandoned anchors, that are not linked to.
  @dom.css('a[name]').each do |a|
    next unless @dom.css('a[href="#' + a['name'] + '"]').empty?

    puts "<a name=\"#{a['name']}\"> was removed, because it had no links to it."
    a.replace(a.inner_html)
  end
  # Clean up h1-h6 tags
  headings = @dom.css('h1, h2, h3, h4, h5, h6')
  headings.each do |heading|
    a = heading.at_css('a[name]')
    if a
      heading['id'] = a['name'].sub(/_Toc/, 'Toc')
      a.replace(a.inner_html)
    end
    # Strip the leading section number and the non-breaking spaces after it.
    # A non-breaking space is U+00A0 (C2 A0 is only its UTF-8 byte sequence);
    # the &nbsp; alternative covers the case where the serialized markup
    # carries the entity instead of the raw character.
    heading.inner_html = heading.inner_html.sub(/\A(\s*\d+\.?)+(?:\u00A0|&nbsp;)*/, '').strip
  end
  # Remove Words "normal" classes.
  UNWANTED_CLASSES.each do |class_name|
    @dom.css(".#{class_name}").each do |node|
      node.remove_attribute('class')
    end
  end
  # Remove unwanted section tags
  @dom.css('.WordSection1, .WordSection2, .WordSection3, .WordSection4, .WordSection5, .WordSection6, .WordSection7, .WordSection8').each do |section|
    puts "Removing #{section.name}.#{section['class']}"
    section.replace(section.inner_html)
  end
  if options[:rebuild_toc]
    # Remove page numbers from TOC
    @dom.css('.MsoToc1 a, .MsoToc2 a, .MsoToc3 a, .MsoToc4 a').each do |item|
      item.inner_html = item.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end
    # Rewrite Toc as ordered list.
    toc_item = @dom.at_css('.MsoToc1')
    previous_toc_level = 0
    new_toc = []
    while toc_item
      toc_item.inner_html = toc_item.inner_html.sub(/\n/, ' ')
      class_attr = toc_item.attr('class')
      # The level is the character at index 6 of the class value ("MsoToc2" -> 2).
      current_toc_level = class_attr[6].to_i
      # NOTE(review): a drop of more than one level closes only one list, and
      # the final list is never closed explicitly - confirm this is intended.
      new_toc << "</li>\n" if previous_toc_level == current_toc_level
      new_toc << "</ol>\n</li>\n" if previous_toc_level > current_toc_level
      new_toc << "\n<ol#{' id="toc"' if previous_toc_level == 0}>\n" if previous_toc_level < current_toc_level
      link = toc_item.at_css('a')
      if link.nil?
        puts toc_item.to_s
      else
        link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
        new_toc << "<li>#{toc_item.inner_html.sub(/#_Toc/, '#Toc')}"
      end
      previous_toc_level = current_toc_level
      # Advance to the next sibling element, skipping anything that reports text?.
      loop do
        toc_item = toc_item.next_element
        break unless toc_item && toc_item.text?
      end
      # Stop at the first sibling that is not a TOC paragraph.
      toc_item = nil unless toc_item && toc_item.attr('class') && toc_item.attr('class').start_with?('MsoToc')
    end
    @dom.at_css('.MsoToc1').replace(new_toc.join('')) if @dom.at_css('.MsoToc1')
    # Remove old Table of Contents
    @dom.css('.MsoToc1, .MsoToc2, .MsoToc3, .MsoToc4').each { |item| item.remove }
  end
  # Remove empty paragraphs
  @dom.css('p').each do |p|
    # String#strip leaves U+00A0 in place, so non-breaking spaces are deleted first.
    if p.content.gsub("\u00A0", '').strip.size == 0 && !p.at_css('img')
      puts 'Removing empty paragraph.'
      p.remove
    end
  end
  @dom.css('table + br').remove
  # /<!--\[if[.\n\r]+\[endif\]\s*-->/
end
#save!(output_encoding = 'utf-8') ⇒ Object
Save the file with the same file name.
131 132 133 |
# File 'lib/headless_html_editor.rb', line 131

# Save the file with the same file name.
#
# Delegates to #save_as! with the name that was passed to the constructor.
#
# @param output_encoding [String] encoding handed on to #save_as!.
def save!(output_encoding = 'utf-8')
  target = @input_file_name
  save_as!(target, output_encoding)
end
#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ Object
Save file with a new file name.
136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/headless_html_editor.rb', line 136

# Save file with a new file name.
#
# The target is written when it is writable or does not exist yet; otherwise
# 'Failed: Read only!' goes to stderr. Any StandardError raised on the way is
# reported on stderr instead of being propagated.
#
# @param output_file_name [String] path of the file to write.
# @param output_encoding [String] encoding used for serializing and writing.
def save_as!(output_file_name, output_encoding = 'utf-8')
  puts "W: #{output_file_name}"
  begin
    # File.exist? is used because File.exists? no longer exists in Ruby 3.2+.
    if File.writable?(output_file_name) || !File.exist?(output_file_name)
      # Serialize before opening: mode "w" truncates the target immediately,
      # so a failing to_html must not be able to wipe an existing file.
      html = @dom.to_html({ encoding: output_encoding, indent: 2 })
      File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |f|
        f.write html
      end
    else
      $stderr.puts 'Failed: Read only!'
    end
  rescue StandardError => se
    $stderr.puts "\nFailed!\n#{se.message}"
  end
end