Class: HeadlessHtmlEditor

Inherits:

Object

Object
HeadlessHtmlEditor

show all

Defined in: lib/headless_html_editor.rb

Overview

Headless HTML Editor. Edit HTML files programmatically.

Constant Summary collapse

# CSS class names written by MS Word that carry no meaning in the cleaned-up
# HTML. Frozen so the shared list cannot be modified by accident at runtime.
UNWANTED_CLASSES = %w[
  MsoNormal MsoBodyText NormalBold MsoTitle MsoHeader Templatehelp
  TOCEntry Indent1 MsoCaption MsoListParagraph
  MsoNormalTable MsoTableGrid MsoTableClassic1
  MsoListParagraphCxSpFirst MsoListParagraphCxSpMiddle MsoListParagraphCxSpLast
  MsoCommentText msocomtxt msocomoff MsoEndnoteText MsoFootnoteText
].freeze

Instance Attribute Summary collapse

#dom ⇒ Object readonly

Returns the value of attribute dom.

Class Method Summary collapse

.bulk_edit(file_list_file_name, &block) ⇒ Object

Edit files listed in a text file.
.edit_folder(folder, &block) ⇒ Object

Edit all HTML files in a folder.

Instance Method Summary collapse

#accept_word_changes_tracked ⇒ Object

Change tracking in MS Word adds a lot of ins and del tags.
#demote_headings ⇒ Object

Change h1 to h2 and so on.
#initialize(input_file_name, input_encoding = 'utf-8') ⇒ HeadlessHtmlEditor constructor

Create a new Headless HTML Editor.
#remove_header_scripts ⇒ Object

Remove script tags from the header.
#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ Object

Cleanup after MS Word.
#save!(output_encoding = 'utf-8') ⇒ Object

Save the file with the same file name.
#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ Object

Save file with a new file name.

Constructor Details

#initialize(input_file_name, input_encoding = 'utf-8') ⇒ `HeadlessHtmlEditor`

Create a new Headless HTML Editor.

# File 'lib/headless_html_editor.rb', line 18

# Create a new Headless HTML Editor.
#
# The file is parsed only when it is a regular file whose name matches the
# glob '**.html'; otherwise @dom is left unset (nil).
#
# @param input_file_name [String] path of the HTML file to edit
# @param input_encoding [String] encoding used when reading the file
def initialize(input_file_name, input_encoding = 'utf-8')
  @input_file_name = input_file_name
  return unless File.file?(input_file_name) && File.fnmatch?('**.html', input_file_name)

  # read html file
  puts "R: #{input_file_name}"
  # File.open with a block closes the handle as soon as parsing is done, and,
  # unlike Kernel#open, never interprets a name starting with "|" as a command.
  @dom = File.open(input_file_name, "r:#{input_encoding}", universal_newline: false) do |file|
    Nokogiri::HTML(file)
  end
end

Instance Attribute Details

#dom ⇒ `Object` (readonly)

Returns the value of attribute dom.



15
16
17

# File 'lib/headless_html_editor.rb', line 15

# Reader for @dom, the value the constructor assigns from Nokogiri::HTML.
# It is nil when the constructor did not parse a file.
def dom
  @dom
end

Class Method Details

.bulk_edit(file_list_file_name, &block) ⇒ `Object`

Edit files listed in a text file. File names are absolute. If the first character on a line is # the line is ignored.

# File 'lib/headless_html_editor.rb', line 169

# Edit files listed in a text file, one file name per line.
#
# Lines whose first character is '#' are skipped, as are entries that do not
# name an existing file. Each editor that has a DOM is yielded to the block
# and then saved in place.
#
# @param file_list_file_name [String] path of the text file holding the list
# @yield [editor] the editor created for one listed file
# @return [Array<String>] the lines read from the list file
def self.bulk_edit(file_list_file_name, &block)
  list_path = File.expand_path(file_list_file_name)
  File.readlines(list_path).each do |entry|
    next if entry.start_with?('#')

    # In-place strip removes the trailing newline characters left by readlines.
    entry.strip!
    next unless File.file?(entry)

    editor = new(entry)
    if editor.dom.nil?
      puts "No DOM found in #{entry}."
      next
    end

    yield editor
    editor.save!
  end
end

.edit_folder(folder, &block) ⇒ `Object`

Edit all HTML files in a folder.

# File 'lib/headless_html_editor.rb', line 152

# Edit the files found directly inside a folder.
#
# Backslashes in the folder name are converted to forward slashes. Every
# regular file in the folder gets an editor; editors without a DOM are
# skipped, the others are yielded to the block and then saved in place.
#
# @param folder [String] path of the folder to process
# @yield [editor] the editor created for one file
# @return [Dir] the directory object that was iterated
def self.edit_folder(folder, &block)
  Dir.open(folder.gsub('\\', '/')) do |dir|
    dir.each do |entry|
      path = File.join(dir.path, entry)
      next unless File.file?(path)

      editor = new(path)
      next if editor.dom.nil?

      yield editor
      editor.save!
    end
  end
end

Instance Method Details

#accept_word_changes_tracked ⇒ `Object`

Change tracking in MS Word adds a lot of ins and del tags. Accepting the changes deletes each del element together with its content and replaces each ins element with its content.

# File 'lib/headless_html_editor.rb', line 116

# Accept the changes tracked by MS Word.
#
# Every del element is removed together with its content, and every ins
# element is replaced by its own child nodes.
def accept_word_changes_tracked
  @dom.css('del').remove
  @dom.css('ins').each do |ins|
    # Move the existing child nodes instead of re-parsing inner_html: a nested
    # ins element then stays the same node that this loop visits later, so it
    # is unwrapped too instead of surviving as a freshly parsed copy.
    ins.replace(ins.children)
  end
end

#demote_headings ⇒ `Object`

Change h1 to h2 and so on. h6 is not changed, so it's a potential mess: demoted h5 headings end up at the same level as the existing h6 headings.

# File 'lib/headless_html_editor.rb', line 124

# Push every heading from h1 to h5 one level down (h1 becomes h2, and so on).
# h6 elements are not selected, so they keep their level.
def demote_headings
  @dom.css('h1, h2, h3, h4, h5').each do |node|
    # The second character of the tag name is the heading level digit.
    level = node.name[1].to_i
    node.name = "h#{level + 1}"
  end
end

#remove_header_scripts ⇒ `Object`

Remove script tags. The selector used matches every script element in the document, not only those in the header.



111
112
113

# File 'lib/headless_html_editor.rb', line 111

# Remove script tags.
#
# The selector is the bare 'script', so every script element in the document
# is removed, not only those inside the head.
def remove_header_scripts
  script_nodes = @dom.css('script')
  script_nodes.remove
end

#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ `Object`

Cleanup after MS Word.

# File 'lib/headless_html_editor.rb', line 36

# Cleanup after MS Word.
#
# Removes the Generator meta tag, named anchors nothing links to, the classes
# listed in UNWANTED_CLASSES, WordSection wrappers, empty paragraphs and a br
# that directly follows a table. Headings get their id from the anchor they
# contain and lose their leading numbering.
#
# @param options [Hash] when :rebuild_toc is truthy, the MsoToc1..MsoToc4
#   entries are rewritten as a nested ordered list whose outer list has the
#   id "toc"
# @return [Object] the result of the final remove call
def remove_word_artifacts(options = { rebuild_toc: true })
  @dom.css('meta[name="Generator"]').remove
  # Remove abandoned anchors, that are not linked to.
  @dom.css('a[name]').each do |a|
    next unless @dom.css(%(a[href="##{a['name']}"])).empty?

    puts "<a name=\"#{a['name']}\"> was removed, because it had no links to it."
    a.replace(a.inner_html)
  end
  # Clean up h1-h6 tags
  @dom.css('h1, h2, h3, h4, h5, h6').each do |heading|
    a = heading.at_css('a[name]')
    if a
      heading['id'] = a['name'].sub(/_Toc/, 'Toc')
      a.replace(a.inner_html)
    end
    # Strip the leading numbering ("1.2. ") and the non-breaking spaces after
    # it. The non-breaking space is U+00A0; the serialized markup may spell it
    # as an entity instead of the raw character, so both forms are accepted.
    heading.inner_html = heading.inner_html.sub(/\A(\s*\d+\.?)+(?:\u00A0|&nbsp;|&#160;)*/, '').strip
  end
  # Remove Words "normal" classes.
  UNWANTED_CLASSES.each do |class_name|
    @dom.css(".#{class_name}").each { |node| node.remove_attribute('class') }
  end
  # Remove unwanted section tags
  @dom.css('.WordSection1, .WordSection2, .WordSection3, .WordSection4, .WordSection5, .WordSection6, .WordSection7, .WordSection8').each do |section|
    puts "Removing #{section.name}.#{section['class']}"
    section.replace(section.inner_html)
  end
  if options[:rebuild_toc]
    # Remove page numbers from TOC
    @dom.css('.MsoToc1 a, .MsoToc2 a, .MsoToc3 a, .MsoToc4 a').each do |item|
      item.inner_html = item.inner_text.sub(/\A(\d+\.)+/, '').sub(/(\s+\d+)\Z/, '').strip
    end
    # Rewrite Toc as ordered list.
    # NOTE(review): exactly one list is opened or closed per level change and
    # the lists still open at the end are not closed explicitly, so the result
    # relies on the HTML parser to balance the markup.
    toc_item = @dom.at_css('.MsoToc1')
    previous_toc_level = 0
    new_toc = []
    while toc_item
      toc_item.inner_html = toc_item.inner_html.sub(/\n/, ' ')
      # The TOC level is the digit that follows "MsoToc" in the class name.
      current_toc_level = toc_item.attr('class')[6].to_i
      new_toc << "</li>\n" if previous_toc_level == current_toc_level
      new_toc << "</ol>\n</li>\n" if previous_toc_level > current_toc_level
      new_toc << "\n<ol#{' id="toc"' if previous_toc_level == 0}>\n" if previous_toc_level < current_toc_level
      link = toc_item.at_css('a')
      if link.nil?
        puts toc_item.to_s
      else
        link.inner_html = link.inner_html.sub(/\A(\s*\d+)/, '').strip
        new_toc << "<li>#{toc_item.inner_html.sub(/#_Toc/, '#Toc')}"
      end
      previous_toc_level = current_toc_level
      # next_element only ever returns element nodes, so no text nodes need
      # to be skipped here.
      toc_item = toc_item.next_element
      toc_item = nil unless toc_item && toc_item.attr('class') && toc_item.attr('class').start_with?('MsoToc')
    end
    first_toc_entry = @dom.at_css('.MsoToc1')
    first_toc_entry.replace(new_toc.join('')) if first_toc_entry
    # Remove old Table of Contents
    @dom.css('.MsoToc1, .MsoToc2, .MsoToc3, .MsoToc4').each { |item| item.remove }
  end
  # Remove empty paragraphs. Non-breaking spaces (U+00A0) are deleted first
  # because String#strip does not treat them as whitespace.
  @dom.css('p').each do |paragraph|
    if paragraph.content.delete("\u00A0").strip.empty? && !paragraph.at_css('img')
      puts 'Removing empty paragraph.'
      paragraph.remove
    end
  end
  #  /<!--\[if[.\n\r]+\[endif\]\s*-->/
  @dom.css('table + br').remove
end

#save!(output_encoding = 'utf-8') ⇒ `Object`

Save the file with the same file name.



131
132
133

# File 'lib/headless_html_editor.rb', line 131

# Save the file with the same file name it was read from.
#
# Delegates to save_as! with the file name given to the constructor, so the
# input file is overwritten.
#
# @param output_encoding [String] encoding passed on to save_as!
def save!(output_encoding = 'utf-8')
  save_as!(@input_file_name, output_encoding)
end

#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ `Object`

Save file with a new file name.

# File 'lib/headless_html_editor.rb', line 136

# Save file with a new file name.
#
# The DOM is serialized with the requested encoding and an indent of 2. The
# file is written when it is writable or does not exist yet; otherwise a
# "Read only" message is printed to stderr. Errors raised while writing are
# reported on stderr instead of being propagated.
#
# @param output_file_name [String] path of the file to write
# @param output_encoding [String] encoding of the written file
def save_as!(output_file_name, output_encoding = 'utf-8')
  puts "W: #{output_file_name}"
  # File.exist? is used because File.exists? was removed in Ruby 3.2; with the
  # old name, saving to a not yet existing file raised NoMethodError, which
  # the rescue below turned into a silent failure.
  if File.writable?(output_file_name) || !File.exist?(output_file_name)
    File.open(output_file_name, "w:#{output_encoding}", universal_newline: false) do |f|
      f.write @dom.to_html({ encoding: output_encoding, indent: 2 })
    end
  else
    $stderr.puts 'Failed: Read only!'
  end
rescue StandardError => e
  $stderr.puts "\nFailed!\n#{e.message}"
end

Class: HeadlessHtmlEditor

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input_file_name, input_encoding = 'utf-8') ⇒ HeadlessHtmlEditor

Instance Attribute Details

#dom ⇒ Object (readonly)

Class Method Details

.bulk_edit(file_list_file_name, &block) ⇒ Object

.edit_folder(folder, &block) ⇒ Object

Instance Method Details

#accept_word_changes_tracked ⇒ Object

#demote_headings ⇒ Object

#remove_header_scripts ⇒ Object

#remove_word_artifacts(options = { rebuild_toc: true }) ⇒ Object

#save!(output_encoding = 'utf-8') ⇒ Object

#save_as!(output_file_name, output_encoding = 'utf-8') ⇒ Object