Class: Coradoc::Input::Html::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::Input::Html::Cleaner
- Defined in:
- lib/coradoc/input/html/cleaner.rb
Instance Method Summary collapse
-
#clean_headings(string) ⇒ Object
following added by me.
- #clean_punctuation_characters(string) ⇒ Object
-
#clean_tag_borders(string) ⇒ Object
Find content that is enclosed by two or more tildes, or by square brackets, and clean the whitespace at its borders.
-
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it.
- #remove_block_leading_newlines(string) ⇒ Object
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #remove_section_attribute_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
following added by me
110 111 112 113 114 115 116 117 |
# File 'lib/coradoc/input/html/cleaner.rb', line 110
# Cleans up heading markup that LibreOffice emits (per the original author's
# notes, for unknown reasons).
#
# The argument is modified in place and also returned.
#
# @param string [String] HTML text to clean (mutated)
# @return [String] the same String object after cleaning
def clean_headings(string)
  # Headings with no content at all, e.g. <h1 class="x"></h1>.
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  # Headings carrying a superscript style; LibreOffice renders superscripts this way.
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # Empty headings collapse to a single space.
  string.gsub!(empty_heading, " ")
  # Superscript-styled headings become <sup> elements holding the heading's content.
  string.gsub!(superscript_heading, "<sup>\\2</sup>")
  string
end
#clean_punctuation_characters(string) ⇒ Object
91 92 93 |
# File 'lib/coradoc/input/html/cleaner.rb', line 91
# Removes the single whitespace character that sits between a closing
# `**`, `~~` or `__` marker and a following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] text to clean (not modified)
# @return [String] a new String with the whitespace removed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/
  string.gsub(marker_then_punctuation) do
    # Re-join the marker and the punctuation without the whitespace between them.
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}"
  end
end
#clean_tag_borders(string) ⇒ Object
Find content that is enclosed by two or more tildes, or by square brackets. Ensure that only one whitespace occurs in the border area. (The equivalent handling for asterisks and underscores is currently commented out in the method body.)
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/coradoc/input/html/cleaner.rb', line 64
# Tidies whitespace just inside the delimiters of tilde-enclosed and
# bracket-enclosed spans.
#
# NOTE(review): the outer whitespace handling is delegated to
# preserve_border_whitespaces, which is defined elsewhere; its exact
# behaviour is not visible here.
#
# @param string [String] text to clean (not modified)
# @return [String] a new String with cleaned span borders
def clean_tag_borders(string)
  # Disabled handling for asterisk-enclosed spans, kept for reference:
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("** ", "**").sub(" **", "**")
  #   end
  # end

  # Disabled handling for underscore-enclosed spans, kept for reference:
  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  # Spans wrapped in two or more tildes: drop the first space found directly
  # after an opening "~~" and directly before a closing "~~".
  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    preserve_border_whitespaces(span, default_border: Coradoc::Input::Html.config.tag_border) do
      span.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  # Spans wrapped in square brackets get the same treatment, without a
  # default border being passed.
  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) do
      span.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it
96 97 98 |
# File 'lib/coradoc/input/html/cleaner.rb', line 96
# Preprocesses HTML before conversion (rather than postprocessing the output):
# whitespace is scrubbed first, then headings are cleaned.
#
# @param string [String] raw HTML; a duplicate is processed, so the caller's
#   object is not the one handed to the mutating helpers
# @return [String] the cleaned HTML
def preprocess_word_html(string)
  working_copy = string.dup
  whitespace_scrubbed = scrub_whitespace(working_copy)
  clean_headings(whitespace_scrubbed)
end
#remove_block_leading_newlines(string) ⇒ Object
29 30 31 |
# File 'lib/coradoc/input/html/cleaner.rb', line 29
# Drops the blank line that directly follows a `****` delimiter line when
# that delimiter comes right after a line ending in `]`.
#
# @param string [String] text to clean (not modified)
# @return [String] a new String without those blank lines
def remove_block_leading_newlines(string)
  delimiter_with_blank_line = "]\n****\n\n"
  delimiter_only = "]\n****\n"
  string.gsub(delimiter_with_blank_line, delimiter_only)
end
#remove_inner_whitespaces(string) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/coradoc/input/html/cleaner.rb', line 45
# Normalises whitespace around `stem:[...]` expressions and collapses runs of
# spaces/tabs inside every line.
#
# The `stem:` fix-ups are applied to the argument in place; the per-line
# result is accumulated into a new String.
#
# NOTE(review): border whitespace of each line is handled by
# preserve_border_whitespaces, which is defined elsewhere.
#
# @param string [String, nil] text to clean (mutated by the stem fix-ups)
# @return [String] the cleaned text; an empty String when +string+ is nil
def remove_inner_whitespaces(string)
  # Previously only the stem fix-ups were nil-guarded, so a nil argument still
  # reached each_line and raised NoMethodError. Return early instead.
  return +"" if string.nil?

  # A stem expression indented by one space at the start of a line loses that space.
  string.gsub!(/\n stem:\[/, "\nstem:[")
  # A newline directly after a stem expression, followed by non-whitespace, becomes a space.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # Whitespace between a stem expression and a following "^" or "-" is removed.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  string.each_line.each_with_object(+"") do |line, result|
    # Inside each line, runs of two or more spaces/tabs collapse to one space.
    cleaned = preserve_border_whitespaces(line) { line.strip.gsub(/[ \t]{2,}/, " ") }
    result << cleaned
  end
end
#remove_leading_newlines(string) ⇒ Object
41 42 43 |
# File 'lib/coradoc/input/html/cleaner.rb', line 41
# Strips every newline found at the very beginning of the text.
#
# @param string [String] text to clean (not modified)
# @return [String] a new String that does not start with a newline
def remove_leading_newlines(string)
  # \A can only match once, at the start of the string, so a single
  # substitution covers the whole leading run.
  string.sub(/\A\n+/, "")
end
#remove_newlines(string) ⇒ Object
37 38 39 |
# File 'lib/coradoc/input/html/cleaner.rb', line 37
# Collapses every run of three or more consecutive newlines into exactly two.
#
# @param string [String] text to clean (not modified)
# @return [String] a new String with at most two consecutive newlines
def remove_newlines(string)
  excess_newlines = /\n{3,}/
  string.gsub(excess_newlines) { "\n\n" }
end
#remove_section_attribute_newlines(string) ⇒ Object
33 34 35 |
# File 'lib/coradoc/input/html/cleaner.rb', line 33
# Removes the blank line between a line ending in `]` and a following line
# that starts with `==`.
#
# @param string [String] text to clean (not modified)
# @return [String] a new String with those blank lines removed
def remove_section_attribute_newlines(string)
  bracket_blank_line_heading = "]\n\n=="
  bracket_heading = "]\n=="
  string.gsub(bracket_blank_line_heading, bracket_heading)
end
#scrub_whitespace(string) ⇒ Object
100 101 102 103 104 105 106 107 |
# File 'lib/coradoc/input/html/cleaner.rb', line 100
# Normalises whitespace in raw HTML text.
#
# The first substitution is applied to the argument in place; the value
# returned is whatever Coradoc.strip_unicode hands back, after the two
# further substitutions.
#
# @param string [String] HTML text to scrub (mutated by the first substitution)
# @return [String] the scrubbed text
def scrub_whitespace(string)
  # HTML encoded spaces. The entity alternatives had been reduced to bare
  # whitespace characters (presumably decoded during rendering), which left
  # the case-insensitive flag with nothing to act on; the entity spellings
  # are restored alongside the literal U+00A0 character.
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, " ")
  string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
  string.gsub!(/( +)$/, " ") # line trailing whitespace: each trailing run of spaces becomes one space
  string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
  string
end
#tidy(string) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/coradoc/input/html/cleaner.rb', line 5
# Runs the full cleaning pipeline over converted text.
#
# A Hash argument is handled recursively: every value is tidied and a new
# Hash with the same keys is returned. Anything else is copied into a new
# String and passed through each cleaning step in order; the first five
# steps are wrapped in HtmlConverter.track_time with a descriptive label.
#
# NOTE(review): this relies on HtmlConverter.track_time returning the value
# of its block — confirm against its definition.
#
# @param string [String, Hash] text to tidy, or a Hash whose values are tidied
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  # Work on a fresh String so the in-place edits below never touch the caller's object.
  text = HtmlConverter.track_time("Removing inner whitespace") do
    remove_inner_whitespaces(String.new(string))
  end
  text = HtmlConverter.track_time("Removing newlines") { remove_newlines(text) }
  text = HtmlConverter.track_time("Removing leading newlines") { remove_leading_newlines(text) }
  text = HtmlConverter.track_time("Cleaning tag borders") { clean_tag_borders(text) }
  text = HtmlConverter.track_time("Cleaning punctuation characters") do
    clean_punctuation_characters(text)
  end

  # The last two steps are not timed.
  text = remove_block_leading_newlines(text)
  remove_section_attribute_newlines(text)
end