Class: Coradoc::Input::Html::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/input/html/cleaner.rb

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

following added by me



110
111
112
113
114
115
116
117
# File 'lib/coradoc/input/html/cleaner.rb', line 110

# Cleans up heading markup in +string+, modifying the string in place.
#
# Two substitutions run in order:
# 1. An empty heading element (<hN ...></hN>, N in 1..9) is replaced by a
#    single space.
# 2. A heading whose attributes contain ` style="vertical-align: super;` is
#    replaced by a <sup> element wrapping the heading's content.
#
# @param string [String] HTML text; mutated in place
# @return [String] the same string object that was passed in
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # LibreOffice inserts empty headings for no known reason; they need to go.
  string.gsub!(empty_heading, " ")

  # LibreOffice also renders superscripts as headings; turn them back into <sup>.
  string.gsub!(superscript_heading) { "<sup>#{Regexp.last_match(2)}</sup>" }

  string
end

#clean_punctuation_characters(string) ⇒ Object



91
92
93
# File 'lib/coradoc/input/html/cleaner.rb', line 91

# Removes the single whitespace character that separates a formatting marker
# (`**`, `~~` or `__`) from a directly following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] text to clean; not modified
# @return [String] a new string with marker and punctuation joined
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/

  string.gsub(marker_then_punctuation) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}"
  end
end

#clean_tag_borders(string) ⇒ Object

Find content that is enclosed by two or more tildes (`~~`) and ensure that only one whitespace occurs in the border area. Same for square brackets. (The equivalent handling for asterisks and underscores is currently commented out in the code.)



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/coradoc/input/html/cleaner.rb', line 64

# Tidies the whitespace just inside tilde-delimited spans (`~~ ... ~~`) and
# square-bracketed spans (`[ ... ]`).
#
# Each matched span (including at most one surrounding whitespace character
# on either side) is stripped, the first "opening marker + space" and the
# first "space + closing marker" are collapsed, and the result is handed back
# through `preserve_border_whitespaces`.
# NOTE(review): `preserve_border_whitespaces` is defined outside this view;
# presumably it restores the outer whitespace around the span — confirm.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the cleaned spans
def clean_tag_borders(string)
  # Disabled passes for asterisk and underscore borders, kept for reference:
  #
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  # preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #   match.strip.sub("** ", "**").sub(" **", "**")
  # end
  # end

  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  # Pass 1: tilde-delimited spans, using the configured tag border as default.
  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    border = Coradoc::Input::Html.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      span.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  # Pass 2: square-bracketed spans, with no explicit default border.
  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub("[ ", "[").sub(" ]", "]") }
  end
end

#preprocess_word_html(string) ⇒ Object

preprocesses HTML, rather than postprocessing it



96
97
98
# File 'lib/coradoc/input/html/cleaner.rb', line 96

# Preprocesses HTML input: works on a duplicate of +string+, passing it
# through `scrub_whitespace` and then `clean_headings`.
#
# @param string [String] HTML text; the caller's object is left untouched
#   because all work happens on a `dup`
# @return [String] the cleaned HTML
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_block_leading_newlines(string) ⇒ Object



29
30
31
# File 'lib/coradoc/input/html/cleaner.rb', line 29

# Removes the blank line that directly follows a `****` delimiter line when
# that delimiter itself follows a line ending in `]`.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the extra newline removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"

  string.gsub(padded_opening, tight_opening)
end

#remove_inner_whitespaces(string) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/coradoc/input/html/cleaner.rb', line 45

# Normalises whitespace around `stem:[...]` macros and collapses runs of
# spaces/tabs inside every line.
#
# The `stem:[...]` fix-ups mutate +string+ in place. The per-line clean-up
# then builds a fresh result string.
#
# A nil input returns an empty string. Previously the nil check only guarded
# the `gsub!` calls, so the later `each_line` call still raised
# NoMethodError on nil.
#
# @param string [String, nil] text to clean; mutated in place when not nil
# @return [String] the cleaned text ("" for nil input)
def remove_inner_whitespaces(string)
  return +"" if string.nil?

  # Drop the single space that precedes `stem:[` at the start of a line.
  string.gsub!(/\n stem:\[/, "\nstem:[")
  # A stem macro followed by a newline and then a non-whitespace character:
  # replace that newline with a space.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # Remove whitespace between a stem macro and a following `^` or `-`.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  # Strip each line and squeeze inner runs of spaces/tabs to a single space.
  # NOTE(review): `preserve_border_whitespaces` is defined outside this view;
  # presumably it re-attaches the line's outer whitespace — confirm.
  string.each_line.with_object(+"") do |line, cleaned|
    cleaned << preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
  end
end

#remove_leading_newlines(string) ⇒ Object



41
42
43
# File 'lib/coradoc/input/html/cleaner.rb', line 41

# Strips every newline character from the very beginning of +string+.
#
# @param string [String] text to clean; not modified
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  leading_newlines = /\A\n+/

  # The pattern is anchored at the start of the string, so it can match at
  # most once; a single substitution is sufficient.
  string.sub(leading_newlines, "")
end

#remove_newlines(string) ⇒ Object



37
38
39
# File 'lib/coradoc/input/html/cleaner.rb', line 37

# Collapses every run of three or more consecutive newlines into exactly two.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the collapsed newline runs
def remove_newlines(string)
  newline_run = /\n{3,}/

  string.gsub(newline_run) { "\n\n" }
end

#remove_section_attribute_newlines(string) ⇒ Object



33
34
35
# File 'lib/coradoc/input/html/cleaner.rb', line 33

# Removes the blank line between a line ending in `]` and a following line
# that starts with `==`.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="

  string.gsub(separated, adjacent)
end

#scrub_whitespace(string) ⇒ Object



100
101
102
103
104
105
106
107
# File 'lib/coradoc/input/html/cleaner.rb', line 100

# Normalises whitespace in HTML input.
#
# Steps, in order:
# 1. Rewrite `&nbsp;`, `&#xA0;` (matched case-insensitively) and the literal
#    U+00A0 character to `&#xA0;`, mutating +string+ in place.
# 2. Pass the text through `Coradoc.strip_unicode`.
#    NOTE(review): that helper is defined outside this view; per the original
#    comment it strips document-level leading and trailing whitespace.
# 3. Replace a run of spaces at the end of a line with a single space.
# 4. Replace four consecutive newlines with two.
#
# @param string [String] HTML text; mutated in place by step 1
# @return [String] the value returned by `Coradoc.strip_unicode`, after
#   steps 3 and 4 have been applied to it in place
def scrub_whitespace(string)
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;")

  Coradoc.strip_unicode(string).tap do |text|
    text.gsub!(/( +)$/, " ")
    text.gsub!(/\n\n\n\n/, "\n\n")
    # string.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs
  end
end

#tidy(string) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/coradoc/input/html/cleaner.rb', line 5

# Runs the full clean-up pipeline over converted text.
#
# A Hash argument is handled recursively: every value is tidied and a new
# Hash with the same keys is returned. Any other argument is copied with
# `String.new` and passed through the clean-up steps below, the first five of
# which are wrapped in `HtmlConverter.track_time` with a descriptive label.
# NOTE(review): `HtmlConverter.track_time` is defined outside this view;
# the code relies on it returning the value of its block — confirm.
#
# @param string [String, Hash] text to tidy, or a Hash whose values are tidied
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  text = HtmlConverter.track_time("Removing inner whitespace") do
    remove_inner_whitespaces(String.new(string))
  end
  text = HtmlConverter.track_time("Removing newlines") { remove_newlines(text) }
  text = HtmlConverter.track_time("Removing leading newlines") { remove_leading_newlines(text) }
  text = HtmlConverter.track_time("Cleaning tag borders") { clean_tag_borders(text) }
  text = HtmlConverter.track_time("Cleaning punctuation characters") do
    clean_punctuation_characters(text)
  end

  # The last two steps are not timed.
  text = remove_block_leading_newlines(text)
  remove_section_attribute_newlines(text)
end