Class: ReverseAdoc::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/reverse_adoc/cleaner.rb

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

following added by me



82
83
84
85
86
87
88
89
# File 'lib/reverse_adoc/cleaner.rb', line 82

# Cleans up spurious heading elements in an HTML string, in place.
#
# Two fix-ups are applied to +string+ itself (it is mutated):
# 1. Headings with no content at all are replaced by a single space.
# 2. Headings styled with "vertical-align: super;" are rewritten as <sup>
#    elements that keep the heading's content.
#
# Per the original author's notes, both constructs were observed in
# LibreOffice output.
#
# @param string [String] mutable HTML text
# @return [String] the same (mutated) string object
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # Empty headings carry no information; a space keeps neighbours separated.
  string.gsub!(empty_heading, " ")
  # Capture group 2 is the heading's inner content.
  string.gsub!(superscript_heading, '<sup>\2</sup>')
  string
end

#clean_punctuation_characters(string) ⇒ Object



62
63
64
# File 'lib/reverse_adoc/cleaner.rb', line 62

# Removes the single whitespace character that separates an emphasis marker
# (**, ~~ or __) from an immediately following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] text to clean
# @return [String] a new string with the separating whitespace removed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/
  # Re-emit marker and punctuation back to back, dropping the whitespace.
  string.gsub(marker_then_punctuation, '\1\2')
end

#clean_tag_borders(string) ⇒ Object

Find non-asterisk content that is enclosed by two or more asterisks. Ensure that only one whitespace character occurs in the border area. The same is done for underscores, tildes, and square brackets.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/reverse_adoc/cleaner.rb', line 36

# Tidies whitespace just inside paired markers.
#
# For each of the markers **, __ and ~~ (in that order), every span enclosed
# by two or more marker characters is stripped, and the first "marker + space"
# and first "space + marker" occurrences are collapsed to the bare marker.
# Square-bracketed spans then get the same treatment for "[ " and " ]".
# Every match is routed through +preserve_border_whitespaces+; the marker
# passes also hand it +ReverseAdoc.config.tag_border+ as +default_border+.
#
# @param string [String] text to clean
# @return [String] a new string with cleaned tag borders
def clean_tag_borders(string)
  marker_patterns = {
    "**" => /\s?\*{2,}.*?\*{2,}\s?/,
    "__" => /\s?_{2,}.*?_{2,}\s?/,
    "~~" => /\s?~{2,}.*?~{2,}\s?/,
  }

  cleaned = marker_patterns.reduce(string) do |text, (marker, pattern)|
    text.gsub(pattern) do |match|
      preserve_border_whitespaces(match, default_border: ReverseAdoc.config.tag_border) do
        match.strip.sub("#{marker} ", marker).sub(" #{marker}", marker)
      end
    end
  end

  # Brackets use the helper's own default border.
  cleaned.gsub(/\s?\[.*?\]\s?/) do |match|
    preserve_border_whitespaces(match) do
      match.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end

#preprocess_word_html(string) ⇒ Object

Preprocesses the HTML, rather than postprocessing it.



67
68
69
# File 'lib/reverse_adoc/cleaner.rb', line 67

# Runs the HTML clean-up steps on a copy of the input: whitespace scrubbing
# first, then heading clean-up. The caller's string is left untouched because
# the work is done on a duplicate.
#
# @param string [String] raw HTML
# @return [String] the cleaned copy
def preprocess_word_html(string)
  scrubbed = scrub_whitespace(string.dup)
  clean_headings(scrubbed)
end

#remove_inner_whitespaces(string) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/reverse_adoc/cleaner.rb', line 19

# Normalizes whitespace within each line of +string+.
#
# Whitespace around stem:[...] macros is tightened first; these fix-ups use
# gsub! and therefore modify the caller's string. The text is then rebuilt
# line by line: each line is stripped and runs of two or more spaces/tabs are
# collapsed to one space, with the result passed through
# +preserve_border_whitespaces+ (presumably to restore the line's border
# whitespace; confirm against that helper).
#
# @param string [String, nil] text to clean; nil is treated as empty input
# @return [String] the cleaned text ("" when +string+ is nil)
def remove_inner_whitespaces(string)
  # Previously only the stem fix-ups were guarded against nil, so the
  # unconditional each_line below raised NoMethodError for nil input.
  return "" if string.nil?

  # Drop the single space that precedes a stem macro at the start of a line.
  string.gsub!(/\n stem:\[/, "\nstem:[")
  # A newline directly after a stem macro, followed by non-whitespace,
  # becomes a space.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # Remove whitespace between a stem macro and a following "^" or "-".
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  # Append into one buffer instead of allocating a new string per line.
  string.each_line.each_with_object(+"") do |line, cleaned|
    cleaned_line = preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
    cleaned << cleaned_line
  end
end

#remove_leading_newlines(string) ⇒ Object



15
16
17
# File 'lib/reverse_adoc/cleaner.rb', line 15

# Strips every newline character found at the very start of the text.
#
# @param string [String] text to clean
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  # The pattern is anchored at the start of the string, so it matches at most once.
  string.sub(/\A\n+/, "")
end

#remove_newlines(string) ⇒ Object



11
12
13
# File 'lib/reverse_adoc/cleaner.rb', line 11

# Collapses any run of three or more consecutive newlines into exactly two,
# i.e. at most one blank line between blocks of text.
#
# @param string [String] text to clean
# @return [String] a new string with excess newlines removed
def remove_newlines(string)
  excess_newlines = /\n{3,}/
  string.gsub(excess_newlines, "\n\n")
end

#scrub_whitespace(string) ⇒ Object



71
72
73
74
75
76
77
78
79
# File 'lib/reverse_adoc/cleaner.rb', line 71

# Normalizes whitespace in an HTML document, in place.
#
# The passes below run in order, each mutating +string+ directly.
#
# @param string [String] mutable HTML text
# @return [String] the same (mutated) string object
def scrub_whitespace(string)
  passes = [
    # HTML encoded spaces are unified to one entity spelling.
    [:gsub!, /&nbsp;|&#xA0;|\u00a0/i, "&#xA0;"],
    # Document leading whitespace.
    [:sub!, /^\A[[:space:]]+/m, ""],
    # Document trailing whitespace.
    [:sub!, /[[:space:]]+\z$/m, ""],
    # Spaces at the end of a line shrink to a single space.
    [:gsub!, /( +)$/, " "],
    # Quadruple line breaks become double line breaks.
    [:gsub!, /\n\n\n\n/, "\n\n"],
  ]
  # string.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs

  passes.each do |operation, pattern, replacement|
    string.public_send(operation, pattern, replacement)
  end
  string
end

#tidy(string) ⇒ Object



3
4
5
6
7
8
9
# File 'lib/reverse_adoc/cleaner.rb', line 3

# Runs the complete clean-up pipeline over the text. Each step receives the
# previous step's result, in this order: inner whitespace, excess newlines,
# leading newlines, tag borders, punctuation after emphasis markers.
#
# @param string [String] text to tidy
# @return [String] the result of the final pipeline step
def tidy(string)
  pipeline = %i[
    remove_inner_whitespaces
    remove_newlines
    remove_leading_newlines
    clean_tag_borders
    clean_punctuation_characters
  ]
  pipeline.reduce(string) { |text, step| send(step, text) }
end