Class: Lcms::Engine::HtmlSanitizer

Inherits:
Object
  • Object
show all
Defined in:
app/services/lcms/engine/html_sanitizer.rb

Overview

rubocop:disable Metrics/ClassLength

Constant Summary collapse

LIST_STYLE_RE =
/\.lst-(\S+)[^{}]+>\s*(?:li:before)\s*{\s*content[^{}]+counter\(lst-ctn-\1,([^)]+)\)/.freeze
CLEAN_ELEMENTS =
%w(a div h1 h2 h3 h4 h5 h6 p table).join(',')
GDOC_REMOVE_EMPTY_SELECTOR =
'.o-ld-activity'
/text-decoration\s*:\s*underline/i.freeze
SKIP_P_CHECK =
%w(ul ol table).freeze
STRIP_ELEMENTS =
%w(a div h1 h2 h3 h4 h5 h6 p span table).freeze

Class Method Summary collapse

Class Method Details

.clean_content(html, context_type) ⇒ Object



16
17
18
19
20
21
22
23
24
# File 'app/services/lcms/engine/html_sanitizer.rb', line 16

# Cleans up HTML that came from a Google Doc.
#
# The context type is stringified and compared with 'gdoc' ignoring case;
# for any other context the HTML is handed back exactly as received.
#
# @param html [String] HTML markup to clean
# @param context_type [#to_s] source type of the content
# @return [String] the cleaned, whitespace-stripped markup for gdoc content,
#   otherwise the original +html+ argument
def clean_content(html, context_type)
  if context_type.to_s.casecmp('gdoc').zero?
    fragment = Nokogiri::HTML.fragment(html)
    # Each helper receives the parsed fragment (or its element list).
    clean_double_margin_elements(fragment)
    clean_empty_elements(fragment.elements)
    clean_dropdowns(fragment)
    fragment.to_html.strip
  else
    html
  end
end

.css_config ⇒ Object

Config to keep list-style-type, because gdoc implements list numbering through content/counter



85
86
87
88
89
90
91
# File 'app/services/lcms/engine/html_sanitizer.rb', line 85

# Stylesheet whitelist: the CSS properties related to list counters and
# list-style-type that should be kept.
#
# @return [Hash] a freshly built hash of the form { css: { properties: [...] } }
def css_config
  kept_properties = %w(content counter-increment counter-reset counter-set list-style-type)
  { css: { properties: kept_properties } }
end

.css_inline_config ⇒ Object

List of attributes we need to keep when all parsing jobs have been completed



96
97
98
99
100
101
102
# File 'app/services/lcms/engine/html_sanitizer.rb', line 96

# Inline-style whitelist: the CSS properties that must survive once all
# parsing jobs have been completed.
#
# @return [Hash] a freshly built hash of the form { css: { properties: [...] } }
def css_inline_config
  kept_properties = %w(font-style font-weight text-decoration)
  { css: { properties: kept_properties } }
end

.default_config ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'app/services/lcms/engine/html_sanitizer.rb', line 104

# Builds the configuration hash that #sanitize passes to Sanitize.fragment.
#
# The hash is assembled on every call and contains, in this order:
# the allowed elements, the per-element attribute whitelist, the allowed
# link protocols, the allowed CSS properties and the transformer callables.
#
# @return [Hash] configuration with :elements, :attributes, :protocols, :css
#   and :transformers keys
def default_config
  allowed_elements = %w(
    table td th tr tbody thead span a p h1 h2 h3 h4 h5 h6 ol ul li div img hr abbr b blockquote br
    cite code dd dfn dl dt em i kbd mark pre q s samp small strike strong sub sup time u var
  )

  allowed_attributes = {
    'a' => %w(href title data-toggle id),
    'div' => %w(data-json-field data-json-type),
    'img' => %w(alt src style drawing_url),
    'ol' => %w(type style start list-style-type),
    'ul' => %w(type style start list-style-type),
    'li' => %w(class),
    'p' => %w(class style),
    'span' => %w(style),
    'sub' => %w(style),
    'sup' => %w(style),
    'td' => %w(colspan rowspan style),
    'th' => %w(colspan rowspan),
    'tr' => %w(style)
  }

  allowed_css_properties = %w(
    background-color border-bottom-width border-left-width border-right-width border-top-width
    border-bottom border-left border-right border-top height font-style font-weight
    list-style-type text-align text-decoration vertical-align width
  )

  # Transformers are handed over as Method objects, so they are invoked via .call()
  # just like lambdas. Order matters: they are listed in execution order.
  # TODO: need to change parsing tags xpath before, it's relying on spans
  #       (remove_spans_wo_attrs is intentionally left out until then)
  transformer_names = %i(
    remove_meanless_styles
    remove_empty_paragraphs
    remove_gdocs_pagebreaks
    remove_gdocs_suggestions
    replace_charts_urls
    replace_supsub
    keep_bullets_level
    replace_table_border_styles
  )

  {
    elements: allowed_elements,
    attributes: allowed_attributes,
    protocols: { 'a' => { 'href' => ['http', 'https', :relative] } },
    css: { properties: allowed_css_properties },
    transformers: transformer_names.map { |name| method(name) }
  }
end

.post_processing(html, options) ⇒ Object



59
60
61
62
63
64
65
66
67
# File 'app/services/lcms/engine/html_sanitizer.rb', line 59

# Runs the post-processing step over already generated HTML.
#
# The options are remembered in @options, the HTML is parsed into a fragment,
# post_processing_hr is applied when options[:material] is truthy, and the
# fragment is then dispatched to the gdoc or the default post-processor
# depending on options[:context_type] (compared with 'gdoc' ignoring case).
#
# @param html [String] HTML markup to process
# @param options [Hash] reads :material and :context_type
# @return whatever post_processing_gdoc / post_processing_default returns
def post_processing(html, options)
  @options = options
  fragment = Nokogiri::HTML.fragment(html)
  post_processing_hr(fragment) if options[:material]

  if options[:context_type].to_s.casecmp('gdoc').zero?
    post_processing_gdoc(fragment)
  else
    post_processing_default(fragment)
  end
end

.process_list_styles(html) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
# File 'app/services/lcms/engine/html_sanitizer.rb', line 69

# Copies list numbering styles that are expressed through CSS counters in the
# document's <style> nodes onto the matching +ol+ elements, as an inline
# +list-style-type+ declaration.
#
# Every node returned by the +//style+ XPath is scanned with LIST_STYLE_RE.
# Each match yields the list class suffix and the counter type; all
# +ol.lst-<suffix>+ elements then get "list-style-type: <counter type>"
# appended to their +style+ attribute.
#
# @param html [#xpath, #css] parsed document (presumably a Nokogiri document —
#   confirm against callers)
# @return the same +html+ object, with the matching elements mutated in place
def process_list_styles(html)
  html.xpath('//style').each do |stylesheet|
    stylesheet.text.scan(LIST_STYLE_RE) do |list_id, counter_type|
      declaration = "list-style-type: #{counter_type}"
      html.css("ol.lst-#{list_id}").each do |element|
        # A missing/blank style must not leave a leading ';', and a style that
        # already ends with ';' must not produce ';;'.
        existing = element['style'].to_s.sub(/;\s*\z/, '')
        element['style'] = existing.strip.empty? ? declaration : "#{existing};#{declaration}"
      end
    end
  end
  html
end

.sanitize(html) ⇒ Object



26
27
28
# File 'app/services/lcms/engine/html_sanitizer.rb', line 26

# Sanitizes an HTML fragment using the rules from #default_config.
#
# @param html [String] HTML markup to sanitize
# @return the result of Sanitize.fragment for the given markup
def sanitize(html)
  rules = default_config
  Sanitize.fragment(html, rules)
end

.sanitize_css(css) ⇒ Object



30
31
32
# File 'app/services/lcms/engine/html_sanitizer.rb', line 30

# Sanitizes a stylesheet using the rules from #css_config.
#
# @param css [String] stylesheet source
# @return the result of Sanitize::CSS.stylesheet for the given source
def sanitize_css(css)
  rules = css_config
  Sanitize::CSS.stylesheet(css, rules)
end

.strip_content(nodes) ⇒ Object

Removes all leading empty nodes, stopping at the first node that has content



37
38
39
40
41
42
43
# File 'app/services/lcms/engine/html_sanitizer.rb', line 37

# Drops the leading child elements of +nodes+ until one is found that
# keep_node? accepts; that element and everything after it are left alone.
#
# Children are checked and removed one at a time, in document order.
#
# @param nodes [#xpath] container whose direct child elements ('./*') are inspected
# @return [nil] when a node to keep was found; otherwise the value returned
#   by iterating the child list
def strip_content(nodes)
  children = nodes.xpath('./*')
  children.each do |child|
    if keep_node?(child)
      break
    else
      child.remove
    end
  end
end

.strip_html(html) ⇒ Object



45
46
47
48
49
50
51
# File 'app/services/lcms/engine/html_sanitizer.rb', line 45

# Parses +html+, strips its leading nodes via #strip_content and serializes
# the remainder.
#
# @param html [String, nil] HTML markup
# @return [String] '' for blank input or when serialization yields nil,
#   otherwise the serialized fragment
def strip_html(html)
  if html.blank?
    ''
  else
    fragment = Nokogiri::HTML.fragment(html)
    strip_content(fragment)
    fragment.to_html || ''
  end
end

.strip_html_element(element) ⇒ Object



53
54
55
56
57
# File 'app/services/lcms/engine/html_sanitizer.rb', line 53

# Returns +element+ only when it still has text once every tag is removed.
#
# The text is obtained with Sanitize.fragment and an empty element whitelist,
# then stripped of surrounding whitespace.
#
# @param element [String, nil] HTML markup of a single element
# @return [String] '' when +element+ is blank or has no text content,
#   otherwise +element+ itself
def strip_html_element(element)
  return '' if element.blank?

  text_only = Sanitize.fragment(element, elements: []).strip
  text_only.empty? ? '' : element
end