Class: Govspeak::Document

Inherits:

Object

Object
Govspeak::Document

show all

Defined in: lib/govspeak.rb

Constant Summary collapse

Parser =

Kramdown::Parser::Govuk

PARSER_CLASS_NAME =

Parser.name.split("::").last

UUID_REGEX =

/^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i

NEW_PARAGRAPH_LOOKBEHIND =

%q{(?<=\A|\n\n|\r\n\r\n)}.freeze

Class Attribute Summary collapse

.extensions ⇒ Object readonly

Returns the value of attribute extensions.

Instance Attribute Summary collapse

#attachments ⇒ Object readonly

Returns the value of attribute attachments.
#auto_numbered_header_levels ⇒ Object readonly

Returns the value of attribute auto_numbered_header_levels.
#auto_numbered_headers ⇒ Object readonly

Returns the value of attribute auto_numbered_headers.
#contacts ⇒ Object readonly

Returns the value of attribute contacts.
#images ⇒ Object

Returns the value of attribute images.
#links ⇒ Object readonly

Returns the value of attribute links.
#locale ⇒ Object readonly

Returns the value of attribute locale.
#log_snapshots ⇒ Object readonly

Returns the value of attribute log_snapshots.

Class Method Summary collapse

Instance Method Summary collapse

#extract_contact_content_ids ⇒ Object
#extracted_links(website_root: nil) ⇒ Object
#headers ⇒ Object
#initialize(source, options = {}) ⇒ Document constructor

A new instance of Document.
#insert_strong_inside_p(body, parser = Govspeak::Document) ⇒ Object
#preprocess(source) ⇒ Object
#remove_forbidden_characters(source) ⇒ Object
#render_image(image) ⇒ Object

As of version 1.12.0 of Kramdown, the block elements (div & figcaption) inside this HTML block will have their < > converted into HTML entities whenever this code is used inside block-level elements.
#structured_headers ⇒ Object
#to_html ⇒ Object
#to_liquid ⇒ Object
#to_text ⇒ Object
#valid?(validation_options = {}) ⇒ Boolean

Constructor Details

#initialize(source, options = {}) ⇒ `Document`

# File 'lib/govspeak.rb', line 51

# Builds a document from Govspeak source text and a hash of options.
#
# Options removed here before the remainder is merged into the parser
# options: :images, :allowed_elements, :allowed_image_hosts, :attachments,
# :auto_numbered_header_levels, :links and :contacts. Options that are read
# but left in place: :log_snapshots, :auto_numbered_headers and :locale.
#
# @param source [String, nil] Govspeak markup; a falsy value becomes ""
# @param options [Hash] string or symbol keys; a symbolized copy is used, so
#   the deletes below do not touch the caller's hash
def initialize(source, options = {})
  options = options.dup.deep_symbolize_keys
  @source = if source
              source.dup
            else
              ""
            end

  # Snapshots are taken before any key is deleted, so the "options" snapshot
  # is handed the full set of options.
  @log_snapshots = options.fetch(:log_snapshots, false)
  log_snapshot("options", options)
  log_snapshot("source", @source)

  # Non-destructive reads: these keys remain in the hash merged below.
  @auto_numbered_headers = options.fetch(:auto_numbered_headers, false)
  @locale = options.fetch(:locale, "en")

  # Destructive reads: these keys are kept out of the parser options.
  @images = options.delete(:images) || []
  @allowed_elements = options.delete(:allowed_elements) || []
  @allowed_image_hosts = options.delete(:allowed_image_hosts) || []
  @auto_numbered_header_levels = options.delete(:auto_numbered_header_levels) || [2, 3, 4, 5, 6]
  @attachments = Array.wrap(options.delete(:attachments))
  @links = Array.wrap(options.delete(:links))
  @contacts = Array.wrap(options.delete(:contacts))

  # Caller options override the defaults; :entity_output is always forced.
  defaults = { input: PARSER_CLASS_NAME, sanitize: true, syntax_highlighter: nil }
  @options = defaults.merge(options, { entity_output: :symbolic })
end

Class Attribute Details

.extensions ⇒ `Object` (readonly)

Returns the value of attribute extensions.



48
49
50

# File 'lib/govspeak.rb', line 48

# Class-level list of registered extensions; .extension appends
# [title, regexp, block] entries to it.
def extensions = @extensions

Instance Attribute Details

#attachments ⇒ `Object` (readonly)

Returns the value of attribute attachments.



41
42
43

# File 'lib/govspeak.rb', line 41

# Attachments given through the :attachments option; #initialize stores the
# result of Array.wrap on that value.
def attachments = @attachments

#auto_numbered_header_levels ⇒ `Object` (readonly)

Returns the value of attribute auto_numbered_header_levels.



41
42
43

# File 'lib/govspeak.rb', line 41

# Value of the :auto_numbered_header_levels option; #initialize falls back to
# [2, 3, 4, 5, 6] when the option is absent.
def auto_numbered_header_levels = @auto_numbered_header_levels

#auto_numbered_headers ⇒ `Object` (readonly)

Returns the value of attribute auto_numbered_headers.



41
42
43

# File 'lib/govspeak.rb', line 41

# Value of the :auto_numbered_headers option; #initialize defaults it to false.
def auto_numbered_headers = @auto_numbered_headers

#contacts ⇒ `Object` (readonly)

Returns the value of attribute contacts.



41
42
43

# File 'lib/govspeak.rb', line 41

# Contacts given through the :contacts option; #initialize stores the result
# of Array.wrap on that value.
def contacts = @contacts

#images ⇒ `Object`

Returns the value of attribute images.



40
41
42

# File 'lib/govspeak.rb', line 40

# Value of the :images option; #initialize falls back to [] when it is absent.
def images = @images

#links ⇒ `Object` (readonly)

Returns the value of attribute links.



41
42
43

# File 'lib/govspeak.rb', line 41

# Links given through the :links option; #initialize stores the result of
# Array.wrap on that value.
def links = @links

#locale ⇒ `Object` (readonly)

Returns the value of attribute locale.



41
42
43

# File 'lib/govspeak.rb', line 41

# Value of the :locale option; #initialize defaults it to "en".
def locale = @locale

#log_snapshots ⇒ `Object` (readonly)

Returns the value of attribute log_snapshots.



41
42
43

# File 'lib/govspeak.rb', line 41

# Value of the :log_snapshots option; #initialize defaults it to false.
def log_snapshots = @log_snapshots

Class Method Details

.extension(title, regexp = nil, &block) ⇒ `Object`

# File 'lib/govspeak.rb', line 143

# Registers an extension by appending [title, regexp, block] to the
# class-level @extensions list.
#
# When no regexp is supplied, a default one is built that matches
# `{::title}...{:/title}` with the enclosed text as its only capture; the `m`
# flag lets that text span several lines. The title is interpolated into the
# pattern as-is.
#
# @param title [String] extension name
# @param regexp [Regexp, nil] pattern to match, or nil for the default
# @param block [Proc] handler stored alongside the pattern
# @return [Array] the @extensions list after the append
def self.extension(title, regexp = nil, &block)
  pattern = regexp || %r${::#{title}}(.*?){:/#{title}}$m
  @extensions.push([title, pattern, block])
end

.surrounded_by(open, close = nil) ⇒ `Object`

# File 'lib/govspeak.rb', line 148

# Builds a multiline regexp matching text wrapped in delimiter strings.
#
# Both delimiters are passed through Regexp.escape. The match must start at a
# carriage return, a newline or the beginning of a line, and the wrapped text
# is the first capture.
#
# With only `open` given, the same delimiter is used to close. Note that the
# `?` following it applies to the last character of the escaped delimiter
# only, and a line ending (or end of line) is required afterwards. With
# `close` given, the closing delimiter is mandatory and the trailing line
# ending is optional.
#
# @param open [String] opening delimiter
# @param close [String, nil] closing delimiter
# @return [Regexp]
def self.surrounded_by(open, close = nil)
  opener = Regexp.escape(open)
  return %r{(?:\r|\n|^)#{opener}(.*?)#{opener}? *(\r|\n|$)}m unless close

  closer = Regexp.escape(close)
  %r{(?:\r|\n|^)#{opener}(.*?)#{closer} *(\r|\n|$)?}m
end

.to_html(source, options = {}) ⇒ `Object`



43
44
45

# File 'lib/govspeak.rb', line 43

# Convenience wrapper: constructs a document from the given source and
# options, then returns the result of its #to_html.
#
# @param source [String, nil] passed to .new
# @param options [Hash] passed to .new
def self.to_html(source, options = {})
  document = new(source, options)
  document.to_html
end

.wrap_with_div(class_name, character, parser = Kramdown::Document) ⇒ `Object`

# File 'lib/govspeak.rb', line 158

# Registers an extension (through .extension) for text delimited by
# `character` (pattern from .surrounded_by) that replaces the match with a
# <div> carrying `class_name`.
#
# The captured body is stripped. With a truthy `parser`, the stripped body
# plus a trailing newline is rendered via `parser.new(..., locale: @locale)`
# and its #to_html; otherwise the stripped body is inserted unchanged.
# @locale is read when the registered block runs, so its value depends on
# the object the block is executed against.
#
# @param class_name [String] extension title and CSS class of the <div>
# @param character [String] delimiter handed to .surrounded_by
# @param parser [Class, nil] renderer for the body, or nil/false for none
def self.wrap_with_div(class_name, character, parser = Kramdown::Document)
  extension(class_name, surrounded_by(character)) do |body|
    stripped = body.strip
    content = parser ? parser.new("#{stripped}\n", locale: @locale).to_html : stripped
    %(\n<div class="#{class_name}">\n#{content}</div>\n)
  end
end

Instance Method Details

#extract_contact_content_ids ⇒ `Object`

# File 'lib/govspeak.rb', line 117

# Collects the IDs referenced by Contact embeds in the raw source.
#
# Finds the extension registered under the title "Contact", scans @source
# with its pattern and keeps each distinct first capture that matches
# UUID_REGEX.
#
# @return [Array<String>] distinct matching IDs; empty when no "Contact"
#   extension is registered
def extract_contact_content_ids
  contact_extension = self.class.extensions.find { |(title)| title == "Contact" }
  _, pattern = contact_extension
  return [] unless pattern

  candidate_ids = @source.scan(pattern).map(&:first).uniq
  candidate_ids.select { |candidate| candidate.match?(UUID_REGEX) }
end

#extracted_links(website_root: nil) ⇒ `Object`



113
114
115

# File 'lib/govspeak.rb', line 113

# Runs Govspeak::LinkExtractor against this document and returns whatever
# its #call returns.
#
# @param website_root [Object, nil] forwarded unchanged to the extractor
def extracted_links(website_root: nil)
  extractor = Govspeak::LinkExtractor.new(self, website_root:)
  extractor.call
end

#headers ⇒ `Object`



105
106
107

# File 'lib/govspeak.rb', line 105

# Passes the parsed document to Govspeak::HeaderExtractor.convert and returns
# the first element of the result.
def headers
  converted = Govspeak::HeaderExtractor.convert(kramdown_doc)
  converted.first
end

#insert_strong_inside_p(body, parser = Govspeak::Document) ⇒ `Object`

# File 'lib/govspeak.rb', line 169

# Renders `body` with the given parser and wraps the contents of the first
# line-anchored <p>...</p> in <strong>.
#
# Only the first match is rewritten; output without such a paragraph is
# returned as rendered.
#
# @param body [String] markup to render; stripped before parsing
# @param parser [Class] must respond to `new(text, locale:)` and return an
#   object that responds to #to_html
# @return [String] rendered HTML
def insert_strong_inside_p(body, parser = Govspeak::Document)
  rendered = parser.new(body.strip, locale: @locale).to_html
  rendered.sub(/^<p>(.*)<\/p>$/, "<p><strong>\\1</strong></p>")
end

#preprocess(source) ⇒ `Object`

# File 'lib/govspeak.rb', line 124

# Applies every registered extension to the source before it is parsed.
#
# The source first goes through Govspeak::BlockquoteExtraQuoteRemover.remove
# and #remove_forbidden_characters. Each extension's pattern is then
# substituted in registration order: the handler block is run with
# instance_exec, so it executes with this document as `self` and receives
# the match's capture groups as arguments. The final text is logged under
# "after preprocess" and returned.
#
# @param source [String] raw Govspeak markup
# @return [String] the preprocessed markup
def preprocess(source)
  cleaned = remove_forbidden_characters(
    Govspeak::BlockquoteExtraQuoteRemover.remove(source),
  )

  self.class.extensions.each do |_title, pattern, handler|
    cleaned.gsub!(pattern) { instance_exec(*Regexp.last_match.captures, &handler) }
  end

  log_snapshot("after preprocess", cleaned)
  cleaned
end

#remove_forbidden_characters(source) ⇒ `Object`

# File 'lib/govspeak.rb', line 137

# Returns a copy of `source` with every match of
# Sanitize::REGEX_UNSUITABLE_CHARS removed.
#
# @param source [String]
# @return [String]
def remove_forbidden_characters(source)
  # These are characters that are deemed not suitable for
  # markup: https://www.w3.org/TR/unicode-xml/#Charlist
  unsuitable = Sanitize::REGEX_UNSUITABLE_CHARS
  source.gsub(unsuitable, "")
end

#render_image(image) ⇒ `Object`

As of version 1.12.0 of Kramdown, the block elements (div & figcaption) inside this HTML block will have their < > converted into HTML entities whenever this code is used inside block-level elements.

To resolve this, we have a post-processing task that converts them back into HTML (I know - it’s ugly). The way to resolve this without the ugliness would be to output only inline elements, which rules out div and figcaption.

This issue is not considered a bug by kramdown: github.com/gettalong/kramdown/issues/191

# File 'lib/govspeak.rb', line 287

# Builds the <figure> markup for an embedded image as one string with no
# separators between its parts.
#
# The figure gets an `id="attachment_<id>"` attribute only when `image.id`
# is truthy. The URL and alt text are passed through `encode`; the
# figcaption HTML is appended only when `image.figcaption?` is truthy.
#
# @param image [#id, #url, #alt_text, #figcaption?, #figcaption_html]
# @return [String]
def render_image(image)
  id_attr = image.id ? %( id="attachment_#{image.id}") : ""

  [
    %(<figure#{id_attr} class="image embedded">),
    %(<div class="img"><img src="#{encode(image.url)}" alt="#{encode(image.alt_text)}"></div>),
    (image.figcaption_html if image.figcaption?),
    "</figure>",
  ].join
end

#structured_headers ⇒ `Object`



109
110
111

# File 'lib/govspeak.rb', line 109

# Runs Govspeak::StructuredHeaderExtractor against this document and returns
# whatever its #call returns.
def structured_headers
  extractor = Govspeak::StructuredHeaderExtractor.new(self)
  extractor.call
end

#to_html ⇒ `Object`

# File 'lib/govspeak.rb', line 73

# Renders the document to HTML, caching the result in @to_html.
#
# The Kramdown output is passed through HtmlSanitizer when the :sanitize
# option is truthy (using the allowed image hosts and allowed elements given
# at construction), then through Govspeak::PostProcessor. Snapshots are
# logged after each of those two stages.
#
# @return [String] the post-processed HTML
def to_html
  return @to_html if @to_html

  sanitizing = @options[:sanitize]
  rendered = kramdown_doc.to_html
  if sanitizing
    sanitizer = HtmlSanitizer.new(rendered, allowed_image_hosts: @allowed_image_hosts)
    rendered = sanitizer.sanitize(allowed_elements: @allowed_elements)
  end
  log_snapshot("after Kramdown process", rendered)

  processed = Govspeak::PostProcessor.process(rendered, self)
  log_snapshot("after postprocess", processed)
  @to_html = processed
end

#to_liquid ⇒ `Object`



90
91
92

# File 'lib/govspeak.rb', line 90

# Returns the same value as #to_html.
def to_liquid = to_html

#to_text ⇒ `Object`



94
95
96

# File 'lib/govspeak.rb', line 94

# Plain-text rendering of the document.
#
# Every run of tags and/or whitespace in the HTML output is collapsed to a
# single space, the result is stripped, and HTML entities are then decoded
# with HTMLEntities.
#
# @return [String]
def to_text
  decoder = HTMLEntities.new
  flattened = to_html.gsub(/(?:<[^>]+>|\s)+/, " ").strip
  decoder.decode(flattened)
end

#valid?(validation_options = {}) ⇒ `Boolean`

# File 'lib/govspeak.rb', line 98

# Validates the raw source with Govspeak::HtmlValidator.
#
# The document's locale is merged into the validation options last, so it
# replaces any :locale key the caller supplies.
#
# @param validation_options [Hash] extra options for the validator
# @return [Boolean] the validator's verdict
def valid?(validation_options = {})
  validator_options = validation_options.merge(locale: @locale)
  Govspeak::HtmlValidator.new(@source, validator_options).valid?
end

Class: Govspeak::Document

Constant Summary collapse

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source, options = {}) ⇒ Document

Class Attribute Details

.extensions ⇒ Object (readonly)

Instance Attribute Details

#attachments ⇒ Object (readonly)

#auto_numbered_header_levels ⇒ Object (readonly)

#auto_numbered_headers ⇒ Object (readonly)

#contacts ⇒ Object (readonly)

#images ⇒ Object

#links ⇒ Object (readonly)

#locale ⇒ Object (readonly)

#log_snapshots ⇒ Object (readonly)

Class Method Details

.extension(title, regexp = nil, &block) ⇒ Object

.surrounded_by(open, close = nil) ⇒ Object

.to_html(source, options = {}) ⇒ Object

.wrap_with_div(class_name, character, parser = Kramdown::Document) ⇒ Object

Instance Method Details

#extract_contact_content_ids ⇒ Object

#extracted_links(website_root: nil) ⇒ Object

#headers ⇒ Object

#insert_strong_inside_p(body, parser = Govspeak::Document) ⇒ Object

#preprocess(source) ⇒ Object

#remove_forbidden_characters(source) ⇒ Object

#render_image(image) ⇒ Object

#structured_headers ⇒ Object

#to_html ⇒ Object

#to_liquid ⇒ Object

#to_text ⇒ Object

#valid?(validation_options = {}) ⇒ Boolean