Class: TruncatedSaxDocument

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/abbreviato/truncated_sax_document.rb

Constant Summary collapse

IGNORABLE_TAGS =
%w[html head body].freeze
VOID_TAGS =

Void elements: these tags never take a closing tag, which also affects the running length calculation. See www.456bereastreet.com/archive/201005/void_empty_elements_and_self-closing_start_tags_in_html/

%w[area base br col command hr img input keygen link meta param source wbr].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ TruncatedSaxDocument

Returns a new instance of TruncatedSaxDocument.



17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/abbreviato/truncated_sax_document.rb', line 17

# Builds a SAX handler with empty output and zeroed bookkeeping.
#
# @param options [Hash] read via +[]+ with these keys:
#   :max_length - stored as-is in @max_length
#   :tail       - stored in @tail; falls back to '' when nil/false
#   :fragment   - stored as-is in @fragment_mode
def initialize(options)
  # Caller-supplied configuration
  @max_length = options[:max_length]
  @fragment_mode = options[:fragment]
  @tail = options[:tail] || ''

  # Collaborator used by #characters to entity-encode text
  @html_coder = HTMLEntities.new

  # Output accumulated so far, plus the tags still waiting to be closed
  @truncated_string = ""
  @closing_tags = []

  # Counters and flags all start from a clean slate
  @ignored_levels = 0
  @estimated_length = 0
  @truncated = false
end

Instance Attribute Details

#ignored_levels ⇒ Object (readonly)

Returns the value of attribute ignored_levels.



11
12
13
# File 'lib/abbreviato/truncated_sax_document.rb', line 11

# Reader for @ignored_levels, which the constructor initialises to 0.
# NOTE(review): presumably a nesting depth maintained by
# enter_ignored_level / exit_ignored_level - confirm against those helpers.
def ignored_levels
  @ignored_levels
end

#max_length ⇒ Object (readonly)

Returns the value of attribute max_length.



11
12
13
# File 'lib/abbreviato/truncated_sax_document.rb', line 11

# Reader for @max_length, which the constructor takes unchanged from
# options[:max_length].
def max_length
  @max_length
end

#tail ⇒ Object (readonly)

Returns the value of attribute tail.



11
12
13
# File 'lib/abbreviato/truncated_sax_document.rb', line 11

# Reader for @tail: the constructor stores options[:tail] here, or '' when
# that option is nil/false. #characters appends it after truncated text
# when there is still room for it.
def tail
  @tail
end

#truncated ⇒ Object (readonly)

Returns the value of attribute truncated.



11
12
13
# File 'lib/abbreviato/truncated_sax_document.rb', line 11

# Reader for @truncated. Starts as false in the constructor and is set to
# true by the SAX callbacks whenever content is dropped or cut short.
def truncated
  @truncated
end

#truncated_string ⇒ Object (readonly)

Returns the value of attribute truncated_string.



11
12
13
# File 'lib/abbreviato/truncated_sax_document.rb', line 11

# Reader for @truncated_string, which the constructor initialises to "".
# NOTE(review): presumably the output buffer that append_to_truncated_string
# writes into - confirm against that helper.
def truncated_string
  @truncated_string
end

Instance Method Details

#cdata_block(string) ⇒ Object

This method is called when the parser encounters cdata. In practice, this also gets called for this style of comment inside an element:

<style><!--
  /* Font Definitions */
  @font-face
    {font-family:Wingdings;
    panose-1:5 0 0 0 0 0 0 0 0 0;}
--></style>


100
101
102
103
104
105
106
# File 'lib/abbreviato/truncated_sax_document.rb', line 100

# SAX callback for a CDATA section. The section is emitted whole or not at
# all: when its byte size exceeds the remaining budget it is dropped and the
# document is flagged as truncated.
#
# @param string [String] raw CDATA content
def cdata_block(string)
  fits = string.bytesize <= remaining_length
  return append_to_truncated_string(string) if fits

  @truncated = true
end

#characters(decoded_string) ⇒ Object

This method is called when the parser encounters characters between tags



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/abbreviato/truncated_sax_document.rb', line 61

# SAX callback for text between tags.
#
# Text is skipped (and the document flagged as truncated) once the limit has
# been reached or while inside an ignored element. Otherwise the text is
# entity-encoded and appended if it fits; if not, a truncated form of the
# decoded text is appended instead, followed by the tail when room remains.
#
# @param decoded_string [String] text with entities already decoded
def characters(decoded_string)
  if max_length_reached? || ignore_mode?
    @truncated = true
    return
  end

  # Measure the encoded form, so &gt; costs 4 bytes rather than the 1 byte of '>'
  encoded = @html_coder.encode(decoded_string, :named)
  return append_to_truncated_string(encoded) if encoded.bytesize <= remaining_length

  # Cut the decoded text rather than the encoded one, so an entity is treated
  # as a single character and is never split in half
  clipped = truncate_string(decoded_string)
  room_after_clip = remaining_length - clipped.bytesize
  clipped << tail if room_after_clip >= tail.bytesize
  append_to_truncated_string(clipped)
end

#comment(string) ⇒ Object

This method is called when the parser encounters a comment



81
82
83
84
85
86
87
88
# File 'lib/abbreviato/truncated_sax_document.rb', line 81

# SAX callback for a comment. The comment markup is emitted whole or not at
# all: if its byte size exceeds the remaining budget it is dropped and the
# document is flagged as truncated.
#
# @param string [String] the comment's content
def comment(string)
  markup = comment_tag(string)

  if markup.bytesize > remaining_length
    @truncated = true
  else
    append_to_truncated_string(markup)
  end
end

#end_document ⇒ Object



126
127
128
# File 'lib/abbreviato/truncated_sax_document.rb', line 126

# SAX callback fired when parsing finishes. Closes every tag that is still
# open, innermost first. Each closing tag is appended with a length of 0,
# the same convention #end_element uses for closing tags.
def end_document
  @closing_tags.reverse_each do |tag_name|
    markup = closing_tag(tag_name)
    append_to_truncated_string(markup, 0)
  end
end

#end_element(name) ⇒ Object

This method is called when the parser encounters a closing tag



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/abbreviato/truncated_sax_document.rb', line 109

# SAX callback for a closing tag.
#
# While in ignore mode the call only unwinds one ignored level. Otherwise
# the closing tag is written unless the limit has been reached, the tag is
# ignorable, or the element is a single-tag element.
#
# @param name [String] element name
def end_element(name)
  if ignore_mode?
    exit_ignored_level(name)
    return
  end

  # Tags still open when the limit is hit are closed later by `end_document`
  return if max_length_reached?
  return if ignorable_tag?(name)
  return if single_tag_element?(name)

  @closing_tags.pop
  # Length 0: the closing tag's size was already accounted for when the tag was opened
  append_to_truncated_string(closing_tag(name), 0)
end

#start_element(name, attributes) ⇒ Object

This method is called when the parser encounters an open tag



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/abbreviato/truncated_sax_document.rb', line 32

# SAX callback for an opening tag.
#
# Nothing is written when the limit has already been reached (the document
# is then flagged as truncated) or when the tag is ignorable. While in
# ignore mode the call only descends one more ignored level. Otherwise the
# opening tag is written if its overridden length fits in the remaining
# budget; if it does not, the document is flagged as truncated and ignore
# mode is entered for this element.
#
# @param name [String] element name
# @param attributes [Array] attributes, passed straight through to opening_tag
def start_element(name, attributes)
  if max_length_reached?
    @truncated = true
    return
  end
  return if ignorable_tag?(name)

  # Already ignoring an ancestor: just go one level deeper
  if ignore_mode?
    enter_ignored_level(name)
    return
  end

  markup = opening_tag(name, attributes)
  # Cost covers the opening tag and (potentially) its closing tag
  cost = overridden_tag_length(name, markup)

  if cost <= remaining_length
    # Remember the tag so it can be closed later
    @closing_tags.push name unless single_tag_element?(name)
    append_to_truncated_string(markup, cost)
  else
    # Not enough room for the tag pair: skip this element entirely
    @truncated = true
    enter_ignored_level(name)
    nil
  end
end