Class: Gammo::Tokenizer

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Escape
Defined in:
lib/gammo/tokenizer.rb,
lib/gammo/tokenizer/debug.rb,
lib/gammo/tokenizer/entity.rb,
lib/gammo/tokenizer/escape.rb,
lib/gammo/tokenizer/tokens.rb,
lib/gammo/tokenizer/script_scanner.rb

Overview

Implements the HTML5 tokenization algorithm.

Defined Under Namespace

Modules: Debug, Entity, Escape

Classes: BaseToken, EscapedToken, ScriptScanner, Tag

Constant Summary collapse

EOS =

Represents end-of-string.

ErrorToken.new('end of string')
ErrorToken =
Class.new(BaseToken)
TextToken =
Class.new(EscapedToken)
StartTagToken =
Class.new(BaseToken)
EndTagToken =
Class.new(BaseToken)
SelfClosingTagToken =
Class.new(BaseToken)
CommentToken =
Class.new(EscapedToken)
DoctypeToken =
Class.new(EscapedToken)

Constants included from Escape

Escape::ESCAPE_REPLACEMENT_TABLE, Escape::LONGEST_ENTITY_WITHOUT_SEMICOLON, Escape::REPLACEMENT_TABLE

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Escape

#escape, #unescape

Constructor Details

#initialize(text, context: nil) ⇒ Tokenizer

Returns a new instance of Tokenizer.



26
27
28
29
30
31
32
33
# File 'lib/gammo/tokenizer.rb', line 26

# Builds a tokenizer over +text+.
#
# The scanner always works on a string tagged as UTF-8. When +text+ already
# carries that encoding it is used as-is; otherwise a re-tagged copy is
# scanned. The argument itself is never passed to +force_encoding+, so the
# caller's string is left untouched and frozen strings are accepted instead
# of raising FrozenError.
#
# @param text [String] the source to tokenize.
# @param context [String, nil] optional tag name; when its lowercased form
#   is accepted by +raw_tag?+ it becomes the initial raw tag, otherwise the
#   raw tag starts out empty.
def initialize(text, context: nil)
  source = text.encoding == Encoding::UTF_8 ? text : text.dup.force_encoding(Encoding::UTF_8)
  # Keep @text and the scanner's string as the same object, as before.
  @text          = source
  @scanner       = StringScanner.new(source)
  @raw_tag       = context && raw_tag?(context.downcase) ? context.downcase : ''
  @convert_null  = false
  @cdata_allowed = false
  @raw           = false
end

Instance Attribute Details

#convert_null ⇒ Object

Returns the value of attribute convert_null.



21
22
23
# File 'lib/gammo/tokenizer.rb', line 21

# Reader for the +@convert_null+ flag. The constructor initializes it to
# +false+ and #next_token resets it to +false+ before scanning.
def convert_null
  @convert_null
end

#raw ⇒ Object

Returns the value of attribute raw.



21
22
23
# File 'lib/gammo/tokenizer.rb', line 21

# Reader for the +@raw+ flag. The constructor initializes it to +false+ and
# #next_token resets it to +false+ before scanning.
def raw
  @raw
end

#raw_tag ⇒ Object

Returns the value of attribute raw_tag.



21
22
23
# File 'lib/gammo/tokenizer.rb', line 21

# Reader for +@raw_tag+: the lowercased raw tag name currently in effect, or
# an empty string when there is none.
def raw_tag
  @raw_tag
end

#scanner ⇒ Object

Returns the value of attribute scanner.



21
22
23
# File 'lib/gammo/tokenizer.rb', line 21

# Reader for +@scanner+, the StringScanner created over the input text by
# the constructor.
def scanner
  @scanner
end

Instance Method Details

#allow_cdata!(b) ⇒ Object



35
36
37
# File 'lib/gammo/tokenizer.rb', line 35

# Records whether CDATA is allowed, normalizing the argument to a strict
# boolean.
#
# @param b [Object] any value; only its truthiness is kept.
# @return [Boolean] the value stored in +@cdata_allowed+.
def allow_cdata!(b)
  @cdata_allowed = b ? true : false
end

#allow_cdata? ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
# File 'lib/gammo/tokenizer.rb', line 39

# Reports the +@cdata_allowed+ flag, which starts as +false+ and is changed
# through #allow_cdata!.
#
# @return [Boolean]
def allow_cdata?
  @cdata_allowed
end

#next_is_not_raw_text! ⇒ Object



96
97
98
# File 'lib/gammo/tokenizer.rb', line 96

# Clears the pending raw tag by resetting +@raw_tag+ to an empty string.
# After this, #previous_token_is_raw_tag? is false, so the next call to
# #next_token does not go through +next_token_for_raw_tag+.
def next_is_not_raw_text!
  @raw_tag = ''
end

#next_token ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/gammo/tokenizer.rb', line 47

# Scans the input from the current scanner position and returns the next
# token.
#
# Returns +EOS+ when the scanner is already exhausted. While a raw tag is
# pending (#previous_token_is_raw_tag?), +next_token_for_raw_tag+ gets the
# first chance to produce the token. Otherwise bytes are read one at a time
# and accumulated as text until a "<" followed by a recognized character
# hands control to one of the tag/comment/declaration helpers.
#
# @return [Object] whatever the invoked helper builds (+text_token+,
#   +scan_start_tag+, +scan_markup_declaration+, +comment_token+,
#   +end_tag_token+, +error_token+), or +EOS+.
def next_token
  return EOS if scanner.eos?
  # A truthy result from next_token_for_raw_tag is returned directly; a falsy
  # one falls through to the regular scan below.
  if previous_token_is_raw_tag? && (token = next_token_for_raw_tag)
    return token
  end
  @raw          = false
  @convert_null = false
  # Start offset of this token; used to detect text consumed before a "<".
  pos = scanner.pos
  # Accumulates the bytes consumed so far for a potential text token.
  buffer = ''.force_encoding(Encoding::ASCII)
  loop do
    # get_byte returns nil at end of input, which ends the loop.
    break unless byte = scanner.get_byte
    buffer << byte
    # Keep accumulating until a "<" is seen.
    next if byte != ?<
    # Read the byte that follows "<" to decide what kind of markup this is.
    break unless byte = scanner.get_byte
    buffer << byte
    # More than the two bytes "<x" have been consumed since +pos+, so there is
    # text in front of this "<". Rewind the scanner by those two bytes, drop
    # them from the buffer, and emit the preceding text first; the markup is
    # rescanned on the next call.
    if pos < (scanner.pos - 2)
      scanner.pos -= 2
      buffer = buffer.slice(0, buffer.length - 2)
      return text_token(buffer)
    end
    case byte
    when %r{[a-zA-Z]}
      # "<" + letter: start tag. NOTE(review): step_back presumably un-reads
      # the letter so scan_start_tag sees it; confirm against its definition.
      step_back
      return scan_start_tag
    when ?!           then return scan_markup_declaration
    when ??           then return comment_token(?? + scan_until_close_angle)
    when ?/
      # Input ends right after "</": hand the buffered bytes back as text.
      return text_token(buffer) if scanner.eos?
      # "</>" does not generate a token at all. treat this as empty comment token.
      return comment_token('') if scan(/>/)
      # Expects chars like "</a"
      return comment_token(scan_until_close_angle) unless check(/[a-zA-Z]/)
      begin
        tag = scan_tag(need_attribute: false)
      rescue EOSError
        # Input ran out while scanning the end tag.
        return EOS
      end
      return error_token(pos) if tag.nil?
      return end_tag_token(tag)
    else
      # "<" followed by any other byte: the "<" stays in the buffer as text,
      # while the following byte is removed from the buffer and handed to
      # step_back so the loop can examine it again.
      step_back
      buffer = buffer.slice(0, buffer.length - 1)
      next
    end
  end
  # Input ended inside the loop: emit whatever was consumed as text, if any.
  return text_token(buffer) if pos < scanner.pos
  EOS
end

#previous_token_is_raw_tag? ⇒ Boolean

Returns:

  • (Boolean)


43
44
45
# File 'lib/gammo/tokenizer.rb', line 43

# Tells whether a raw tag is currently pending, i.e. whether #raw_tag is
# non-empty.
#
# @return [Boolean]
def previous_token_is_raw_tag?
  !raw_tag.empty?
end