Class: Zaid::LexerComponents::Tokenizer

Inherits: Object
Includes: Keywords
Defined in: lib/zaid/lexer_components/tokenizer.rb

Constant Summary

KEYWORDS_MAPPING =
{
  AND => :AND,
  CLASS => :CLASS,
  DIVIDE => :DIVIDE,
  ELSE => :ELSE,
  EQUALS => :EQUALS,
  FALSE => :FALSE,
  GREATER => :GREATER,
  IF => :IF,
  IS => :IS,
  IT_IS => :IT_IS,
  LESS => :LESS,
  METHOD => :METHOD,
  MINUS => :MINUS,
  NIL => :NIL,
  NOT => :NOT,
  OR => :OR,
  PLUS => :PLUS,
  RECEIVE => :RECEIVE,
  THAN => :THAN,
  THEN => :THEN,
  TIMES => :TIMES,
  TRUE => :TRUE,
  WAS => :WAS,
  WHILE => :WHILE
}.freeze
COMMENT_PREFIXES =
['#', 'تعليق:', 'ملاحظة:', 'سؤال:'].freeze # the Arabic prefixes read "comment:", "note:", and "question:"
INDENT_KEYWORDS =
[THEN, IS, ELSE, IT_IS].freeze
ARABIC_CHARACTERS =
'ابتةثجحخدذرزسشصضطظعغفقكلمنهوىيءآأؤإئ'
ARABIC_DIGITS =
'٠١٢٣٤٥٦٧٨٩'
ENGLISH_DIGITS =
'0123456789'
DIGITS =
[ARABIC_DIGITS, ENGLISH_DIGITS].join
TOKEN_PATTERNS =
[
  { pattern: /\G((#{Regexp.union(COMMENT_PREFIXES)}).*$)/, type: :comment },
  { pattern: /\G([#{ARABIC_CHARACTERS}_ـ][#{ARABIC_CHARACTERS}#{DIGITS}_ـ]*؟?)/, type: :identifier },
  { pattern: /\G([#{DIGITS}]+\.[#{DIGITS}]+)/, type: :float },
  { pattern: /\G([#{DIGITS}]+)/, type: :number },
  { pattern: /\G"([^"]*)"/, type: :string },
  { pattern: /\G\n( *)/m, type: :dedent },
  { pattern: /\G(\|\||&&|==|!=|<=|>=|<|>)/, type: :operator },
  { pattern: /\G(.)/, type: :single_character }
].freeze
INDENT_PATTERN =
/\G\n( +)/m
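
Each TOKEN_PATTERNS entry is anchored with \G, which, when a pattern is applied via Regexp#match(str, pos), matches exactly at pos; the tokenizer can therefore try the patterns in order at the current scan position without slicing the string. Order matters here, since the catch-all single_character entry would shadow every pattern after it. An :identifier whose text equals one of the Keywords constants can then be re-tagged through KEYWORDS_MAPPING. A minimal sketch of the first-match idea (first_match_at is a hypothetical helper, not the gem's parse_token, which also handles keywords and the indent stack):

# Hypothetical helper illustrating \G-anchored, first-match scanning.
def first_match_at(code, position)
  TOKEN_PATTERNS.each do |entry|
    match = entry[:pattern].match(code, position)
    next unless match

    # Token type, captured text, and how many characters were consumed.
    return [entry[:type], match[1], match[0].size]
  end
end

first_match_at('س = ٥', 0) # => [:identifier, "س", 1]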

Constants included from Keywords

Keywords::AND, Keywords::CLASS, Keywords::DIVIDE, Keywords::ELSE, Keywords::EQUALS, Keywords::FALSE, Keywords::GREATER, Keywords::IF, Keywords::IS, Keywords::IT_IS, Keywords::LESS, Keywords::METHOD, Keywords::MINUS, Keywords::NIL, Keywords::NOT, Keywords::OR, Keywords::PLUS, Keywords::RECEIVE, Keywords::THAN, Keywords::THEN, Keywords::TIMES, Keywords::TRUE, Keywords::WAS, Keywords::WHILE
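
The :dedent entry in TOKEN_PATTERNS and INDENT_PATTERN both match a newline followed by leading spaces; INDENT_PATTERN additionally requires at least one space. Presumably parse_token compares the captured width against the indent stack, with INDENT_KEYWORDS marking the keywords after which a deeper indentation level is expected. A hedged sketch of the usual stack discipline (track_indent is a hypothetical helper, not the gem's code; the real bookkeeping lives in parse_token):

# Hypothetical sketch of indent-stack bookkeeping after a newline match.
def track_indent(width, tokens, indent_stack)
  if width > (indent_stack.last || 0)
    # Deeper indentation opens a new block.
    indent_stack.push(width)
    tokens << [:INDENT, width]
  else
    # Shallower indentation closes blocks until the widths agree.
    tokens << [:DEDENT, indent_stack.pop] while (indent_stack.last || 0) > width
  end
end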

Instance Method Summary

Instance Method Details

#tokenize(code, run_compression: true) ⇒ Object



# File 'lib/zaid/lexer_components/tokenizer.rb', line 58

def tokenize(code, run_compression: true)
  code = code.chomp

  tokens = []
  indent_stack = []

  # Scan left to right; parse_token appends to tokens (and maintains
  # indent_stack) and returns the number of characters it consumed.
  parsing_position = 0
  parsing_position += parse_token(code, tokens, indent_stack, parsing_position) while parsing_position < code.size

  # Emit a DEDENT for every indentation level still open at end of input.
  tokens << [:DEDENT, indent_stack.last || 0] while indent_stack.pop

  run_compression ? Compressor.new.compress(tokens) : tokens
end
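
A usage sketch (assuming the gem loads via require 'zaid'; the token shapes below are illustrative, since the exact output depends on parse_token and on the Compressor):

require 'zaid'

tokenizer = Zaid::LexerComponents::Tokenizer.new
tokens = tokenizer.tokenize('س = ٥', run_compression: false)
# Tokens come back as [type, value] pairs, e.g. an :identifier for "س",
# :single_character entries for the spaces and "=", and a :number for "٥".
# Passing run_compression: false skips the Compressor pass that normally
# post-processes the raw token stream.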