Class: PatternRuby::PatternCompiler

Inherits:
Object
  • Object
show all
Defined in:
lib/pattern_ruby/pattern_compiler.rb

Defined Under Namespace

Classes: AlternationToken, EntityToken, LiteralToken, OptionalToken

Constant Summary collapse

MAX_PATTERN_LENGTH =

Maximum allowed length of a pattern string, in characters; longer patterns are rejected to guard against ReDoS and excessive compilation cost.

10_000

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(entity_registry: nil) ⇒ PatternCompiler

Returns a new instance of PatternCompiler.



23
24
25
# File 'lib/pattern_ruby/pattern_compiler.rb', line 23

# Builds a compiler, optionally bound to an entity registry.
#
# @param entity_registry [Object, nil] stored unchanged in @entity_registry;
#   the constructor neither inspects nor validates it
def initialize(entity_registry: nil)
  @entity_registry = entity_registry
end

Class Method Details

.validate!(pattern_string) ⇒ Object



27
28
29
# File 'lib/pattern_ruby/pattern_compiler.rb', line 27

# Class-level convenience wrapper: instantiates a compiler with default
# arguments and delegates to the instance-level #validate!.
#
# @param pattern_string [Object] candidate pattern, passed through unchanged
# @return [Object] whatever the instance-level #validate! returns
def self.validate!(pattern_string)
  new.validate!(pattern_string)
end

Instance Method Details

#compile(pattern_string) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/pattern_ruby/pattern_compiler.rb', line 54

# Compiles a pattern string into a CompiledPattern wrapping a
# case-insensitive regex anchored to the whole input (\A ... \z), with
# optional surrounding whitespace.
#
# Top-level parts are joined with mandatory whitespace (\s+). An optional
# part carries its own separator inside its optional group so that skipping
# the part also skips the separator:
#
# * optional parts before the first required part are emitted as
#   (?:part\s+)? (separator trails the part);
# * optional parts after it are emitted as (?:\s+part)? (separator leads).
#
# @param pattern_string [String] pattern source; checked with #validate! first
# @return [CompiledPattern] built from the source, the regex, the collected
#   entity names and the literal/token/entity counts
# @raise [ArgumentError] if #validate! rejects the pattern
def compile(pattern_string)
  validate!(pattern_string)

  tokens = tokenize(pattern_string)
  entity_names = []
  literal_count = 0
  regex_parts = []
  optional_flags = []

  # regex_parts and optional_flags are kept index-aligned: every branch that
  # pushes a regex fragment also pushes its optional flag.
  tokens.each do |token|
    case token
    when EntityToken
      entity_names << token.name.to_sym
      regex_parts << build_entity_regex(token)
      optional_flags << false
    when OptionalToken
      inner = compile_inner(token.content)
      entity_names.concat(inner[:entity_names])
      regex_parts << inner[:regex]
      optional_flags << true
    when AlternationToken
      alts = token.alternatives.map { |a| Regexp.escape(a) }
      regex_parts << "(?:#{alts.join('|')})"
      literal_count += 1
      optional_flags << false
    when WildcardToken
      # NOTE(review): this adds a capture group without a matching entry in
      # entity_names — confirm that consumers of the regex expect that.
      regex_parts << "(.+)"
      optional_flags << false
    when LiteralToken
      regex_parts << Regexp.escape(token.text)
      literal_count += 1
      optional_flags << false
    end
  end

  # Index of the first required (non-optional) part, or nil if every part is
  # optional. Every optional part before it must use the trailing-separator
  # form; otherwise a pattern such as "[a] [b] c" would demand whitespace
  # before "c" even when both optional parts are absent.
  first_required = optional_flags.index(false)
  # With no required part at all, only the very first part is treated as
  # leading, which keeps the regex identical to what was generated before.
  leading_limit = first_required || 1

  regex_str = +""
  regex_parts.each_with_index do |part, i|
    if optional_flags[i]
      regex_str << (i < leading_limit ? "(?:#{part}\\s+)?" : "(?:\\s+#{part})?")
    else
      # The first required part follows only leading optionals, which already
      # carry their own separator; later required parts need an explicit one.
      regex_str << "\\s+" if i > first_required
      regex_str << part
    end
  end

  regex = Regexp.new("\\A\\s*#{regex_str}\\s*\\z", Regexp::IGNORECASE)

  CompiledPattern.new(
    source: pattern_string,
    regex: regex,
    entity_names: entity_names,
    literal_count: literal_count,
    token_count: tokens.size,
    entity_count: entity_names.size
  )
end

#validate!(pattern_string) ⇒ Object

Raises:

  • (ArgumentError)


31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/pattern_ruby/pattern_compiler.rb', line 31

# Checks that +pattern_string+ is acceptable as a pattern.
#
# The checks run in order: type, blankness, length, bracket balance, and
# entity-name sanity.
#
# @param pattern_string [String] candidate pattern
# @return [true] when every check passes
# @raise [ArgumentError] if the argument is not a String, is blank, is longer
#   than MAX_PATTERN_LENGTH characters, contains braces enclosing nothing but
#   whitespace, or contains an opening brace followed (after optional
#   whitespace) by ':'
def validate!(pattern_string)
  raise ArgumentError, "pattern must be a String, got #{pattern_string.class}" unless pattern_string.is_a?(String)

  # nil is already rejected by the String check above, so only blankness has
  # to be tested here; the message is kept as-is for callers that match on it.
  raise ArgumentError, "pattern cannot be nil or empty" if pattern_string.strip.empty?

  if pattern_string.length > MAX_PATTERN_LENGTH
    raise ArgumentError, "pattern exceeds maximum length of #{MAX_PATTERN_LENGTH} characters"
  end

  # Check for unbalanced brackets.
  # NOTE(review): check_balanced is defined elsewhere; presumably it raises
  # ArgumentError naming the bracket kind — confirm against its definition.
  check_balanced(pattern_string, "[", "]", "square brackets")
  check_balanced(pattern_string, "(", ")", "parentheses")
  check_balanced(pattern_string, "{", "}", "curly braces")

  # Check for empty entity names: "{}" or braces containing only whitespace.
  raise ArgumentError, "empty entity name {} in pattern" if pattern_string.match?(/\{\s*\}/)

  # An opening brace followed directly (or after whitespace) by ':' means the
  # entity name is missing before the colon.
  raise ArgumentError, "entity name cannot start with ':' in pattern" if pattern_string.match?(/\{\s*:/)

  true
end