Class: LexM::Lemma

Inherits:
Object
  • Object
show all
Defined in:
lib/lexm/lemma.rb

Overview

Represents a lemma, the main entry in a lexicon

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil, source_file = nil, source_line = nil, source_column = nil) ⇒ Lemma

Initialize from either a string or direct components

Parameters:

  • input (String, nil) (defaults to: nil)

    input string in LexM format to parse

  • source_file (String, nil) (defaults to: nil)

    source file path

  • source_line (Integer, nil) (defaults to: nil)

    source line number

  • source_column (Integer, nil) (defaults to: nil)

    source column number



23
24
25
26
27
28
29
30
31
32
33
# File 'lib/lexm/lemma.rb', line 23

# Build a lemma, optionally populating it from LexM text.
#
# @param input [String, nil] LexM-formatted text; it is parsed only when it is a String
# @param source_file [String, nil] path of the file the lemma was read from
# @param source_line [Integer, nil] line number inside that file
# @param source_column [Integer, nil] column number inside that line
def initialize(input = nil, source_file = nil, source_line = nil, source_column = nil)
    # Start from an empty lemma: no text, no annotations, no sublemmas, no redirect.
    @text = nil
    @annotations, @sublemmas = {}, []
    @redirect = nil

    # Remember where the entry came from.
    @source_file, @source_line, @source_column = source_file, source_line, source_column

    # Anything that is not a String (including nil) leaves the lemma empty.
    return unless input.is_a?(String)

    parse(input)
end

Instance Attribute Details

#annotations ⇒ Object

Returns the value of attribute annotations.



14
15
16
# File 'lib/lexm/lemma.rb', line 14

# Reader for @annotations, the table of annotations attached to this lemma.
def annotations
  @annotations
end

#redirect ⇒ Object

Returns the value of attribute redirect.



14
15
16
# File 'lib/lexm/lemma.rb', line 14

# Reader for @redirect; nil when the lemma has no redirect.
def redirect
  @redirect
end

#source_column ⇒ Object

Source location information



16
17
18
# File 'lib/lexm/lemma.rb', line 16

# Reader for @source_column, the source column number given to the constructor.
def source_column
  @source_column
end

#source_file ⇒ Object

Source location information



16
17
18
# File 'lib/lexm/lemma.rb', line 16

# Reader for @source_file, the source file path given to the constructor.
def source_file
  @source_file
end

#source_line ⇒ Object

Source location information



16
17
18
# File 'lib/lexm/lemma.rb', line 16

# Reader for @source_line, the source line number given to the constructor.
def source_line
  @source_line
end

#sublemmas ⇒ Object

Returns the value of attribute sublemmas.



14
15
16
# File 'lib/lexm/lemma.rb', line 14

# Reader for @sublemmas, the list of sublemmas belonging to this lemma.
def sublemmas
  @sublemmas
end

#text ⇒ Object

Returns the value of attribute text.



14
15
16
# File 'lib/lexm/lemma.rb', line 14

# Reader for @text, the main lemma text; nil when no text has been set.
def text
  @text
end

Instance Method Details

#addRedirect(target, types = []) ⇒ Lemma

Add a pure redirect sublemma

Parameters:

  • target (String)

    target to redirect to

  • types (Array<String>) (defaults to: [])

    relation types

Returns:



257
258
259
260
261
262
263
264
# File 'lib/lexm/lemma.rb', line 257

# Append a text-less sublemma that only points at +target+.
#
# @param target [String] entry the new sublemma redirects to
# @param types [Array<String>] relation types attached to the redirect
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when this lemma is itself a redirection lemma
def addRedirect(target, types = [])
    raise "Cannot add sublemmas to a redirection lemma" if redirected?

    pointer = LemmaRedirect.new(target, types)
    @sublemmas.push(Sublemma.new(nil, pointer, self))
    self
end

#addSublemma(text) ⇒ Lemma

Add a standard sublemma

Parameters:

  • text (String)

    text of the sublemma

Returns:



232
233
234
235
236
237
238
# File 'lib/lexm/lemma.rb', line 232

# Append one ordinary sublemma (text only, no redirect).
#
# @param text [String] text of the sublemma
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when this lemma is a redirection lemma
def addSublemma(text)
    raise "Cannot add sublemmas to a redirection lemma" if redirected?

    @sublemmas.push(Sublemma.new(text, nil, self))
    self
end

#addSublemmas(texts) ⇒ Lemma

Add multiple sublemmas at once

Parameters:

  • texts (Array<String>)

    array of sublemma texts

Returns:



243
244
245
246
247
248
249
250
251
# File 'lib/lexm/lemma.rb', line 243

# Append several ordinary sublemmas, in the order given.
#
# @param texts [Array<String>] sublemma texts
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when this lemma is a redirection lemma
def addSublemmas(texts)
    raise "Cannot add sublemmas to a redirection lemma" if redirected?

    texts.each { |entry| @sublemmas.push(Sublemma.new(entry, nil, self)) }
    self
end

#clear ⇒ Lemma

Clear all annotations and sublemmas but keep the main lemma

Returns:



363
364
365
366
367
368
# File 'lib/lexm/lemma.rb', line 363

# Drop annotations, sublemmas and redirect while leaving @text untouched.
#
# @return [Lemma] self, so calls can be chained
def clear
    @annotations, @sublemmas, @redirect = {}, [], nil
    self
end

#clearAll ⇒ Lemma

Clear everything including the main lemma

Returns:



372
373
374
375
376
377
378
# File 'lib/lexm/lemma.rb', line 372

# Reset the lemma completely: text, annotations, sublemmas and redirect.
#
# @return [Lemma] self, so calls can be chained
def clearAll
    @text = nil
    @annotations, @sublemmas = {}, []
    @redirect = nil
    self
end

#clearAnnotations ⇒ Lemma

Clear all annotations

Returns:



342
343
344
345
# File 'lib/lexm/lemma.rb', line 342

# Replace the annotation table with a fresh empty one.
#
# @return [Lemma] self, so calls can be chained
def clearAnnotations
    tap { @annotations = {} }
end

#clearRedirect ⇒ Lemma

Clear redirect

Returns:



356
357
358
359
# File 'lib/lexm/lemma.rb', line 356

# Remove the lemma-level redirect, if any.
#
# @return [Lemma] self, so calls can be chained
def clearRedirect
    tap { @redirect = nil }
end

#clearSublemmas ⇒ Lemma

Clear all sublemmas

Returns:



349
350
351
352
# File 'lib/lexm/lemma.rb', line 349

# Replace the sublemma list with a fresh empty one.
#
# @return [Lemma] self, so calls can be chained
def clearSublemmas
    tap { @sublemmas = [] }
end

#parse(input) ⇒ Lemma

Parse a lemma string

Parameters:

  • input (String)

    lemma string in LexM format

Returns:



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/lexm/lemma.rb', line 38

# Parse a LexM lemma string into this object.
#
# Input containing ">>" is handed to parseRedirectionLemma as a whole.
# Otherwise the text is cut at the first "|" into a lemma part and an
# optional sublemmas part, which are parsed separately.
#
# @param input [String] lemma string in LexM format
# @return [Lemma] self
# @raise [RuntimeError] on nil/blank input, unequal '[' and ']' counts,
#   or input that begins with '|'
def parse(input)
    raise "Empty lemma input!" if input.nil? || input.strip.empty?

    unless input.count('[') == input.count(']')
        raise "Malformed input: mismatched brackets in '#{input}'"
    end

    # A leading pipe means there is no lemma in front of the sublemmas.
    if input.start_with?("|")
        raise "Malformed input: lemma starts with pipe character in '#{input}'"
    end

    if input.include?(">>")
        parseRedirectionLemma(input)
    else
        head, tail = input.split('|', 2)
        parseLemma(head)
        # tail is nil when there was no pipe at all; "word|" yields "".
        parseSublemmas(tail) if tail
    end

    self
end

#parseAnnotations(annotationsText) ⇒ void

This method returns an undefined value.

Parse annotations like sp:past,pp:participle or pl:oxen

Parameters:

  • annotationsText (String)

    annotations string



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/lexm/lemma.rb', line 194

# Parse the inside of an annotation block, e.g. "sp:past,pp:participle" or
# "pl:oxen", and store the result in @annotations.
#
# Each comma-separated item is either "type:value" (stored as a stripped
# String value) or a bare name (stored with the value true).
#
# @param annotationsText [String] annotation list without the surrounding brackets
# @return [void]
# @raise [RuntimeError] when the block is blank, when any item is blank
#   (including one after a trailing comma), or when a "type:value" item has
#   an empty type or an empty value
def parseAnnotations(annotationsText)
    raise "Empty annotations block" if annotationsText.strip.empty?

    # A limit of -1 keeps trailing empty fields, so "a," is rejected exactly
    # like "a,,b" and ",a" instead of being silently accepted.
    annotationsText.split(',', -1).each do |annotation|
        entry = annotation.strip
        raise "Empty annotation in comma-separated list" if entry.empty?

        # Only the first colon separates type from value; later colons
        # belong to the value.
        type, value = entry.split(':', 2).map(&:strip)

        if value.nil?
            # No colon at all: a simple flag annotation.
            @annotations[entry] = true
        else
            raise "Empty annotation type in '#{annotation}'" if type.empty?
            raise "Empty annotation value for type '#{type}'" if value.empty?

            @annotations[type] = value
        end
    end
end

#parseLemma(lemmaPart) ⇒ void

This method returns an undefined value.

Parse just the lemma part (before any pipe)

Parameters:

  • lemmaPart (String)

    lemma part string



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/lexm/lemma.rb', line 92

# Parse the lemma part of an entry (everything before the first pipe):
# either plain text, or text followed by one bracketed annotation block.
#
# Surrounding whitespace (including a trailing newline) is ignored, so
# "ox[pl:oxen] " is parsed the same as "ox[pl:oxen]" rather than being
# reported as a missing closing bracket.
#
# @param lemmaPart [String] lemma part string
# @return [void]
# @raise [RuntimeError] when the text is blank, when an opened annotation
#   block is not closed by a final ']', or when no lemma text precedes '['
def parseLemma(lemmaPart)
    cleaned = lemmaPart.strip

    if cleaned.include?('[')
        baseLemma, annotationsPart = cleaned.split('[', 2)

        # The annotation block must be the last thing in the lemma part.
        raise "Malformed annotation: missing closing ']' in '#{lemmaPart}'" unless annotationsPart.end_with?(']')

        # There has to be actual lemma text in front of the block.
        raise "Missing lemma text before annotations in '#{lemmaPart}'" if baseLemma.strip.empty?

        @text = baseLemma.strip

        # Drop exactly the final ']' (its presence was checked above).
        parseAnnotations(annotationsPart.chomp(']'))
    else
        raise "Empty lemma text in '#{lemmaPart}'" if cleaned.empty?

        @text = cleaned
    end
end

#parseRedirectionLemma(input) ⇒ void

This method returns an undefined value.

Parse a redirection lemma (with >> syntax)

Parameters:

  • input (String)

    redirection lemma string



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/lexm/lemma.rb', line 68

# Parse a redirection lemma of the form "word>>target" or
# "word>>(relation,...)target", setting @text and @redirect.
#
# @param input [String] redirection lemma string
# @return [void]
# @raise [RuntimeError] when nothing but whitespace follows ">>" or the
#   input matches neither accepted form
def parseRedirectionLemma(input)
    usage = "Malformed redirection syntax in '#{input}'. Should be 'word>>target' or 'word>>(relation)target'"

    # A target is mandatory after ">>".
    raise usage if input.match(/>>[\s]*$/)

    if (typed = input.match(/(.+?)>>\((.+?)\)(.+)/))
        # word>>(relation,...)target
        @text = typed[1].strip
        @redirect = LemmaRedirect.new(typed[3].strip, typed[2].split(',').map(&:strip))
    elsif (plain = input.match(/(.+?)>>(.+)/))
        # word>>target
        @text = plain[1].strip
        target = plain[2].strip
        raise "Malformed redirection syntax in '#{input}'. Missing target after '>>'" if target.empty?

        @redirect = LemmaRedirect.new(target)
    else
        raise usage
    end
end

#parseSublemmas(sublemmasPart) ⇒ void

This method returns an undefined value.

Parse sublemmas part (after the pipe)

Parameters:

  • sublemmasPart (String)

    sublemmas part string



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/lexm/lemma.rb', line 121

# Parse the sublemmas part of an entry (everything after the first pipe)
# and append one Sublemma per recognised item to @sublemmas.
#
# Items are separated by smart_split_sublemmas, so commas inside a relation
# list such as ">(sp,pp)wring" do not split the item. Recognised shapes:
#   ">target", ">(rel,...)target"          - pure redirect, no text
#   "word>target", "word>(rel,...)target"  - text plus redirect
#   "word"                                 - plain text
#
# @param sublemmasPart [String] sublemmas part string
# @return [void]
def parseSublemmas(sublemmasPart)
    smart_split_sublemmas(sublemmasPart).each do |raw|
        entry = raw.strip

        if entry.start_with?('>')
            pointer =
                if (typed = entry.match(/>\((.+?)\)(.+)/))
                    LemmaRedirect.new(typed[2].strip, typed[1].split(',').map(&:strip))
                elsif (plain = entry.match(/>(.+)/))
                    LemmaRedirect.new(plain[1].strip)
                end
            # An item with nothing after ">" matches neither pattern and is skipped.
            @sublemmas << Sublemma.new(nil, pointer, self) if pointer
        elsif !entry.include?('>')
            # No redirect marker at all: plain sublemma.
            @sublemmas << Sublemma.new(entry, nil, self)
        elsif (typed = entry.match(/(.+?)>\((.+?)\)(.+)/))
            # word>(relation,...)target
            label = typed[1].strip
            pointer = LemmaRedirect.new(typed[3].strip, typed[2].split(',').map(&:strip))
            @sublemmas << Sublemma.new(label, pointer, self)
        elsif (plain = entry.match(/(.+?)>(.+)/))
            # word>target
            label = plain[1].strip
            pointer = LemmaRedirect.new(plain[2].strip)
            @sublemmas << Sublemma.new(label, pointer, self)
        else
            # Contains ">" but nothing follows it: kept verbatim as text.
            @sublemmas << Sublemma.new(entry, nil, self)
        end
    end
end

#redirected? ⇒ Boolean

Is this a redirection lemma (no sublemmas, just a redirect)?

Returns:

  • (Boolean)

    true if this is a redirection lemma



382
383
384
# File 'lib/lexm/lemma.rb', line 382

# Whether this is a redirection lemma: a redirect is set and there are no
# sublemmas.
#
# @return [Boolean] true if this is a redirection lemma
def redirected?
    return false if @redirect.nil?

    @sublemmas.empty?
end

#setAnnotation(type, value = true) ⇒ Lemma

Set an annotation

Parameters:

  • type (String)

    annotation type

  • value (Object) (defaults to: true)

    annotation value

Returns:



318
319
320
321
322
323
324
325
# File 'lib/lexm/lemma.rb', line 318

# Store one annotation after validating it with validateAnnotation.
#
# @param type [String] annotation type (used as the key)
# @param value [Object] annotation value; defaults to true for flag annotations
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when this lemma is a redirection lemma, or when
#   validateAnnotation rejects the pair
def setAnnotation(type, value = true)
    raise "Cannot add annotations to a redirection lemma" if redirected?

    validateAnnotation(type, value)
    @annotations.store(type, value)
    self
end

#setAnnotations(annotations) ⇒ Lemma

Add multiple annotations at once

Parameters:

  • annotations (Hash)

    hash of annotation type => value pairs

Returns:



330
331
332
333
334
335
336
337
338
# File 'lib/lexm/lemma.rb', line 330

# Store several annotations at once.
#
# Every pair is checked with validateAnnotation, the same check
# setAnnotation applies, and all pairs are checked before any is stored so
# that a rejected pair cannot leave the lemma partially updated.
#
# @param annotations [Hash] annotation type => value pairs
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when this lemma is a redirection lemma, or when
#   validateAnnotation rejects any pair
def setAnnotations(annotations)
    raise "Cannot add annotations to a redirection lemma" if redirected?

    annotations.each { |key, value| validateAnnotation(key, value) }
    annotations.each { |key, value| @annotations[key] = value }
    self
end

#setRedirect(target, types = []) ⇒ Lemma

Set the lemma’s redirection

Parameters:

  • target (String)

    target to redirect to

  • types (Array<String>) (defaults to: [])

    relation types

Returns:



270
271
272
273
274
275
276
# File 'lib/lexm/lemma.rb', line 270

# Turn this lemma into a pointer to +target+ by setting @redirect.
#
# @param target [String] entry to redirect to
# @param types [Array<String>] relation types attached to the redirect
# @return [Lemma] self, so calls can be chained
# @raise [RuntimeError] when the lemma already has sublemmas
def setRedirect(target, types = [])
    raise "Cannot set redirect on a lemma with sublemmas" unless @sublemmas.empty?

    @redirect = LemmaRedirect.new(target, types)
    self
end

#shortcuts(placeholder = "~") ⇒ Hash<String, String>

Returns a hash mapping each sublemma to its shortcut

Parameters:

  • placeholder (String) (defaults to: "~")

    optional placeholder to use instead of “~” (default: “~”)

Returns:

  • (Hash<String, String>)

    hash mapping full sublemma text to shortcut



281
282
283
284
285
286
287
288
289
290
291
# File 'lib/lexm/lemma.rb', line 281

# Map the text of every text-bearing, non-redirected sublemma to the value
# of its #shortcut for the given placeholder.
#
# @param placeholder [String] placeholder passed on to Sublemma#shortcut (default "~")
# @return [Hash<String, String>] sublemma text => shortcut; empty when the
#   lemma has no text, is a redirection lemma, or has no sublemmas
def shortcuts(placeholder = "~")
    return {} if @text.nil? || redirected? || @sublemmas.empty?

    @sublemmas.each_with_object({}) do |entry, table|
        # Redirect-only and text-less sublemmas have nothing to abbreviate.
        next if entry.redirected? || entry.text.nil?

        table[entry.text] = entry.shortcut(placeholder)
    end
end

#smart_split_sublemmas(text) ⇒ Array<String>

Helper method to split sublemmas while respecting parentheses. This ensures we don’t split inside relation type lists like (sp,pp).

Parameters:

  • text (String)

    text to split at commas outside of parentheses

Returns:

  • (Array<String>)

    resulting substrings



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/lexm/lemma.rb', line 166

# Split +text+ at every comma that is not enclosed in parentheses, so that
# relation lists such as "(sp,pp)" stay in one piece. Empty pieces are
# dropped; pieces are not stripped.
#
# @param text [String] text to split at commas outside of parentheses
# @return [Array<String>] resulting non-empty substrings
def smart_split_sublemmas(text)
    depth = 0
    pieces = [[]]

    text.each_char do |ch|
        if ch == ',' && depth.zero?
            # Top-level comma: start collecting the next piece.
            pieces << []
            next
        end

        pieces.last << ch
        case ch
        when '(' then depth += 1
        when ')' then depth -= 1 if depth > 0 # a stray ')' never drives the depth negative
        end
    end

    pieces.map(&:join).reject(&:empty?)
end

#to_s ⇒ String

Convert to string format

Returns:

  • (String)

    the string representation of this lemma



388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# File 'lib/lexm/lemma.rb', line 388

# Render the lemma in LexM string format.
#
# A redirection lemma is rendered as "text>>..." using @redirect's own
# string form with its first ">" removed. Any other lemma is rendered as
# text, an optional "[type:value,flag]" annotation block, and an optional
# "|"-prefixed, comma-separated list of sublemmas.
#
# @return [String] the string representation; "" when a non-redirection
#   lemma has no text
def to_s
    return "#{@text}>>#{@redirect.to_s.sub('>', '')}" if redirected?
    return "" if @text.nil?

    parts = [@text]

    unless @annotations.empty?
        # Flag annotations (value true) are written as the bare type.
        listed = @annotations.map { |type, value| value == true ? type : "#{type}:#{value}" }
        parts << "[#{listed.join(',')}]"
    end

    parts << "|" << @sublemmas.map(&:to_s).join(',') unless @sublemmas.empty?

    parts.join
end

#validateAnnotation(key, value) ⇒ Boolean

Validate annotation key and value format. Ensures keys and values follow the expected format.

Parameters:

  • key (String)

    annotation key to validate

  • value (String, Boolean)

    annotation value to validate

Returns:

  • (Boolean)

    true if validation passes

Raises:

  • (StandardError)

    with detailed message if validation fails



299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/lexm/lemma.rb', line 299

# Check that an annotation key/value pair is well-formed.
#
# The key must consist entirely of ASCII letters, digits and underscores.
# A String value must not contain '[' or ']', the characters that delimit
# an annotation block. Non-String values are not inspected.
#
# @param key [String] annotation key to validate
# @param value [String, Boolean] annotation value to validate
# @return [Boolean] true if validation passes
# @raise [StandardError] (a RuntimeError) with a detailed message if validation fails
def validateAnnotation(key, value)
    # \A and \z anchor the whole string. ^ and $ only anchor a line, which
    # would accept a multi-line key as long as one of its lines is valid.
    unless key =~ /\A[a-zA-Z0-9_]+\z/
        raise "Invalid annotation key: '#{key}' (must contain only letters, numbers, and underscores)"
    end

    if value.is_a?(String) && (value.include?(']') || value.include?('['))
        raise "Invalid annotation value for '#{key}': cannot contain square brackets"
    end

    true
end