Module: Lingo::Attendee::Stemmer::Porter
Constant Summary collapse
# Rule table of the Porter stemmer, keyed by step label (S100 = step 1a,
# S110/S111 = step 1b, S120 = step 1c, S200 = step 2, ... S510 = step 5b).
# Steps are visited in the order they are listed here.
#
# Every entry is either a bare jump, <tt>goto(LABEL)</tt>, or a rule of the form
#
#   [(CONDITION)] [SUFFIX] -> [REPLACEMENT] [goto(LABEL)]
#
# SUFFIX and REPLACEMENT are written in upper case and lower-cased before use.
# A REPLACEMENT that parses as an integer (e.g. -1) truncates the stem to
# that end index instead of appending text. Inside CONDITION:
#
# m::   number of vowel-consonant sequences in the stem
# *v*:: the stem contains a vowel
# *d::  the stem ends with a double consonant
# *o::  the stem ends consonant-vowel-consonant, the last not being W, X or Y
# *X::  the stem ends with the letter X (any single letter)
#
# The table and its rule lists are frozen so the shared constant cannot be
# modified by accident.
RULES = {
  # Step 1a
  S100: [
    'SSES -> SS',  # caresses -> caress
    'IES -> I',    # ponies -> poni, ties -> ti
    'SS -> SS',    # caress -> caress
    'S -> '        # cats -> cat
  ],

  # Step 1b
  S110: [
    '(m>0) EED -> EE goto(S120)',  # agreed -> agree, feed -> feed
    '(*v*) ED -> goto(S111)',      # plastered -> plaster, bled -> bled
    '(*v*) ING -> goto(S111)',     # motoring -> motor, sing -> sing
    'goto(S120)'
  ],

  # If the second or third of the rules in Step 1b is successful,
  # the following is done:
  S111: [
    'AT -> ATE',                            # conflat(ed) -> conflate
    'BL -> BLE',                            # troubl(ed) -> trouble
    'IZ -> IZE',                            # siz(ed) -> size
    '(*d and not (*L or *S or *Z)) -> -1',  # hopp(ing) -> hop
                                            # tann(ed) -> tan
                                            # fall(ing) -> fall
                                            # hiss(ing) -> hiss
                                            # fizz(ed) -> fizz
    '(m=1 and *o) -> E'                     # fail(ing) -> fail
                                            # fil(ing) -> file
  ],

  # The rule to map to a single letter causes the removal of one of
  # the double letter pair. The -E is put back on -AT, -BL and -IZ,
  # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
  # This E may be removed in step 4.

  # Step 1c
  S120: [
    '(*v*) Y -> I'  # happy -> happi, sky -> sky
  ],

  # Step 1 deals with plurals and past participles. The subsequent
  # steps are much more straightforward.

  # Step 2
  S200: [
    '(m>0) ATIONAL -> ATE',  # relational -> relate
    '(m>0) TIONAL -> TION',  # conditional -> condition, rational -> rational
    '(m>0) ENCI -> ENCE',    # valenci -> valence
    '(m>0) ANCI -> ANCE',    # hesitanci -> hesitance
    '(m>0) IZER -> IZE',     # digitizer -> digitize
    '(m>0) ABLI -> ABLE',    # conformabli -> conformable
    '(m>0) ALLI -> AL',      # radicalli -> radical
    '(m>0) ENTLI -> ENT',    # differentli -> different
    '(m>0) ELI -> E',        # vileli -> vile
    '(m>0) OUSLI -> OUS',    # analogousli -> analogous
    '(m>0) IZATION -> IZE',  # vietnamization -> vietnamize
    '(m>0) ATION -> ATE',    # predication -> predicate
    '(m>0) ATOR -> ATE',     # operator -> operate
    '(m>0) ALISM -> AL',     # feudalism -> feudal
    '(m>0) IVENESS -> IVE',  # decisiveness -> decisive
    '(m>0) FULNESS -> FUL',  # hopefulness -> hopeful
    '(m>0) OUSNESS -> OUS',  # callousness -> callous
    '(m>0) ALITI -> AL',     # formaliti -> formal
    '(m>0) IVITI -> IVE',    # sensitiviti -> sensitive
    '(m>0) BILITI -> BLE'    # sensibiliti -> sensible
  ],

  # The test for the string S1 can be made fast by doing a program
  # switch on the penultimate letter of the word being tested. This
  # gives a fairly even breakdown of the possible values of the
  # string S1. It will be seen in fact that the S1-strings in step 2
  # are presented here in the alphabetical order of their penultimate
  # letter. Similar techniques may be applied in the other steps.

  # Step 3
  S300: [
    '(m>0) ICATE -> IC',  # triplicate -> triplic
    '(m>0) ATIVE -> ',    # formative -> form
    '(m>0) ALIZE -> AL',  # formalize -> formal
    '(m>0) ICITI -> IC',  # electriciti -> electric
    '(m>0) ICAL -> IC',   # electrical -> electric
    '(m>0) FUL -> ',      # hopeful -> hope
    '(m>0) NESS -> '      # goodness -> good
  ],

  # Step 4
  S400: [
    '(m>1) AL -> ',                   # revival -> reviv
    '(m>1) ANCE -> ',                 # allowance -> allow
    '(m>1) ENCE -> ',                 # inference -> infer
    '(m>1) ER -> ',                   # airliner -> airlin
    '(m>1) IC -> ',                   # gyroscopic -> gyroscop
    '(m>1) ABLE -> ',                 # adjustable -> adjust
    '(m>1) IBLE -> ',                 # defensible -> defens
    '(m>1) ANT -> ',                  # irritant -> irrit
    '(m>1) EMENT -> ',                # replacement -> replac
    '(m>1) MENT -> ',                 # adjustment -> adjust
    '(m>1) ENT -> ',                  # dependent -> depend
    '(m>1 and (*S or *T)) ION -> ',   # adoption -> adopt
    '(m>1) OU -> ',                   # homologou -> homolog
    '(m>1) ISM -> ',                  # communism -> commun
    '(m>1) ATE -> ',                  # activate -> activ
    '(m>1) ITI -> ',                  # angulariti -> angular
    '(m>1) OUS -> ',                  # homologous -> homolog
    '(m>1) IVE -> ',                  # effective -> effect
    '(m>1) IZE -> '                   # bowdlerize -> bowdler
  ],

  # The suffixes are now removed. All that remains is a little
  # tidying up.

  # Step 5a
  S500: [
    '(m>1) E -> ',            # probate -> probat, rate -> rate
    '(m=1 and not *o) E -> '  # cease -> ceas
  ],

  # Step 5b
  S510: [
    '(m > 1 and *d and *L) -> -1'  # controll -> control, roll -> roll
  ]
}.each_value(&:freeze).freeze
# Shared sub-pattern for a "goto(LABEL)" clause with optional leading
# whitespace; its single capture group is LABEL.
goto_re = %r{\s*goto\((\S+)\)}

# Matches a rule that consists of nothing but a jump, e.g. "goto(S120)".
# Capture 1 is the target label.
GOTO_RE = %r{^#{goto_re}$}

# Matches a rewrite rule "[(CONDITION)] [SUFFIX] -> [REPLACEMENT] [goto(LABEL)]".
#
# Captures:
# 1:: the condition including its parentheses, or nil when the rule has none
# 2:: the suffix to match (may be empty)
# 3:: the replacement (may be empty)
# 4:: the goto label, or nil when the rule has no goto clause
RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}
Instance Method Summary collapse
Instance Method Details
#stem(word, found = false) ⇒ Object
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 |
# File 'lib/lingo/attendee/stemmer/porter.rb', line 283

# Stems +word+ by running it through the steps of RULES in order.
#
# Within a step, rules are tried top to bottom. The first rule whose suffix
# matches the end of +word+ pins that suffix for the rest of the step: rules
# with a different suffix are no longer considered, even when the matching
# rule's condition fails. This is Porter's "only the rule with the longest
# matching suffix is obeyed"; without it "feed" would fail the EED rule and
# then be cut down to "fe" by the ED rule. Rules sharing the pinned suffix
# (the two E rules of step 5a, the suffix-less rules of steps 1b/5b) are
# still tried in turn.
#
# A goto attached to an applied rule, or a bare goto rule, skips all
# following steps up to the one it names.
#
# Suffixes are lower-cased and compared case-sensitively, so an upper-case
# +word+ matches no suffix rule.
#
# @param word [String] the word to stem; it is not modified
# @param found [Boolean] initial value of the "a rule was applied" flag;
#   pass a true value to get +word+ back even when no rule applies
# @return [String, nil] the stemmed word, or +nil+ when no rule was applied
#   and +found+ was false
def stem(word, found = false)
  # conv rewrites string +s+ in place, one gsub per pattern/replacement pair.
  goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }

  RULES.each { |key, rules|
    # A pending goto skips every step until the one it names.
    next if goto && goto != key.to_s

    # goto is only set again by an applied rule or a bare goto rule;
    # matched remembers the suffix pinned for this step.
    goto = matched = nil

    rules.each { |rule|
      case rule
        when RULE_RE
          cond, suffix, repl, target = $1, Unicode.downcase($2), $3, $4

          # Another (longer or equal) suffix already matched in this step.
          next if matched && matched != suffix

          stem = word[/(.+)#{suffix}$/, 1] or next
          matched = suffix
        when GOTO_RE
          goto = $1
          break
      end

      # Consonant/vowel shadow of the stem: y counts as a vowel only
      # directly after a consonant.
      conv[shad = stem.dup,
        /[^aeiouy]/ => 'c',
        /[aeiou]/   => 'v',
        /cy/        => 'cv',
        /y/         => 'c'
      ]

      if cond
        # Translate the rule condition into a Ruby boolean expression.
        # +last+ is only assigned when the stem ends with a consonant.
        conv[cond,
          'm'   => shad.scan(/vc/).size,
          '*v*' => shad.include?('v'),
          '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
          '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
          'and' => '&&',
          'or'  => '||',
          'not' => '!',
          '='   => '=='
        ]

        # Remaining *X tests compare the final consonant with letter X.
        last.upcase! if last
        cond.gsub!(/\*(\w)/) { last == $1 }

        # NOTE: eval only ever sees text derived from the RULES constant and
        # the substitutions above, never any part of +word+.
        next unless eval(cond)
      end

      found, goto = true, target

      # An integer replacement (e.g. -1) truncates the stem to that end
      # index; anything else is appended in lower case.
      word = begin
        stem[0...Integer(repl)]
      rescue ArgumentError
        stem << Unicode.downcase(repl)
      end

      break
    }
  }

  word if found
end