Module: Lingo::Attendee::Stemmer::Porter

Extended by:
Porter
Included in:
Porter
Defined in:
lib/lingo/attendee/stemmer/porter.rb

Constant Summary collapse

# Rule table for the Porter stemmer, keyed by step (S100, S110, ...).
# The steps are visited in the order they are listed here; each value is
# an ordered list of rule strings that #stem parses with RULE_RE/GOTO_RE.
#
# A rule string has the form
#
#   (CONDITION) SUFFIX -> REPLACEMENT goto(KEY)
#
# where the condition, the suffix, the replacement and the goto are each
# optional, or it is just <tt>goto(KEY)</tt>. As interpreted by #stem:
#
# SUFFIX::      Written in upper case here, downcased before being
#               matched against the end of the word; what precedes it
#               is the "stem" the condition is evaluated on.
# REPLACEMENT:: Downcased and appended to the stem; if it parses as an
#               integer (e.g. -1) it is instead used as the exclusive
#               end index of the stem, i.e. -1 drops the last letter.
# goto(KEY)::   Continue with the rule list stored under KEY, skipping
#               the lists in between.
# CONDITION::   Boolean expression over +and+, +or+, +not+, <tt>=</tt>,
#               <tt>></tt> and the terms
#               m (number of vowel-consonant sequences in the stem),
#               *v* (stem contains a vowel),
#               *d (stem ends in a double consonant),
#               *o (stem ends consonant-vowel-consonant, the last
#               consonant not being w, x or y),
#               *X (stem ends in the consonant X).
#
# The trailing comments give example words for each rule.
RULES =
{
  # Step 1a

  S100: [
    'SSES -> SS',  # caresses -> caress

    'IES  -> I',   # ponies   -> poni, ties -> ti

    'SS   -> SS',  # caress   -> caress

    'S    -> '     # cats     -> cat

  ],

  # Step 1b

  S110: [
    '(m>0) EED -> EE goto(S120)',  # agreed    ->  agree,   feed -> feed

    '(*v*) ED  ->    goto(S111)',  # plastered ->  plaster, bled -> bled

    '(*v*) ING ->    goto(S111)',  # motoring  ->  motor,   sing -> sing

    'goto(S120)'
  ],

  # If the second or third of the rules in Step 1b is successful,

  # the following is done:

  S111: [
    'AT -> ATE',                            # conflat(ed) -> conflate

    'BL -> BLE',                            # troubl(ed)  -> trouble

    'IZ -> IZE',                            # siz(ed)     -> size

    '(*d and not (*L or *S or *Z)) -> -1',  # hopp(ing)   -> hop

                                            # tann(ed)    -> tan

                                            # fall(ing)   -> fall

                                            # hiss(ing)   -> hiss

                                            # fizz(ed)    -> fizz

    '(m=1 and *o) -> E'                     # fail(ing)   -> fail

                                            # fil(ing)    -> file

  ],

  # The rule to map to a single letter causes the removal of one of

  # the double letter pair. The -E is put back on -AT, -BL and -IZ,

  # so that the suffixes -ATE, -BLE and -IZE can be recognised later.

  # This E may be removed in step 4.


  # Step 1c

  S120: [
    '(*v*) Y -> I'  # happy -> happi, sky -> sky

  ],

  # Step 1 deals with plurals and past participles. The subsequent

  # steps are much more straightforward.


  # Step 2

  S200: [
    '(m>0) ATIONAL -> ATE',   # relational     -> relate

    '(m>0) TIONAL  -> TION',  # conditional    -> condition, rational -> rational

    '(m>0) ENCI    -> ENCE',  # valenci        -> valence

    '(m>0) ANCI    -> ANCE',  # hesitanci      -> hesitance

    '(m>0) IZER    -> IZE',   # digitizer      -> digitize

    '(m>0) ABLI    -> ABLE',  # conformabli    -> conformable

    '(m>0) ALLI    -> AL',    # radicalli      -> radical

    '(m>0) ENTLI   -> ENT',   # differentli    -> different

    '(m>0) ELI     -> E',     # vileli         -> vile

    '(m>0) OUSLI   -> OUS',   # analogousli    -> analogous

    '(m>0) IZATION -> IZE',   # vietnamization -> vietnamize

    '(m>0) ATION   -> ATE',   # predication    -> predicate

    '(m>0) ATOR    -> ATE',   # operator       -> operate

    '(m>0) ALISM   -> AL',    # feudalism      -> feudal

    '(m>0) IVENESS -> IVE',   # decisiveness   -> decisive

    '(m>0) FULNESS -> FUL',   # hopefulness    -> hopeful

    '(m>0) OUSNESS -> OUS',   # callousness    -> callous

    '(m>0) ALITI   -> AL',    # formaliti      -> formal

    '(m>0) IVITI   -> IVE',   # sensitiviti    -> sensitive

    '(m>0) BILITI  -> BLE'    # sensibiliti    -> sensible

  ],

  # The test for the string S1 can be made fast by doing a program

  # switch on the penultimate letter of the word being tested. This

  # gives a fairly even breakdown of the possible values of the

  # string S1. It will be seen in fact that the S1-strings in step 2

  # are presented here in the alphabetical order of their penultimate

  # letter. Similar techniques may be applied in the other steps.


  # Step 3

  S300: [
    '(m>0) ICATE -> IC',  # triplicate  -> triplic

    '(m>0) ATIVE -> ',    # formative   -> form

    '(m>0) ALIZE -> AL',  # formalize   -> formal

    '(m>0) ICITI -> IC',  # electriciti -> electric

    '(m>0) ICAL  -> IC',  # electrical  -> electric

    '(m>0) FUL   -> ',    # hopeful     -> hope

    '(m>0) NESS  -> '     # goodness    -> good

  ],

  # Step 4

  S400: [
    '(m>1) AL    -> ',               # revival     -> reviv

    '(m>1) ANCE  -> ',               # allowance   -> allow

    '(m>1) ENCE  -> ',               # inference   -> infer

    '(m>1) ER    -> ',               # airliner    -> airlin

    '(m>1) IC    -> ',               # gyroscopic  -> gyroscop

    '(m>1) ABLE  -> ',               # adjustable  -> adjust

    '(m>1) IBLE  -> ',               # defensible  -> defens

    '(m>1) ANT   -> ',               # irritant    -> irrit

    '(m>1) EMENT -> ',               # replacement -> replac

    '(m>1) MENT  -> ',               # adjustment  -> adjust

    '(m>1) ENT   -> ',               # dependent   -> depend

    '(m>1 and (*S or *T)) ION -> ',  # adoption    -> adopt

    '(m>1) OU    -> ',               # homologou   -> homolog

    '(m>1) ISM   -> ',               # communism   -> commun

    '(m>1) ATE   -> ',               # activate    -> activ

    '(m>1) ITI   -> ',               # angulariti  -> angular

    '(m>1) OUS   -> ',               # homologous  -> homolog

    '(m>1) IVE   -> ',               # effective   -> effect

    '(m>1) IZE   -> '                # bowdlerize  -> bowdler

  ],

  # The suffixes are now removed. All that remains is a little

  # tidying up.


  # Step 5a

  S500: [
    '(m>1) E -> ',            # probate -> probat, rate -> rate

    '(m=1 and not *o) E -> '  # cease   -> ceas

  ],

  # Step 5b

  S510: [
    '(m > 1 and *d and *L) -> -1'  # controll -> control, roll -> roll

  ]
}
# Matches a rule string that consists solely of a jump, e.g. 'goto(S120)'.
# Capture 1 is the target key. The inner pattern is also bound to the
# local +goto_re+ so that RULE_RE below can embed it.
GOTO_RE =
%r{^#{goto_re = %r{\s*goto\((\S+)\)}}$}
# Matches a rewrite rule such as '(m>0) EED -> EE goto(S120)'. Captures:
# 1:: the parenthesised condition, including the parentheses (optional)
# 2:: the suffix to match (may be empty)
# 3:: the replacement (may be empty)
# 4:: the goto target key (optional)
RULE_RE =
%r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}

Instance Method Summary collapse

Instance Method Details

#stem(word, found = false) ⇒ Object



283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# File 'lib/lingo/attendee/stemmer/porter.rb', line 283

# Stems +word+ by running it through the rule lists in RULES, in the
# order they are defined there.
#
# Within one rule list the rules are tried top to bottom and at most one
# is applied. A rule applies when its suffix matches the end of the word
# (leaving at least one character as the stem) and its condition, if
# any, holds for that stem. Once a rule's suffix has matched but its
# condition has failed, only further rules with that same suffix are
# still tried in this list: a shorter suffix must not take over, so that
# e.g. "feed" (EED matches, m>0 fails) is not cut down to "fe" by the ED
# rule. A goto -- attached to the applied rule or given as a rule of its
# own -- makes processing continue with the named rule list.
#
# @param word [String] the word to stem; it is matched against
#   lower-cased suffixes and lower-case vowels and is not modified
# @param found [Boolean] initial value of the "a rule was applied" flag;
#   when truthy, the word is returned even if no rule applied
# @return [String, nil] the stemmed word, or +nil+ if no rule applied
#   and +found+ was not set
# @raise [ArgumentError] if a rule string matches neither RULE_RE nor
#   GOTO_RE
def stem(word, found = false)
  # Applies every pattern => value pair of +map+ to +str+ in place, in order.
  subst = lambda { |str, map| map.each { |pat, val| str.gsub!(pat, val.to_s) } }
  goto  = nil

  RULES.each { |key, rules|
    # A pending goto skips every rule list until the one it names.
    next if goto && goto != key.to_s

    # +blocked+ holds the suffix of a rule that matched the word but
    # whose condition failed (see above).
    goto = blocked = nil

    rules.each { |rule|
      case rule
        when RULE_RE
          cond, suffix, repl, target = $1, $2, $3, $4
        when GOTO_RE
          goto = $1
          break
        else
          raise ArgumentError, "invalid stemmer rule: #{rule.inspect}"
      end

      next if blocked && blocked != suffix

      stem = word[/(.+)#{Unicode.downcase(suffix)}$/, 1] or next

      if cond
        # Shadow of the stem: 'c' for consonants, 'v' for vowels. A 'y'
        # counts as a vowel after a consonant and as a consonant otherwise.
        shad = stem.dup

        subst[shad, {
          /[^aeiouy]/ => 'c',
          /[aeiou]/   => 'v',
          /cy/        => 'cv',
          /y/         => 'c'
        }]

        # Last letter of the stem, but only if it is a consonant.
        last = stem[-1] if shad.end_with?('c')

        # Turn the condition into a Ruby expression. The order matters:
        # 'm' and the '*' terms must be replaced before the operators.
        subst[cond, {
          'm'   => shad.scan(/vc/).size,
          '*v*' => shad.include?('v'),
          '*d'  => !last.nil? && last == stem[-2],
          '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
          'and' => '&&',
          'or'  => '||',
          'not' => '!',
          '='   => '=='
        }]

        # Remaining '*X' terms test for a specific final consonant.
        last = last.upcase if last
        cond.gsub!(/\*(\w)/) { (last == $1).to_s }

        # NOTE: the evaluated string is built solely from the condition
        # text in RULES plus the numbers/booleans substituted above; no
        # part of +word+ ends up in it.
        unless eval(cond)
          blocked = suffix
          next
        end
      end

      found, goto = true, target

      # An integer replacement (e.g. -1) is the exclusive end index of
      # the stem; anything else is appended to it.
      word = begin
        stem[0...Integer(repl)]
      rescue ArgumentError
        stem << Unicode.downcase(repl)
      end

      break
    }
  }

  word if found
end