Module: Lingo::Attendee::Stemmer::Porter

Extended by:: Porter

Included in:: Porter

Defined in:: lib/lingo/attendee/stemmer/porter.rb

Constant Summary collapse

# Porter stemming rule table, keyed by step label; steps are listed in the
# order in which they are meant to run.
#
# Rule notation:
#
#   [(CONDITION)] [SUFFIX] -> [REPLACEMENT] [goto(STEP)]
#   goto(STEP)
#
# A numeric REPLACEMENT such as -1 is an end index into the stem (dropping
# the last letter) rather than text to append. Condition vocabulary used
# below: m (the measure of the stem), *v* (stem contains a vowel), *d (stem
# ends in a double consonant), *o (stem ends consonant-vowel-consonant with
# the last letter not W, X or Y), *X (stem ends in the letter X), combined
# with and / or / not.
RULES = {
  # Step 1a
  S100: [
    'SSES -> SS',  # caresses -> caress
    'IES  -> I',   # ponies   -> poni, ties -> ti
    'SS   -> SS',  # caress   -> caress
    'S    -> '     # cats     -> cat
  ],

  # Step 1b
  S110: [
    '(m>0) EED -> EE goto(S120)',  # agreed    -> agree,   feed -> feed
    '(*v*) ED  ->    goto(S111)',  # plastered -> plaster, bled -> bled
    '(*v*) ING ->    goto(S111)',  # motoring  -> motor,   sing -> sing
    'goto(S120)'
  ],

  # Follow-up to Step 1b, entered only when its second or third rule
  # succeeded. The double-letter rule removes one letter of the pair;
  # -E is restored on -AT, -BL and -IZ so that -ATE, -BLE and -IZE can
  # be recognised later. That E may be removed again in step 4.
  S111: [
    'AT -> ATE',                            # conflat(ed) -> conflate
    'BL -> BLE',                            # troubl(ed)  -> trouble
    'IZ -> IZE',                            # siz(ed)     -> size
    '(*d and not (*L or *S or *Z)) -> -1',  # hopp(ing)   -> hop,  tann(ed) -> tan,
                                            # fall(ing)   -> fall, hiss(ing) -> hiss,
                                            # fizz(ed)    -> fizz
    '(m=1 and *o) -> E'                     # fail(ing)   -> fail, fil(ing) -> file
  ],

  # Step 1c
  #
  # This concludes step 1, which deals with plurals and past participles;
  # the subsequent steps are much more straightforward.
  S120: [
    '(*v*) Y -> I'  # happy -> happi, sky -> sky
  ],

  # Step 2
  #
  # The suffixes are listed in the alphabetical order of their penultimate
  # letter: testing for them can be made fast by switching on the
  # penultimate letter of the word, which splits the candidates fairly
  # evenly. Similar techniques may be applied in the other steps.
  S200: [
    '(m>0) ATIONAL -> ATE',   # relational     -> relate
    '(m>0) TIONAL  -> TION',  # conditional    -> condition, rational -> rational
    '(m>0) ENCI    -> ENCE',  # valenci        -> valence
    '(m>0) ANCI    -> ANCE',  # hesitanci      -> hesitance
    '(m>0) IZER    -> IZE',   # digitizer      -> digitize
    '(m>0) ABLI    -> ABLE',  # conformabli    -> conformable
    '(m>0) ALLI    -> AL',    # radicalli      -> radical
    '(m>0) ENTLI   -> ENT',   # differentli    -> different
    '(m>0) ELI     -> E',     # vileli         -> vile
    '(m>0) OUSLI   -> OUS',   # analogousli    -> analogous
    '(m>0) IZATION -> IZE',   # vietnamization -> vietnamize
    '(m>0) ATION   -> ATE',   # predication    -> predicate
    '(m>0) ATOR    -> ATE',   # operator       -> operate
    '(m>0) ALISM   -> AL',    # feudalism      -> feudal
    '(m>0) IVENESS -> IVE',   # decisiveness   -> decisive
    '(m>0) FULNESS -> FUL',   # hopefulness    -> hopeful
    '(m>0) OUSNESS -> OUS',   # callousness    -> callous
    '(m>0) ALITI   -> AL',    # formaliti      -> formal
    '(m>0) IVITI   -> IVE',   # sensitiviti    -> sensitive
    '(m>0) BILITI  -> BLE'    # sensibiliti    -> sensible
  ],

  # Step 3
  S300: [
    '(m>0) ICATE -> IC',  # triplicate  -> triplic
    '(m>0) ATIVE -> ',    # formative   -> form
    '(m>0) ALIZE -> AL',  # formalize   -> formal
    '(m>0) ICITI -> IC',  # electriciti -> electric
    '(m>0) ICAL  -> IC',  # electrical  -> electric
    '(m>0) FUL   -> ',    # hopeful     -> hope
    '(m>0) NESS  -> '     # goodness    -> good
  ],

  # Step 4
  S400: [
    '(m>1) AL    -> ',               # revival     -> reviv
    '(m>1) ANCE  -> ',               # allowance   -> allow
    '(m>1) ENCE  -> ',               # inference   -> infer
    '(m>1) ER    -> ',               # airliner    -> airlin
    '(m>1) IC    -> ',               # gyroscopic  -> gyroscop
    '(m>1) ABLE  -> ',               # adjustable  -> adjust
    '(m>1) IBLE  -> ',               # defensible  -> defens
    '(m>1) ANT   -> ',               # irritant    -> irrit
    '(m>1) EMENT -> ',               # replacement -> replac
    '(m>1) MENT  -> ',               # adjustment  -> adjust
    '(m>1) ENT   -> ',               # dependent   -> depend
    '(m>1 and (*S or *T)) ION -> ',  # adoption    -> adopt
    '(m>1) OU    -> ',               # homologou   -> homolog
    '(m>1) ISM   -> ',               # communism   -> commun
    '(m>1) ATE   -> ',               # activate    -> activ
    '(m>1) ITI   -> ',               # angulariti  -> angular
    '(m>1) OUS   -> ',               # homologous  -> homolog
    '(m>1) IVE   -> ',               # effective   -> effect
    '(m>1) IZE   -> '                # bowdlerize  -> bowdler
  ],

  # With the suffixes removed, all that remains is a little tidying up.

  # Step 5a
  S500: [
    '(m>1) E -> ',            # probate -> probat, rate -> rate
    '(m=1 and not *o) E -> '  # cease   -> ceas
  ],

  # Step 5b
  S510: [
    '(m > 1 and *d and *L) -> -1'  # controll -> control, roll -> roll
  ]
}

# Shared sub-pattern for a jump: optional leading whitespace followed by
# "goto(STEP)"; the step label is captured.
goto_re = /\s*goto\((\S+)\)/

# A rule that consists of nothing but a jump. Capture 1 is the step label.
GOTO_RE = /^#{goto_re}$/

# A rewrite rule. Captures, in order: the optional parenthesised condition,
# the (possibly empty) suffix, the (possibly empty) replacement, and the
# step label of an optional trailing jump.
RULE_RE = /^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$/

Instance Method Summary collapse

#stem(word, found = false) ⇒ Object

Instance Method Details

#stem(word, found = false) ⇒ `Object`

# File 'lib/lingo/attendee/stemmer/porter.rb', line 283

# Stems +word+ by running it through the rule table in RULES.
#
# The steps (keys of RULES) are visited in definition order. Within a step
# the rules are tried top to bottom and the first applicable one rewrites
# the word and ends the step. A rule applies when the word ends in the
# rule's suffix (with at least one character in front of it) and the rule's
# condition, if any, holds for the remaining stem.
#
# Once a rule's suffix has matched but its condition failed, only rules with
# that same suffix are still tried in the current step. Without this, a
# shorter suffix listed further down would be stripped instead (e.g. "feed"
# failing <tt>(m>0) EED</tt> and then losing "ed").
#
# A <tt>goto(STEP)</tt> takes effect when the rule carrying it is applied,
# or when a bare goto rule is reached; every step before STEP is skipped.
#
# word::  The word to stem. Suffixes are matched in lower case. The string
#         itself is not modified.
# found:: Initial value of the "some rule applied" flag; pass +true+ to get
#         +word+ back unchanged when no rule applies.
#
# Returns the rewritten word, or +nil+ if no rule applied and +found+ was
# not set. Raises ArgumentError for a rule that matches neither RULE_RE nor
# GOTO_RE.
def stem(word, found = false)
  goto = nil

  # Applies every pattern => replacement pair of +map+ to +str+ in place.
  conv = lambda { |str, map| map.each { |pat, rep| str.gsub!(pat, rep.to_s) } }

  RULES.each { |key, rules|
    # A pending jump skips every step up to its target.
    next if goto && goto != key.to_s
    goto = nil

    # Suffix of the rule whose suffix matched but whose condition failed.
    blocked = nil

    rules.each { |rule|
      case rule
        when RULE_RE
          cond, suffix, repl, jump = $1, $2, $3, $4
        when GOTO_RE
          goto = $1
          break
        else
          raise ArgumentError, "invalid rule: #{rule.inspect}"
      end

      next if blocked && suffix != blocked

      base = word[/(.+)#{Unicode.downcase(suffix)}$/, 1]
      next unless base

      # Consonant/vowel shadow of the stem: 'v' for a, e, i, o, u and for a
      # y that follows a consonant, 'c' for everything else.
      conv[shad = base.dup,
        /[^aeiouy]/ => 'c',
        /[aeiou]/   => 'v',
        /cy/        => 'cv',
        /y/         => 'c'
      ]

      if cond
        last = base[-1]

        # Translate the rule condition into a Ruby boolean expression.
        conv[cond,
          'm'   => shad.scan(/vc/).size,
          '*v*' => shad.include?('v'),
          '*d'  => shad.end_with?('c') && last == base[-2],
          '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
          'and' => '&&',
          'or'  => '||',
          'not' => '!',
          '='   => '=='
        ]

        # Any remaining "*X" tests whether the stem ends in the letter X.
        last = last.upcase
        cond.gsub!(/\*(\w)/) { last == $1 }

        # The evaluated text is built solely from the rule string and the
        # values substituted above, never from +word+ itself.
        unless eval(cond)
          blocked = suffix
          next
        end
      end

      found, goto = true, jump

      # A numeric replacement is an end index into the stem (-1 drops the
      # last letter); anything else is appended in lower case.
      word = begin
        base[0...Integer(repl)]
      rescue ArgumentError
        base << Unicode.downcase(repl)
      end

      break
    }
  }

  word if found
end

Module: Lingo::Attendee::Stemmer::Porter

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#stem(word, found = false) ⇒ Object

#stem(word, found = false) ⇒ `Object`