Module: Lingo::Attendee::Stemmer::Porter

Extended by:: Porter

Included in:: Porter

Defined in:: lib/lingo/attendee/stemmer/porter.rb

Constant Summary collapse

# Porter suffix-stripping rules, keyed by step label. Steps are visited in
# insertion order; each value is the ordered list of rules for that step.
#
# A rule is a string of the form
#
#   [(condition)] [SUFFIX] -> [REPLACEMENT] [goto(STEP)]
#
# or a bare 'goto(STEP)'. The strings are parsed with RULE_RE / GOTO_RE.
# A numeric replacement such as -1 is used as the end of a range on the
# stem (i.e. -1 drops the last letter).
#
# The table is frozen at every level (hash, rule lists, rule strings) so
# that the shared constant cannot be modified by accident at runtime.
RULES = {
  # Step 1a
  S100: [
    'SSES -> SS',  # caresses -> caress
    'IES  -> I',   # ponies   -> poni, ties -> ti
    'SS   -> SS',  # caress   -> caress
    'S    -> '     # cats     -> cat
  ],

  # Step 1b
  S110: [
    '(m>0) EED -> EE goto(S120)',  # agreed    ->  agree,   feed -> feed
    '(*v*) ED  ->    goto(S111)',  # plastered ->  plaster, bled -> bled
    '(*v*) ING ->    goto(S111)',  # motoring  ->  motor,   sing -> sing
    'goto(S120)'
  ],

  # If the second or third of the rules in Step 1b is successful,
  # the following is done:
  S111: [
    'AT -> ATE',                            # conflat(ed) -> conflate
    'BL -> BLE',                            # troubl(ed)  -> trouble
    'IZ -> IZE',                            # siz(ed)     -> size
    '(*d and not (*L or *S or *Z)) -> -1',  # hopp(ing)   -> hop
                                            # tann(ed)    -> tan
                                            # fall(ing)   -> fall
                                            # hiss(ing)   -> hiss
                                            # fizz(ed)    -> fizz
    '(m=1 and *o) -> E'                     # fail(ing)   -> fail
                                            # fil(ing)    -> file
  ],

  # The rule to map to a single letter causes the removal of one of
  # the double letter pair. The -E is put back on -AT, -BL and -IZ,
  # so that the suffixes -ATE, -BLE and -IZE can be recognised later.
  # This E may be removed in step 4.

  # Step 1c
  S120: [
    '(*v*) Y -> I'  # happy -> happi, sky -> sky
  ],

  # Step 1 deals with plurals and past participles. The subsequent
  # steps are much more straightforward.

  # Step 2
  S200: [
    '(m>0) ATIONAL -> ATE',   # relational     -> relate
    '(m>0) TIONAL  -> TION',  # conditional    -> condition, rational -> rational
    '(m>0) ENCI    -> ENCE',  # valenci        -> valence
    '(m>0) ANCI    -> ANCE',  # hesitanci      -> hesitance
    '(m>0) IZER    -> IZE',   # digitizer      -> digitize
    '(m>0) ABLI    -> ABLE',  # conformabli    -> conformable
    '(m>0) ALLI    -> AL',    # radicalli      -> radical
    '(m>0) ENTLI   -> ENT',   # differentli    -> different
    '(m>0) ELI     -> E',     # vileli         -> vile
    '(m>0) OUSLI   -> OUS',   # analogousli    -> analogous
    '(m>0) IZATION -> IZE',   # vietnamization -> vietnamize
    '(m>0) ATION   -> ATE',   # predication    -> predicate
    '(m>0) ATOR    -> ATE',   # operator       -> operate
    '(m>0) ALISM   -> AL',    # feudalism      -> feudal
    '(m>0) IVENESS -> IVE',   # decisiveness   -> decisive
    '(m>0) FULNESS -> FUL',   # hopefulness    -> hopeful
    '(m>0) OUSNESS -> OUS',   # callousness    -> callous
    '(m>0) ALITI   -> AL',    # formaliti      -> formal
    '(m>0) IVITI   -> IVE',   # sensitiviti    -> sensitive
    '(m>0) BILITI  -> BLE'    # sensibiliti    -> sensible
  ],

  # The test for the string S1 can be made fast by doing a program
  # switch on the penultimate letter of the word being tested. This
  # gives a fairly even breakdown of the possible values of the
  # string S1. It will be seen in fact that the S1-strings in step 2
  # are presented here in the alphabetical order of their penultimate
  # letter. Similar techniques may be applied in the other steps.

  # Step 3
  S300: [
    '(m>0) ICATE -> IC',  # triplicate  -> triplic
    '(m>0) ATIVE -> ',    # formative   -> form
    '(m>0) ALIZE -> AL',  # formalize   -> formal
    '(m>0) ICITI -> IC',  # electriciti -> electric
    '(m>0) ICAL  -> IC',  # electrical  -> electric
    '(m>0) FUL   -> ',    # hopeful     -> hope
    '(m>0) NESS  -> '     # goodness    -> good
  ],

  # Step 4
  S400: [
    '(m>1) AL    -> ',               # revival     -> reviv
    '(m>1) ANCE  -> ',               # allowance   -> allow
    '(m>1) ENCE  -> ',               # inference   -> infer
    '(m>1) ER    -> ',               # airliner    -> airlin
    '(m>1) IC    -> ',               # gyroscopic  -> gyroscop
    '(m>1) ABLE  -> ',               # adjustable  -> adjust
    '(m>1) IBLE  -> ',               # defensible  -> defens
    '(m>1) ANT   -> ',               # irritant    -> irrit
    '(m>1) EMENT -> ',               # replacement -> replac
    '(m>1) MENT  -> ',               # adjustment  -> adjust
    '(m>1) ENT   -> ',               # dependent   -> depend
    '(m>1 and (*S or *T)) ION -> ',  # adoption    -> adopt
    '(m>1) OU    -> ',               # homologou   -> homolog
    '(m>1) ISM   -> ',               # communism   -> commun
    '(m>1) ATE   -> ',               # activate    -> activ
    '(m>1) ITI   -> ',               # angulariti  -> angular
    '(m>1) OUS   -> ',               # homologous  -> homolog
    '(m>1) IVE   -> ',               # effective   -> effect
    '(m>1) IZE   -> '                # bowdlerize  -> bowdler
  ],

  # The suffixes are now removed. All that remains is a little
  # tidying up.

  # Step 5a
  S500: [
    '(m>1) E -> ',            # probate -> probat, rate -> rate
    '(m=1 and not *o) E -> '  # cease   -> ceas
  ],

  # Step 5b
  S510: [
    '(m > 1 and *d and *L) -> -1'  # controll -> control, roll -> roll
  ]
}.each_value { |rules| rules.each(&:freeze).freeze }.freeze

# Shared fragment: optional whitespace followed by 'goto(LABEL)'; the label
# is the fragment's only capture group.
goto_re = %r{\s*goto\((\S+)\)}

# Matches a rule that consists of nothing but a goto; $1 is the label.
GOTO_RE = %r{^#{goto_re}$}

# Matches a rewrite rule. Captures:
#   $1 - parenthesised condition (nil if absent)
#   $2 - suffix to match (may be empty)
#   $3 - replacement (may be empty)
#   $4 - goto label (nil if absent)
RULE_RE = %r{^(\(.+\))?\s*(\S*)\s*->\s*(\S*?)(?:#{goto_re})?\s*$}

Instance Method Summary collapse

#stem(word, found = false) ⇒ Object

Instance Method Details

#stem(word, found = false) ⇒ `Object`

# File 'lib/lingo/attendee/stemmer/porter.rb', line 283

# Runs +word+ through the steps of RULES in order and returns the rewritten
# word, or +nil+ if no rule fired and +found+ was not passed as true.
#
# word::  String to stem. The caller's string is not modified; every
#         rewrite works on a fresh copy.
# found:: Initial value of the "some rule fired" flag. When true, the
#         (possibly unchanged) word is returned even if no rule applies.
#
# Per step, rules are tried top to bottom and the first rule whose suffix
# matches and whose condition holds is applied; the step then ends. A goto
# carried by a rule (or a bare goto rule) makes the following steps be
# skipped until the named one is reached.
#
# Only the longest matching suffix of a step may fire: once a suffix has
# matched but its condition failed, rules of the same step with a *shorter*
# suffix are skipped. Without this, 'feed' would fail '(m>0) EED' and then
# be cut down to 'fe' by '(*v*) ED'. Rules with an equally long suffix are
# still tried (step 5a relies on that for its two 'E' rules).
def stem(word, found = false)
  # conv applies each pattern => replacement pair of a hash to a string, in place.
  goto, conv = nil, lambda { |s, h| h.each { |q, r| s.gsub!(q, r.to_s) } }

  RULES.each { |key, rules|
    # A pending goto skips every step except its target.
    next if goto && goto != key.to_s

    # Length of the longest suffix in this step that matched the word but
    # whose condition was not met.
    failed = 0

    rules.each { |rule|
      case rule
        when RULE_RE
          cond, suffix, repl, goto = $1, $2, $3, $4
          suffix = Unicode.downcase(suffix)

          # A longer suffix already matched and was rejected: shorter
          # suffixes must not take over.
          next if suffix.length < failed

          stem = word[/(.+)#{suffix}$/, 1] or next
        when GOTO_RE
          goto = $1
          break
      end

      # Consonant/vowel shadow of the stem: 'c' for consonants, 'v' for
      # vowels; 'y' counts as a vowel only when it follows a consonant.
      conv[shad = stem.dup,
        /[^aeiouy]/ => 'c',
        /[aeiou]/   => 'v',
        /cy/        => 'cv',
        /y/         => 'c'
      ]

      if cond
        # Turn the rule's condition into a Ruby expression:
        #   m   - number of 'vc' sequences in the shadow
        #   *v* - stem contains a vowel
        #   *d  - stem ends in a double consonant
        #   *o  - stem ends consonant-vowel-consonant, last one not w, x or y
        conv[cond,
          'm'   => shad.scan(/vc/).size,
          '*v*' => shad.include?('v'),
          '*d'  => shad.end_with?('c') && (last = stem[-1]) == stem[-2],
          '*o'  => shad.end_with?('cvc') && !'wxy'.include?(last),
          'and' => '&&',
          'or'  => '||',
          'not' => '!',
          '='   => '=='
        ]

        # Remaining '*X' tests compare the stem's last letter (only known
        # when the stem ends in a consonant) with the given capital letter.
        last.upcase! if last
        cond.gsub!(/\*(\w)/) { last == $1 }

        # NOTE: eval only ever sees expressions derived from the RULES
        # constant, never from the word being stemmed.
        unless eval(cond)
          failed = suffix.length if suffix.length > failed
          next
        end
      end

      # A numeric replacement (e.g. -1) is the end of a range on the stem;
      # anything else is appended to the stem in lower case.
      found, word = true, begin
        stem[0...Integer(repl)]
      rescue ArgumentError
        stem << Unicode.downcase(repl)
      end

      break
    }
  }

  word if found
end

Module: Lingo::Attendee::Stemmer::Porter

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#stem(word, found = false) ⇒ Object

#stem(word, found = false) ⇒ `Object`