Module: TurkishStemmer

Extended by:: TurkishStemmer

Included in:: TurkishStemmer

Defined in:: lib/turkish_stemmer.rb,
lib/turkish_stemmer/version.rb

Overview

Please note that we use only lowercase letters for all methods. One should normalize input streams before using the `stem` method.

Constant Summary collapse

VOWELS =

"üiıueöao"

CONSONANTS =

"bcçdfgğhjklmnprsştvyz"

ROUNDED_VOWELS =

"oöuü"

UNROUNDED_VOWELS =

"iıea"

FOLLOWING_ROUNDED_VOWELS =

"aeuü"

FRONT_VOWELS =

"eiöü"

BACK_VOWELS =

"ıuao"

AVG_STEMMED_SIZE = Heuristic size for average Turkish stemmed word size

ALPHABET = Regular expression that checks if the word contains only turkish characters

Regexp.new("^[abcçdefgğhıijklmnoöprsştuüvyz]+$").freeze

NOMINAL_VERB_STATES =

load_states_or_suffixes("config/nominal_verb_states.yml")

NOMINAL_VERB_SUFFIXES =

load_states_or_suffixes("config/nominal_verb_suffixes.yml")

NOUN_STATES =

load_states_or_suffixes("config/noun_states.yml")

NOUN_SUFFIXES =

load_states_or_suffixes("config/noun_suffixes.yml")

DERIVATIONAL_STATES =

load_states_or_suffixes("config/derivational_states.yml")

DERIVATIONAL_SUFFIXES =

load_states_or_suffixes("config/derivational_suffixes.yml")

PROTECTED_WORDS = Load settings Protected words

load_settings("protected_words")

LAST_CONSONANT_EXCEPTIONS = Last consonant exceptions

load_settings("last_consonant_exceptions")

VOWEL_HARMONY_EXCEPTIONS = Vowel harmony exceptions

load_settings("vowel_harmony_exceptions")

SELECTION_LIST_EXCEPTIONS = Selection list exceptions

load_settings("selection_list_exceptions")

VERSION =

"0.1.12"

Instance Method Summary collapse

#affix_morphological_stripper(word, options = {}) ⇒ Array

A simple algorithm to strip suffixes from a word based on states and transitions.
#count_syllables(word) ⇒ Fixnum

Counts syllables of a Turkish word.
#derivational_suffix_machine ⇒ Object

Helper method.
#generate_pendings(key, word, states, suffixes, options = {}) ⇒ Array

Given a state key and a word, scans through the given states and generates valid pending transitions.
#has_frontness?(vowel, candidate) ⇒ Boolean

Checks frontness vowel harmony of two vowels according to Turkish vowel harmony.
#has_roundness?(vowel, candidate) ⇒ Boolean

Checks roundness vowel harmony of two vowels according to Turkish vowel harmony.
#has_vowel_harmony?(word) ⇒ Boolean

Checks vowel harmony of a word according to Turkish vowel harmony.
#last_consonant!(word) ⇒ String

Transforms a word, taking the last consonant rule into account.
#load_settings(key) ⇒ Object

Helper method for loading settings.
#load_states_or_suffixes(file) ⇒ Hash

Loads yaml file and symbolizes keys.
#mark_stem(word, suffix) ⇒ Hash

Given a suffix it stems a word according to Turkish orthographic rules.
#nominal_verbs_suffix_machine ⇒ Object

Helper method.
#noun_suffix_machine ⇒ Object

Helper method.
#proceed_to_stem?(word) ⇒ Boolean

Checks whether a word can be stemmed or not.
#stem(original_word) ⇒ String

Stems a Turkish word.
#stem_post_process(stems, original_word) ⇒ String

Post stemming process.
#valid_optional_letter?(word, letter) ⇒ Array

Given a word and a letter it checks if the optional letter can be part of the stem or not.
#vowel_harmony?(vowel, candidate) ⇒ Boolean

Checks vowel harmony between two vowels.
#vowels(word) ⇒ Array

Gets the vowels of a word.

Instance Method Details

#affix_morphological_stripper(word, options = {}) ⇒ `Array`

A simple algorithm to strip suffixes from a word based on states and transitions.

Parameters:

word (String) —

the word to strip affixes from
options (Hash) (defaults to: {}) —

options for the algorithm

Options Hash (options):

:states (Hash) —

The states and valid transitions
:suffixes (Hash) —

The suffixes with their rules

Returns:

(Array) —

all possible stem versions

# File 'lib/turkish_stemmer.rb', line 377

# Strips suffixes from a word by walking a state machine described by
# +options[:states]+ and +options[:suffixes]+, collecting every stem produced
# by a transition that lands on a final state.
#
# @param word [String] the word to strip affixes from
# @param options [Hash] options for the algorithm
# @option options [Hash] :states the states and their valid transitions
# @option options [Hash] :suffixes the suffixes with their rules
# @return [Array] the unique stems found, or a one-element array when there
#   is nothing to walk or no stem was collected
def affix_morphological_stripper(word, options = {})
  states   = options[:states]   || {}
  suffixes = options[:suffixes] || {}

  # Nothing to walk: hand the word back untouched.
  return [word] if states.nil?   || states.empty?
  return [word] if suffixes.nil? || suffixes.empty?

  stems    = []
  # Init first state pending transitions
  pendings = generate_pendings(:a, word, states, suffixes)

  # Pendings are consumed from the front; newly generated ones are also put at
  # the front (unshift), so follow-up transitions are tried before older ones.
  while !pendings.empty? do
    transition = pendings.shift
    # NOTE: this reassigns the +word+ parameter to the word carried by the
    # transition being processed.
    word       = transition[:word]
    suffix     = suffixes[transition[:suffix]]
    to_state   = states[transition[:to_state]]
    answer     = mark_stem(word, suffix)

    if answer[:stem] == true
      if ENV['DEBUG']
        puts "Word: #{word} \nAnswer: #{answer} \nInfo: #{transition} \nSuffix: #{suffix}"
      end

      if to_state["final_state"] == true
        # We have a valid transition here. It is safe to remove any pendings
        # with the same signature current pending
        # (remove_pendings_like! / remove_mark_pendings! are defined outside
        # this view — presumably they prune the queue in place; verify.)
        remove_pendings_like!(transition, pendings)
        remove_mark_pendings!(pendings)

        stems.push answer[:word]

        # Keep stripping from the target state if it has outgoing transitions.
        unless to_state["transitions"].empty?
          pendings.unshift(*generate_pendings(transition[:to_state], answer[:word], states, suffixes))
        end

      else
        # Non-final target state: nothing is recorded as a stem yet; the
        # follow-up pendings are generated with mark: true.
        # (mark_pendings! is defined outside this view — TODO confirm what it
        # flags on the remaining queue.)
        mark_pendings!(transition, pendings)
        pendings.unshift(*generate_pendings(transition[:to_state], answer[:word],
          states, suffixes, mark: true))
      end
    end
  end

  # NOTE(review): +word+ was reassigned inside the loop, so this fallback
  # returns the word of the last processed transition, which may not be the
  # word originally passed in — confirm this is intended.
  return [word] if pendings.empty? && stems.empty?

  stems.uniq
end

#count_syllables(word) ⇒ `Fixnum`

Counts syllables of a Turkish word. In Turkish the number of syllables is equal to the number of vowels.

Parameters:

word (String) —

the word to count its syllables

Returns:

(Fixnum) —

the number of syllables



109
110
111

# File 'lib/turkish_stemmer.rb', line 109

# Counts the syllables of a Turkish word.
#
# The count is the number of elements +vowels+ returns for the word.
#
# @param word [String] the word to count its syllables
# @return [Integer] the number of syllables
def count_syllables(word)
  vowels(word).length
end

#derivational_suffix_machine ⇒ `Object`

Helper method

# File 'lib/turkish_stemmer.rb', line 364

# Shortcut that runs the affix stripper with the derivational state machine.
#
# The word to process is supplied by the block. The states and suffixes are
# read from constants looked up on +self+, so the receiver must be a module
# or class that defines them.
#
# @yieldreturn [String] the word to strip
# @return [Object] whatever +affix_morphological_stripper+ returns
def derivational_suffix_machine
  candidate = yield
  machine   = {
    states: self::DERIVATIONAL_STATES,
    suffixes: self::DERIVATIONAL_SUFFIXES
  }

  affix_morphological_stripper(candidate, machine)
end

#generate_pendings(key, word, states, suffixes, options = {}) ⇒ `Array`

Given a state key and a word, scans through the given states and generates valid pending transitions.

Parameters:

key (String) —

the key for states hash
word (String) —

the word to check
states (Hash) —

the states hash
suffixes (Hash) —

the suffixes hash
options (Hash) (defaults to: {}) —

options for pendings

Options Hash (options):

:mark (Boolean) —

Whether this pending is marked for deletion

Returns:

(Array) —

array of pendings

Raises:

(ArgumentError)

# File 'lib/turkish_stemmer.rb', line 247

# Builds the pending transitions leaving the state stored under +key+.
#
# A transition is kept when the regex of its suffix matches the end of +word+.
# Each kept transition becomes a hash with the keys :suffix, :to_state,
# :from_state, :word and :mark.
#
# @param key [Object] the key of the current state inside +states+
# @param word [String] the word to check
# @param states [Hash] the states hash
# @param suffixes [Hash] the suffixes hash
# @param options [Hash] options for pendings
# @option options [Boolean] :mark whether the pendings are marked for
#   deletion (defaults to false)
# @return [Array<Hash>] the pending transitions, in the state's own order
# @raise [ArgumentError] when +states+ has no entry for +key+
def generate_pendings(key, word, states, suffixes, options = {})
  state = states[key]
  raise ArgumentError, "State #{key} does not exist" if state.nil?

  marked = options[:mark] || false

  state["transitions"].each_with_object([]) do |candidate, pendings|
    suffix_key = candidate["suffix"]
    next unless word.match(/(#{suffixes[suffix_key]["regex"]})$/)

    pendings << {
      suffix: suffix_key,
      to_state: candidate["state"],
      from_state: key,
      word: word,
      mark: marked
    }
  end
end

#has_frontness?(vowel, candidate) ⇒ `Boolean`

Checks frontness vowel harmony of two vowels according to Turkish vowel harmony.

Parameters:

vowel (String) —

the first vowel
candidate (String) —

the second vowel

Returns:

(Boolean)

See Also:

https://en.wikipedia.org/wiki/Vowel_harmony#Turkish

# File 'lib/turkish_stemmer.rb', line 170

# Checks frontness harmony between two vowels: both must be found in
# FRONT_VOWELS or both in BACK_VOWELS.
#
# A nil or empty argument is treated as harmonic.
#
# @param vowel [String, nil] the first vowel
# @param candidate [String, nil] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_frontness?(vowel, candidate)
  return true if [vowel, candidate].any? { |letter| letter.nil? || letter.empty? }

  [FRONT_VOWELS, BACK_VOWELS].any? do |group|
    group.include?(vowel) && group.include?(candidate)
  end
end

#has_roundness?(vowel, candidate) ⇒ `Boolean`

Checks roundness vowel harmony of two vowels according to Turkish vowel harmony.

Parameters:

vowel (String) —

the first vowel
candidate (String) —

the second vowel

Returns:

(Boolean)

See Also:

https://en.wikipedia.org/wiki/Vowel_harmony#Turkish

# File 'lib/turkish_stemmer.rb', line 151

# Checks roundness harmony between two vowels.
#
# The pair is harmonic when both are found in UNROUNDED_VOWELS, or when the
# first is in ROUNDED_VOWELS and the second is in FOLLOWING_ROUNDED_VOWELS.
# A nil or empty argument is treated as harmonic.
#
# @param vowel [String, nil] the first vowel
# @param candidate [String, nil] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_roundness?(vowel, candidate)
  return true if vowel.nil? || vowel.empty? || candidate.nil? || candidate.empty?

  harmonic_pairs = [
    [UNROUNDED_VOWELS, UNROUNDED_VOWELS],
    [ROUNDED_VOWELS, FOLLOWING_ROUNDED_VOWELS]
  ]

  harmonic_pairs.any? do |leading, following|
    leading.include?(vowel) && following.include?(candidate)
  end
end

#has_vowel_harmony?(word) ⇒ `Boolean`

Checks vowel harmony of a word according to Turkish vowel harmony.

Parameters:

word (String) —

the word to be checked against Turkish vowel harmony

Returns:

(Boolean)

See Also:

https://en.wikipedia.org/wiki/Vowel_harmony#Turkish

# File 'lib/turkish_stemmer.rb', line 126

# Checks the vowel harmony of a word by comparing its last two vowels.
#
# When the word has fewer than two vowels the missing positions are passed on
# as nil (a single vowel is passed as the second argument).
#
# @param word [String] the word to be checked against Turkish vowel harmony
# @return [Boolean] the result of +vowel_harmony?+ for the last two vowels
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_vowel_harmony?(word)
  penultimate, final = vowels(word).values_at(-2, -1)

  vowel_harmony?(penultimate, final)
end

#last_consonant!(word) ⇒ `String`

Transforms a word, taking the last consonant rule into account.

Parameters:

word (String) —

the word to check for last consonant change

Returns:

(String) —

the changed word

# File 'lib/turkish_stemmer.rb', line 338

# Applies the last consonant rule to a word, in place.
#
# When the word ends in b, c, d or ğ that final letter is replaced by p, ç, t
# or k respectively. Words listed in LAST_CONSONANT_EXCEPTIONS are returned
# untouched. The receiver string itself is modified and returned.
#
# @param word [String] the word to check for last consonant change
# @return [String] the same string object, possibly modified
def last_consonant!(word)
  return word if LAST_CONSONANT_EXCEPTIONS.include?(word)

  replacement = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }[word[-1]]
  word[-1] = replacement if replacement

  word
end

#load_settings(key) ⇒ `Object`

Helper method for loading settings

Parameters:

key (String) —

the key

# File 'lib/turkish_stemmer.rb', line 70

# Reads one entry from config/stemmer.yml (resolved relative to this file).
#
# @param key [String] the top-level key to read from the settings file
# @return [Object] the value stored under +key+
# @raise [RuntimeError] when the settings file cannot be loaded or read
def load_settings(key)
  settings_file = File.expand_path("../../config/stemmer.yml", __FILE__)

  YAML.load_file(settings_file)[key]
rescue => error
  raise "Please provide a valid config/stemmer.yml file, #{error}"
end

#load_states_or_suffixes(file) ⇒ `Hash`

Loads yaml file and symbolizes keys

Parameters:

file (String) —

path to yaml file

Returns:

(Hash) —

the hash with symbols as keys

# File 'lib/turkish_stemmer.rb', line 59

# Loads a YAML file (path given relative to the project root, two levels
# above this file) and calls +symbolize_keys+ on the parsed result.
#
# @param file [String] path to the yaml file
# @return [Hash] the hash with symbols as keys
# @raise [RuntimeError] when the path cannot be resolved, the file cannot be
#   loaded, or the keys cannot be symbolized
def load_states_or_suffixes(file)
  begin
    yaml_path = File.expand_path("../../#{file}", __FILE__)
    parsed    = YAML.load_file(yaml_path)

    parsed.symbolize_keys
  rescue => failure
    raise "An error occured loading #{file}, #{failure}"
  end
end

#mark_stem(word, suffix) ⇒ `Hash`

Given a suffix it stems a word according to Turkish orthographic rules

Parameters:

word (String) —

the word to stem
suffix (Hash) —

a suffix record

Returns:

(Hash) —

a stem answer record

# File 'lib/turkish_stemmer.rb', line 271

# Tries to strip one suffix from a word according to the suffix record.
#
# The word is stemmed only when the harmony/protection condition holds and the
# suffix regex matches the end of the word. If the suffix declares an
# "optional_letter", +valid_optional_letter?+ decides whether that letter is
# removed together with the suffix or whether the whole stemming is rejected.
#
# @param word [String] the word to stem
# @param suffix [Hash] a suffix record; the keys read here are "regex",
#   "check_harmony" and "optional_letter"
# @return [Hash] an answer record with :stem (whether a suffix was removed),
#   :word (the stemmed word, or the original word when rejected) and
#   :suffix_applied (the removed text, or nil when rejected)
def mark_stem(word, suffix)
  harmony_required = suffix["check_harmony"]

  # NOTE(review): && binds tighter than ||, so a suffix that does not require
  # a harmony check is accepted even when the word is in PROTECTED_WORDS.
  # Kept as is to preserve existing results — confirm whether that is intended.
  stem = !PROTECTED_WORDS.include?(word) &&
         (harmony_required &&
         (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word))) ||
         !harmony_required

  rejected = { stem: false, word: word, suffix_applied: nil }

  match = stem && word.match(/(#{suffix["regex"]})$/)
  return rejected unless match

  suffix_applied = match.to_s
  # The matched text is literal text, not a pattern: escape it so regex
  # metacharacters in the match cannot break or alter the removal.
  new_word = word.gsub(/(#{Regexp.escape(suffix_applied)})$/, '')

  optional_letter = suffix["optional_letter"]
  if optional_letter
    valid, letter = valid_optional_letter?(new_word, optional_letter)
    return rejected unless valid

    # The optional letter is present and valid: it belongs to the suffix.
    if letter
      new_word       = new_word.chop
      suffix_applied = letter + suffix_applied
    end
  end

  { stem: stem, word: new_word, suffix_applied: suffix_applied }
end

#nominal_verbs_suffix_machine ⇒ `Object`

Helper method. This is just a shortcut.

# File 'lib/turkish_stemmer.rb', line 352

# Shortcut that runs the affix stripper with the nominal verb state machine.
#
# The word to process is supplied by the block. The states and suffixes are
# read from constants looked up on +self+, so the receiver must be a module
# or class that defines them.
#
# @yieldreturn [String] the word to strip
# @return [Object] whatever +affix_morphological_stripper+ returns
def nominal_verbs_suffix_machine
  candidate = yield
  machine   = {
    states: self::NOMINAL_VERB_STATES,
    suffixes: self::NOMINAL_VERB_SUFFIXES
  }

  affix_morphological_stripper(candidate, machine)
end

#noun_suffix_machine ⇒ `Object`

Helper method. This is just a shortcut.

# File 'lib/turkish_stemmer.rb', line 358

# Shortcut that runs the affix stripper with the noun state machine.
#
# The word to process is supplied by the block. The states and suffixes are
# read from constants looked up on +self+, so the receiver must be a module
# or class that defines them.
#
# @yieldreturn [String] the word to strip
# @return [Object] whatever +affix_morphological_stripper+ returns
def noun_suffix_machine
  candidate = yield
  machine   = {
    states: self::NOUN_STATES,
    suffixes: self::NOUN_SUFFIXES
  }

  affix_morphological_stripper(candidate, machine)
end

#proceed_to_stem?(word) ⇒ `Boolean`

Checks whether a word can be stemmed or not. This method checks the candidate word against nil, the Turkish alphabet, the protected words list and its syllable count.

Parameters:

word (String) —

the candidate word for stemming

Returns:

(Boolean) —

whether should proceed to stem or not

# File 'lib/turkish_stemmer.rb', line 187

# Decides whether a word is worth passing to the stemming machines.
#
# A word is rejected when it is nil, when +turkish?+ rejects it, when it is
# listed in PROTECTED_WORDS, or when it has at most one syllable.
#
# @param word [String, nil] the candidate word for stemming
# @return [Boolean] whether stemming should proceed
def proceed_to_stem?(word)
  return false if word.nil?
  return false unless turkish?(word)
  return false if PROTECTED_WORDS.include?(word)

  count_syllables(word) > 1
end

#stem(original_word) ⇒ `String`

Stems a Turkish word.

Algorithm consists of 3 parts: pre-process, process and post-process. The pre-process phase is a quick lookup for words that should not be stemmed based on length, protected words list and vowel harmony. The process phase includes a nominal verb suffix and a noun suffix stripper machine. The last phase includes some additional checks and a simple stem selection decision.

Parameters:

original_word (String) —

the word to stem

Returns:

(String) —

the stemmed word

# File 'lib/turkish_stemmer.rb', line 35

# Stems a Turkish word.
#
# The algorithm has three phases:
# 1. Pre-process: +proceed_to_stem?+ decides whether the word is stemmed at
#    all; if not, the word is returned as given.
# 2. Process: the nominal verb machine runs on the word, the noun machine runs
#    on every candidate found so far, and the derivational machine runs on
#    every candidate found by the first two. The original word is kept as a
#    candidate between machines.
# 3. Post-process: +stem_post_process+ picks the final stem.
#
# @param original_word [String] the word to stem
# @return [String] the stemmed word, or +original_word+ when it is not stemmed
def stem(original_word)
  # Preprocess
  return original_word unless proceed_to_stem?(original_word)

  word = original_word.dup

  # Process
  # Non-destructive flatten/uniq are used on purpose: flatten! and uniq!
  # return nil when nothing changed, which makes chaining them fragile.
  stems = [nominal_verbs_suffix_machine { word }, original_word].flatten.uniq

  noun_stems = stems.map { |candidate| noun_suffix_machine { candidate } }
  stems      = [stems, noun_stems, original_word].flatten.uniq

  # The derivational results are appended as one nested array; the
  # post-process step receives them that way and flattens them itself.
  stems << stems.map { |candidate| derivational_suffix_machine { candidate } }

  # Postprocess
  stem_post_process(stems, original_word)
end

#stem_post_process(stems, original_word) ⇒ `String`

Post stemming process

Parameters:

stems (Array) —

array of candidate stems
original_word (String) —

the original word

Returns:

(String) —

the stemmed or the original word

# File 'lib/turkish_stemmer.rb', line 203

# Picks the final stem out of the candidates produced by the machines.
#
# Steps: flatten and de-duplicate the candidates, drop the original word and
# any candidate without syllables, apply +last_consonant!+ to each survivor,
# order them by how close their size is to AVG_STEMMED_SIZE (shorter first on
# ties), then prefer a candidate listed in SELECTION_LIST_EXCEPTIONS.
#
# @param stems [Array] array of candidate stems (may be nested)
# @param original_word [String] the original word
# @return [String] the selected stem, or +original_word+ when no candidate
#   survives
def stem_post_process(stems, original_word)
  puts "post process for #{original_word}: #{stems}" if ENV['DEBUG']

  candidates = stems.flatten.uniq

  # Drop the original word first; syllables are only counted for the rest.
  candidates.reject! do |candidate|
    candidate == original_word || count_syllables(candidate) == 0
  end

  # Transform last consonant
  candidates.map! { |candidate| last_consonant!(candidate) }

  # Closest to the average stem size first; equal distance => shorter first.
  candidates.sort! do |left, right|
    left_rank  = [(left.size - AVG_STEMMED_SIZE).abs, left.size]
    right_rank = [(right.size - AVG_STEMMED_SIZE).abs, right.size]

    left_rank <=> right_rank
  end

  # Selection list exceptions win over the size ordering.
  preferred = candidates & SELECTION_LIST_EXCEPTIONS
  return preferred.first unless preferred.empty?

  # Keep first or original word
  candidates.fetch(0, original_word)
end

#valid_optional_letter?(word, letter) ⇒ `Array`

Given a word and a letter it checks if the optional letter can be part of the stem or not.

Examples:

self.valid_optional_letter?("test", "t")
# => [false, 't']

Parameters:

word (String) —

the examined word
letter (String) —

a single letter or a string armed with a regular expression

Returns:

(Array) —

the answer is returned as an array. First element is a Boolean value and second element is the matched character.

# File 'lib/turkish_stemmer.rb', line 315

# Checks whether an optional letter found at the end of a word may be treated
# as part of the suffix.
#
# When the word does not end with the letter the answer is true and no
# character is reported. Otherwise the character before it decides: a matched
# vowel must be preceded by a consonant and a matched consonant by a vowel.
# If there is no preceding character the answer is nil.
#
# @example
#   valid_optional_letter?("arabay", "y") # => [true, "y"]
#
# @param word [String] the examined word
# @param letter [String] a single letter or a regular expression source
# @return [Array] the answer (true, false or nil) followed by the matched
#   character (nil when nothing matched)
def valid_optional_letter?(word, letter)
  found = word.match(/(#{letter})$/)
  return [true, nil] unless found

  tail     = found.to_s
  previous = word[-2]

  # The letter before the optional one must belong to the opposite class.
  required_class = VOWELS.include?(tail) ? CONSONANTS : VOWELS

  [previous && required_class.include?(previous), tail]
end

#vowel_harmony?(vowel, candidate) ⇒ `Boolean`

Checks vowel harmony between two vowels

Parameters:

vowel (String) —

the first vowel
candidate (String) —

the second vowel

Returns:

(Boolean)

See Also:

https://en.wikipedia.org/wiki/Vowel_harmony#Turkish



140
141
142

# File 'lib/turkish_stemmer.rb', line 140

# Checks vowel harmony between two vowels: roundness harmony is checked
# first and frontness harmony is checked only when roundness holds.
#
# @param vowel [String, nil] the first vowel
# @param candidate [String, nil] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def vowel_harmony?(vowel, candidate)
  roundness = has_roundness?(vowel, candidate)
  return roundness unless roundness

  has_frontness?(vowel, candidate)
end

#vowels(word) ⇒ `Array`

Gets the vowels of a word

Parameters:

word (String) —

the word to get its vowels

Returns:

(Array) —

array of vowels



117
118
119

# File 'lib/turkish_stemmer.rb', line 117

# Gets the vowels of a word, in order of appearance.
#
# Only characters listed in VOWELS are returned. Characters that are neither
# Turkish vowels nor Turkish consonants (digits, punctuation, letters such as
# q, w or x) are ignored rather than reported as vowels, so they no longer
# inflate syllable counts.
#
# @param word [String] the word to get its vowels
# @return [Array<String>] array of vowels
def vowels(word)
  word.each_char.select { |char| VOWELS.include?(char) }
end

Module: TurkishStemmer

Overview

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#affix_morphological_stripper(word, options = {}) ⇒ Array

#count_syllables(word) ⇒ Fixnum

#derivational_suffix_machine ⇒ Object

#generate_pendings(key, word, states, suffixes, options = {}) ⇒ Array

#has_frontness?(vowel, candidate) ⇒ Boolean

#has_roundness?(vowel, candidate) ⇒ Boolean

#has_vowel_harmony?(word) ⇒ Boolean

#last_consonant!(word) ⇒ String

#load_settings(key) ⇒ Object

#load_states_or_suffixes(file) ⇒ Hash

#mark_stem(word, suffix) ⇒ Hash

#nominal_verbs_suffix_machine ⇒ Object

#noun_suffix_machine ⇒ Object

#proceed_to_stem?(word) ⇒ Boolean

#stem(original_word) ⇒ String

#stem_post_process(stems, original_word) ⇒ String

#valid_optional_letter?(word, letter) ⇒ Array

#vowel_harmony?(vowel, candidate) ⇒ Boolean

#vowels(word) ⇒ Array

#affix_morphological_stripper(word, options = {}) ⇒ `Array`

#count_syllables(word) ⇒ `Fixnum`

#derivational_suffix_machine ⇒ `Object`

#generate_pendings(key, word, states, suffixes, options = {}) ⇒ `Array`

#has_frontness?(vowel, candidate) ⇒ `Boolean`

#has_roundness?(vowel, candidate) ⇒ `Boolean`

#has_vowel_harmony?(word) ⇒ `Boolean`

#last_consonant!(word) ⇒ `String`

#load_settings(key) ⇒ `Object`

#load_states_or_suffixes(file) ⇒ `Hash`

#mark_stem(word, suffix) ⇒ `Hash`

#nominal_verbs_suffix_machine ⇒ `Object`

#noun_suffix_machine ⇒ `Object`

#proceed_to_stem?(word) ⇒ `Boolean`

#stem(original_word) ⇒ `String`

#stem_post_process(stems, original_word) ⇒ `String`

#valid_optional_letter?(word, letter) ⇒ `Array`

#vowel_harmony?(vowel, candidate) ⇒ `Boolean`

#vowels(word) ⇒ `Array`