Class: StuffClassifier::Tokenizer

Inherits:

Object

Object
StuffClassifier::Tokenizer

show all

Includes: RMMSeg

Defined in: lib/stuff-classifier/tokenizer.rb

Constant Summary collapse

# Per-language tokenizer settings, keyed by language code ("en", "fr").
#
# Each language entry is a Hash that may contain:
#   :preprocessing_regexps - Hash mapping a Regexp to its replacement String
#                            (only the "en" entry defines one).
#   :stop_word             - Set of lowercase words to be ignored.
#
# Stop words must not carry surrounding whitespace, otherwise they can never
# equal a segmented token.
TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
      '的','个','得',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      'fifty', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },
  "fr" => {
    :stop_word => Set.new([
      'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux',
      'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon',
      'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa',
      'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre',
      'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées',
      'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
      'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient',
      'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes',
      'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût',
      'fussions', 'fussiez', 'fussent', 'ayant', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
      'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais',
      'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient',
      'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse',
      'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'ceci', 'celà', 'cet', 'cette', 'ici',
      'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles', 'sans', 'soi'
    ])
  }
}

Instance Method Summary collapse

#each_word(string) ⇒ Object
#ignore_words ⇒ Object
#ignore_words=(value) ⇒ Object
#initialize(opts = {}) ⇒ Tokenizer constructor

A new instance of Tokenizer.
#language ⇒ Object
#preprocessing_regexps ⇒ Object
#preprocessing_regexps=(value) ⇒ Object
#stemming? ⇒ Boolean

Constructor Details

#initialize(opts = {}) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/stuff-classifier/tokenizer.rb', line 13

# Builds a tokenizer.
#
# @param opts [Hash] optional settings
# @option opts [String] :language ("en") key looked up in TOKENIZER_PROPERTIES
#   and passed to Lingua::Stemmer as its :language
# @option opts [Boolean] :stemming (true) whether a stemmer is created
def initialize(opts={})
  @language = opts.fetch(:language, "en")

  # A language without an entry in TOKENIZER_PROPERTIES used to leave
  # @properties nil, so ignore_words / preprocessing_regexps raised
  # NoMethodError later on. Fall back to "no stop words, no preprocessing".
  @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language] ||
                { :stop_word => Set.new }

  @stemming = opts.fetch(:stemming, true)
  @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
end

Instance Method Details

#each_word(string) ⇒ `Object`

# File 'lib/stuff-classifier/tokenizer.rb', line 47

# Tokenizes +string+ into normalized words.
#
# The input is stripped and split on newlines. Each line is run through the
# preprocessing regexps (when any are configured) and then handed to
# +segment+. Every resulting word is skipped when it is empty or when its
# lowercase form is in +ignore_words+; otherwise it is stemmed (when stemming
# is enabled and +stemable?+ accepts it) and lowercased. A stemmed word that
# turns out to be a stop word is skipped as well.
#
# NOTE(review): +segment+ and +stemable?+ are not defined in the visible part
# of this file; +segment+ presumably comes from the included RMMSeg module.
#
# @param string [String, nil] text to tokenize
# @yield [word] optional; when a block is given its return value is collected
#   in place of the word
# @return [Array] collected words (or block results); an empty Array for nil
#   or blank input, so callers can always chain Array methods on the result
def each_word(string)
  string = string.to_s.strip
  return [] if string.empty?

  # Neither lookup depends on the line or word being processed.
  regexps = preprocessing_regexps
  stop_words = ignore_words
  words = []

  string.split("\n").each do |line|
    if regexps
      regexps.each { |regexp, replace_by| line = line.gsub(regexp, replace_by) }
    end

    segment(line).each do |w|
      next if w == '' || stop_words.member?(w.downcase)

      if stemming? && stemable?(w)
        w = @stemmer.stem(w).downcase
        # The stem itself may be a stop word even if the full word was not.
        next if stop_words.member?(w)
      else
        w = w.downcase
      end

      words << (block_given? ? (yield w) : w)
    end
  end

  words
end

#ignore_words ⇒ `Object`



39
40
41

# File 'lib/stuff-classifier/tokenizer.rb', line 39

# Words to skip while tokenizing.
#
# @return [Object] the collection assigned through #ignore_words= when one is
#   set (and truthy), otherwise the :stop_word entry of the language properties
def ignore_words
  return @ignore_words if @ignore_words

  @properties[:stop_word]
end

#ignore_words=(value) ⇒ `Object`



35
36
37

# File 'lib/stuff-classifier/tokenizer.rb', line 35

# Overrides the stop words used by this tokenizer.
#
# @param value [Object] replacement collection; a nil or false value makes
#   #ignore_words fall back to the language's :stop_word entry
def ignore_words=(value)
  instance_variable_set(:@ignore_words, value)
end

#language ⇒ `Object`



23
24
25

# File 'lib/stuff-classifier/tokenizer.rb', line 23

# Language code this tokenizer was configured with.
#
# @return [Object] the value stored in @language
def language
  instance_variable_get(:@language)
end

#preprocessing_regexps ⇒ `Object`



31
32
33

# File 'lib/stuff-classifier/tokenizer.rb', line 31

# Substitutions applied to each line before it is segmented.
#
# @return [Object, nil] the value assigned through #preprocessing_regexps=
#   when one is set (and truthy), otherwise the :preprocessing_regexps entry of
#   the language properties (nil when the language defines none)
def preprocessing_regexps
  if @preprocessing_regexps
    @preprocessing_regexps
  else
    @properties[:preprocessing_regexps]
  end
end

#preprocessing_regexps=(value) ⇒ `Object`



27
28
29

# File 'lib/stuff-classifier/tokenizer.rb', line 27

# Overrides the preprocessing substitutions used by this tokenizer.
#
# @param value [Object] replacement value; a nil or false value makes
#   #preprocessing_regexps fall back to the language properties
def preprocessing_regexps=(value)
  instance_variable_set(:@preprocessing_regexps, value)
end

#stemming? ⇒ `Boolean`

Returns:

(Boolean)



43
44
45

# File 'lib/stuff-classifier/tokenizer.rb', line 43

# Whether stemming is enabled.
#
# @return [Object] the value of @stemming when it is truthy, false otherwise
def stemming?
  @stemming ? @stemming : false
end

Class: StuffClassifier::Tokenizer

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Instance Method Details

#each_word(string) ⇒ Object

#ignore_words ⇒ Object

#ignore_words=(value) ⇒ Object

#language ⇒ Object

#preprocessing_regexps ⇒ Object

#preprocessing_regexps=(value) ⇒ Object

#stemming? ⇒ Boolean

#initialize(opts = {}) ⇒ `Tokenizer`

#each_word(string) ⇒ `Object`

#ignore_words ⇒ `Object`

#ignore_words=(value) ⇒ `Object`

#language ⇒ `Object`

#preprocessing_regexps ⇒ `Object`

#preprocessing_regexps=(value) ⇒ `Object`

#stemming? ⇒ `Boolean`