Class: StuffClassifier::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/stuff-classifier/tokenizer.rb

Constant Summary collapse

# Per-language tokenizer settings, keyed by language code.
#
# Each entry may provide:
#   :preprocessing_regexps - Hash of Regexp => replacement String. The tokenizer applies
#                            these to every input line before splitting it into words
#                            (here: drop apostrophes/backticks, turn underscores into spaces).
#   :stop_word             - Set of lower-case words that the tokenizer discards.
#
# Stop words must not contain whitespace: tokens are compared after being extracted
# as runs of word characters, so an entry with a space can never match.
TOKENIZER_PROPERTIES =
{
  "en" => {
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
    'a', 'about', 'above', 'across', 'after', 'afterwards',
    'again', 'against', 'all', 'almost', 'alone', 'along',
    'already', 'also', 'although', 'always', 'am', 'among',
    'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
    'are', 'around', 'as', 'at', 'back', 'be',
    'became', 'because', 'become', 'becomes', 'becoming', 'been',
    'before', 'beforehand', 'behind', 'being', 'below', 'beside',
    'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
    'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
    'co', 'computer', 'con', 'could', 'couldnt', 'cry',
    'de', 'describe', 'detail', 'do', 'done', 'down',
    'due', 'during', 'each', 'eg', 'eight', 'either',
    'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
    'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
    'fifty', 'fill', 'find', 'fire', 'first', 'five',
    'for', 'former', 'formerly', 'forty', 'found', 'four',
    'from', 'front', 'full', 'further', 'get', 'give',
    'go', 'had', 'has', 'hasnt', 'have', 'he',
    'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
    'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
    'how', 'however', 'hundred', 'i', 'ie', 'if',
    'in', 'inc', 'indeed', 'interest', 'into', 'is',
    'it', 'its', 'itself', 'keep', 'last', 'latter',
    'latterly', 'least', 'less', 'ltd', 'made', 'many',
    'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
    'more', 'moreover', 'most', 'mostly', 'move', 'much',
    'must', 'my', 'myself', 'name', 'namely', 'neither',
    'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
    'none', 'noone', 'nor', 'not', 'nothing', 'now',
    'nowhere', 'of', 'off', 'often', 'on', 'once',
    'one', 'only', 'onto', 'or', 'other', 'others',
    'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
    'own', 'part', 'per', 'perhaps', 'please', 'put',
    'rather', 're', 'same', 'see', 'seem', 'seemed',
    'seeming', 'seems', 'serious', 'several', 'she', 'should',
    'show', 'side', 'since', 'sincere', 'six', 'sixty',
    'so', 'some', 'somehow', 'someone', 'something', 'sometime',
    'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
    'ten', 'than', 'that', 'the', 'their', 'them',
    'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
    'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
    'thin', 'third', 'this', 'those', 'though', 'three',
    'through', 'throughout', 'thru', 'thus', 'to', 'together',
    'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
    'two', 'un', 'under', 'until', 'up', 'upon',
    'us', 'very', 'via', 'was', 'we', 'well',
    'were', 'what', 'whatever', 'when', 'whence', 'whenever',
    'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who',
    'whoever', 'whole', 'whom', 'whose', 'why', 'will',
    'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
    'yourself', 'yourselves'
    ])
  },
  "fr" => {
    :stop_word => Set.new([
    'au',  'aux',  'avec',  'ce',  'ces',  'dans',  'de',  'des',  'du',  'elle',  'en',  'et',  'eux',
    'il',  'je',  'la',  'le',  'leur',  'lui',  'ma',  'mais',  'me',  'même',  'mes',  'moi',  'mon',
    'ne',  'nos',  'notre',  'nous',  'on',  'ou',  'par',  'pas',  'pour',  'qu',  'que',  'qui',  'sa',
    'se',  'ses',  'son',  'sur',  'ta',  'te',  'tes',  'toi',  'ton',  'tu',  'un',  'une',  'vos',  'votre',
    'vous',  'c',  'd',  'j',  'l',  'à',  'm',  'n',  's',  't',  'y',  'été',  'étée',  'étées',
    'étés',  'étant',  'suis',  'es',  'est',  'sommes',  'êtes',  'sont',  'serai',  'seras',
    'sera',  'serons',  'serez',  'seront',  'serais',  'serait',  'serions',  'seriez',  'seraient',
    'étais',  'était',  'étions',  'étiez',  'étaient',  'fus',  'fut',  'fûmes',  'fûtes',
    'furent',  'sois',  'soit',  'soyons',  'soyez',  'soient',  'fusse',  'fusses',  'fût',
    'fussions',  'fussiez',  'fussent',  'ayant',  'eu',  'eue',  'eues',  'eus',  'ai',  'as',
    'avons',  'avez',  'ont',  'aurai',  'auras',  'aura',  'aurons',  'aurez',  'auront',  'aurais',
    'aurait',  'aurions',  'auriez',  'auraient',  'avais',  'avait',  'avions',  'aviez',  'avaient',
    'eut',  'eûmes',  'eûtes',  'eurent',  'aie',  'aies',  'ait',  'ayons',  'ayez',  'aient',  'eusse',
    # 'cela' is listed alongside the common misspelling 'celà' so both are filtered.
    'eusses',  'eût',  'eussions',  'eussiez',  'eussent',  'ceci',  'cela',  'celà',  'cet',  'cette',  'ici',
    'ils',  'les',  'leurs',  'quel',  'quels',  'quelle',  'quelles',  'sans',  'soi'
    ])
  },
  "de" => {
    :stop_word => Set.new([
    'aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere',
    'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf',
    'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das',
    'daß', 'dass', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe',
    'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du',
    'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen',
    'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es',
    'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat',
    'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres',
    'euch', 'im', 'in', 'indem', 'ins', 'ist', 'jede', 'jedem', 'jeden', 'jeder', 'jedes', 'jene', 'jenem', 'jenen', 'jener',
    'jenes', 'jetzt', 'kann', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'können', 'könnte', 'machen', 'man', 'manche',
    'manchem', 'manchen', 'mancher', 'manches', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mit', 'muss', 'musste', 'nach',
    'nicht', 'nichts', 'noch', 'nun', 'nur', 'ob', 'oder', 'ohne', 'sehr', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'selbst',
    'sich', 'sie', 'ihnen', 'sind', 'so', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollte', 'sondern', 'sonst', 'über',
    'um', 'und', 'uns', 'unse', 'unsem', 'unsen', 'unser', 'unses', 'unter', 'viel', 'vom', 'von', 'vor', 'während', 'war', 'waren', 'warst',
    'was', 'weg', 'weil', 'weiter', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wenn', 'werde', 'werden', 'wie', 'wieder', 'will',
    'wir', 'wird', 'wirst', 'wo', 'wollen', 'wollte', 'würde', 'würden', 'zu', 'zum', 'zur', 'zwar', 'zwischen'
    ])
  },
  # Chinese has no stop words configured; only the preprocessing step applies.
  "zh" =>{
    :preprocessing_regexps => {/['`]/ => '',/[_]/ => ' '},
    :stop_word => Set.new([
    ])
  }
}

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.



11
12
13
14
15
16
17
18
19
# File 'lib/stuff-classifier/tokenizer.rb', line 11

# Builds a tokenizer configured for one language.
#
# @param opts [Hash] optional settings
# @option opts [String] :language key used to look up TOKENIZER_PROPERTIES;
#   "en" when the key is absent
# @option opts [Boolean] :stemming whether a stemmer is created; true when the
#   key is absent
def initialize(opts={})
  @language   = opts.fetch(:language, "en")
  @stemming   = opts.fetch(:stemming, true)
  # NOTE(review): this lookup yields nil for a language without an entry in the
  # table — confirm callers only pass configured languages.
  @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]

  # The stemmer is only instantiated when stemming is switched on.
  @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
end

Instance Method Details

#each_word(string) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/stuff-classifier/tokenizer.rb', line 45

# Splits +string+ into normalized word tokens.
#
# The input is processed line by line: the preprocessing regexps (if any) are
# applied, the line is split into words (via +segment+ for "zh", otherwise as
# runs of word characters), words on the ignore list are dropped, and the rest
# are lower-cased — after stemming when stemming is enabled and the word is
# stemable.
#
# @param string [String, nil] text to tokenize; nil is treated as empty text
# @yield [word] optional; when a block is given its return value is collected
#   in place of the word itself
# @return [Array] collected words (or block results); always an Array, empty
#   for blank input, so callers can chain Enumerable methods safely
def each_word(string)
  text = string.to_s.strip
  return [] if text.empty?

  regexps = preprocessing_regexps
  words = []

  text.split("\n").each do |line|
    # Normalize the line in place before splitting it into words.
    regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) } if regexps

    tokens = language == 'zh' ? segment(line) : line.scan(/\p{Word}+/)

    tokens.each do |token|
      next if token == '' || ignore_words.member?(token.downcase)

      if stemming? && stemable?(token)
        token = @stemmer.stem(token).downcase
        # Stemming can turn a word into a stop word, so check the list again.
        next if ignore_words.member?(token)
      else
        token = token.downcase
      end

      words << (block_given? ? (yield token) : token)
    end
  end

  words
end

#ignore_words ⇒ Object



37
38
39
# File 'lib/stuff-classifier/tokenizer.rb', line 37

# Words the tokenizer should discard.
#
# @return [Object] the value assigned through #ignore_words= when it is truthy,
#   otherwise the :stop_word entry of the language properties
def ignore_words
  return @ignore_words if @ignore_words

  @properties[:stop_word]
end

#ignore_words=(value) ⇒ Object



33
34
35
# File 'lib/stuff-classifier/tokenizer.rb', line 33

# Overrides the words to ignore for this tokenizer instance.
#
# @param value [Object] replacement ignore list; a nil or false value makes
#   #ignore_words fall back to the language's :stop_word entry
def ignore_words=(value)
  @ignore_words = value
end

#language ⇒ Object



21
22
23
# File 'lib/stuff-classifier/tokenizer.rb', line 21

# Language code this tokenizer was set up with in the constructor.
#
# @return [Object] the stored language value ("en" when none was given)
def language
  @language
end

#preprocessing_regexps ⇒ Object



29
30
31
# File 'lib/stuff-classifier/tokenizer.rb', line 29

# Substitutions applied to each input line before it is split into words.
#
# @return [Object] the value assigned through #preprocessing_regexps= when it is
#   truthy, otherwise the :preprocessing_regexps entry of the language
#   properties (nil when the language defines none)
def preprocessing_regexps
  return @preprocessing_regexps if @preprocessing_regexps

  @properties[:preprocessing_regexps]
end

#preprocessing_regexps=(value) ⇒ Object



25
26
27
# File 'lib/stuff-classifier/tokenizer.rb', line 25

# Overrides the preprocessing substitutions for this tokenizer instance.
#
# @param value [Object] replacement substitutions; a nil or false value makes
#   #preprocessing_regexps fall back to the language's
#   :preprocessing_regexps entry
def preprocessing_regexps=(value)
  @preprocessing_regexps = value
end

#stemming? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/stuff-classifier/tokenizer.rb', line 41

# Tells whether stemming is enabled for this tokenizer.
#
# @return [Object] the stored stemming flag when it is truthy, otherwise false
#   (so an unset or nil flag never leaks out as nil)
def stemming?
  return false unless @stemming

  @stemming
end