Class: StuffClassifier::Tokenizer

Inherits:
Object
  • Object
show all
Includes:
RMMSeg
Defined in:
lib/stuff-classifier/tokenizer.rb

Constant Summary collapse

# Per-language tokenizer settings, keyed by language code.
#
# Each entry may provide:
#   :preprocessing_regexps - Hash of Regexp => replacement String
#   :stop_word             - Set of lowercase words to drop
#
# The "fr" entry defines no :preprocessing_regexps.
TOKENIZER_PROPERTIES = {
  "en" => {
    :preprocessing_regexps => {/['`]/ => '', /[_]/ => ' '},
    :stop_word => Set.new([
      # NOTE(review): the original lists three empty strings here; they
      # collapse to a single Set member. Possibly non-ASCII entries lost in
      # transit -- confirm against the repository.
      '', '', '',
      'a', 'about', 'above', 'across', 'after', 'afterwards',
      'again', 'against', 'all', 'almost', 'alone', 'along',
      'already', 'also', 'although', 'always', 'am', 'among',
      'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
      'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
      'are', 'around', 'as', 'at', 'back', 'be',
      'became', 'because', 'become', 'becomes', 'becoming', 'been',
      'before', 'beforehand', 'behind', 'being', 'below', 'beside',
      'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
      'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
      'co', 'computer', 'con', 'could', 'couldnt', 'cry',
      'de', 'describe', 'detail', 'do', 'done', 'down',
      'due', 'during', 'each', 'eg', 'eight', 'either',
      'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
      'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
      # 'fify' is a typo for 'fifty'; it is kept so that nothing that was
      # filtered before stops being filtered.
      'fify', 'fifty', 'fill', 'find', 'fire', 'first', 'five',
      'for', 'former', 'formerly', 'forty', 'found', 'four',
      'from', 'front', 'full', 'further', 'get', 'give',
      'go', 'had', 'has', 'hasnt', 'have', 'he',
      'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
      'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
      'how', 'however', 'hundred', 'i', 'ie', 'if',
      'in', 'inc', 'indeed', 'interest', 'into', 'is',
      'it', 'its', 'itself', 'keep', 'last', 'latter',
      'latterly', 'least', 'less', 'ltd', 'made', 'many',
      'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
      'more', 'moreover', 'most', 'mostly', 'move', 'much',
      'must', 'my', 'myself', 'name', 'namely', 'neither',
      'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
      'none', 'noone', 'nor', 'not', 'nothing', 'now',
      'nowhere', 'of', 'off', 'often', 'on', 'once',
      'one', 'only', 'onto', 'or', 'other', 'others',
      'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
      'own', 'part', 'per', 'perhaps', 'please', 'put',
      'rather', 're', 'same', 'see', 'seem', 'seemed',
      'seeming', 'seems', 'serious', 'several', 'she', 'should',
      'show', 'side', 'since', 'sincere', 'six', 'sixty',
      'so', 'some', 'somehow', 'someone', 'something', 'sometime',
      'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
      'ten', 'than', 'that', 'the', 'their', 'them',
      'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
      'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
      'thin', 'third', 'this', 'those', 'though', 'three',
      'through', 'throughout', 'thru', 'thus', 'to', 'together',
      'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
      'two', 'un', 'under', 'until', 'up', 'upon',
      'us', 'very', 'via', 'was', 'we', 'well',
      'were', 'what', 'whatever', 'when', 'whence', 'whenever',
      'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
      'wherever', 'whether', 'which', 'while', 'whither', 'who',
      'whoever', 'whole', 'whom', 'whose', 'why', 'will',
      'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
      'yourself', 'yourselves'
    ])
  },
  "fr" => {
    :stop_word => Set.new([
      'au',  'aux',  'avec',  'ce',  'ces',  'dans',  'de',  'des',  'du',  'elle',  'en',  'et',  'eux',
      'il',  'je',  'la',  'le',  'leur',  'lui',  'ma',  'mais',  'me',  'même',  'mes',  'moi',  'mon',
      'ne',  'nos',  'notre',  'nous',  'on',  'ou',  'par',  'pas',  'pour',  'qu',  'que',  'qui',  'sa',
      'se',  'ses',  'son',  'sur',  'ta',  'te',  'tes',  'toi',  'ton',  'tu',  'un',  'une',  'vos',  'votre',
      'vous',  'c',  'd',  'j',  'l',  'à',  'm',  'n',  's',  't',  'y',  'été',  'étée',  'étées',
      'étés',  'étant',  'suis',  'es',  'est',  'sommes',  'êtes',  'sont',  'serai',  'seras',
      'sera',  'serons',  'serez',  'seront',  'serais',  'serait',  'serions',  'seriez',  'seraient',
      'étais',  'était',  'étions',  'étiez',  'étaient',  'fus',  'fut',  'fûmes',  'fûtes',
      'furent',  'sois',  'soit',  'soyons',  'soyez',  'soient',  'fusse',  'fusses',  'fût',
      'fussions',  'fussiez',  'fussent',  'ayant',  'eu',  'eue',  'eues',  'eus',  'ai',  'as',
      'avons',  'avez',  'ont',  'aurai',  'auras',  'aura',  'aurons',  'aurez',  'auront',  'aurais',
      'aurait',  'aurions',  'auriez',  'auraient',  'avais',  'avait',  'avions',  'aviez',  'avaient',
      'eut',  'eûmes',  'eûtes',  'eurent',  'aie',  'aies',  'ait',  'ayons',  'ayez',  'aient',  'eusse',
      # 'celà' previously carried a trailing space and so could never match a
      # token; the unaccented spelling 'cela' was missing altogether.
      'eusses',  'eût',  'eussions',  'eussiez',  'eussent',  'ceci',  'cela',  'celà',  'cet',  'cette',  'ici',
      'ils',  'les',  'leurs',  'quel',  'quels',  'quelle',  'quelles',  'sans',  'soi'
    ])
  }
}

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.



13
14
15
16
17
18
19
20
21
# File 'lib/stuff-classifier/tokenizer.rb', line 13

# Builds a tokenizer for the requested language.
#
# @param opts [Hash] tokenizer options
# @option opts [String] :language ("en") key into TOKENIZER_PROPERTIES, also
#   handed to the stemmer
# @option opts [Boolean] :stemming (true) whether a Lingua::Stemmer is created
def initialize(opts={})
  @language = opts.fetch(:language, "en")
  # A language with no TOKENIZER_PROPERTIES entry used to leave @properties
  # nil, so #ignore_words and #preprocessing_regexps raised NoMethodError.
  # Fall back to "no stop words, no preprocessing" instead.
  @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES.fetch(@language) do
    { :stop_word => Set.new }
  end

  @stemming = opts.fetch(:stemming, true)
  @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
end

Instance Method Details

#each_word(string) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/stuff-classifier/tokenizer.rb', line 47

# Splits +string+ into normalized tokens.
#
# The text is handled line by line: the preprocessing regexps (if any) are
# applied, the line is cut into words by #segment, stop words are dropped,
# and every remaining word is lowercased and, when stemming is enabled and
# the word is stemable, stemmed.
#
# @param string [String] text to tokenize
# @yield [word] optional; when a block is given, its return value is
#   collected in place of the token
# @return [Array, nil] the collected tokens (or block results); nil when
#   +string+ is blank
def each_word(string)
  string = string.strip
  return if string == ''

  stop_words = ignore_words
  regexps = preprocessing_regexps
  words = []

  string.split("\n").each do |line|
    # split returns fresh strings, so the in-place gsub! never touches the
    # caller's argument.
    regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) } if regexps

    segment(line).each do |w|
      next if w == ''

      word = w.downcase
      next if stop_words.member?(word)

      if stemming? && stemable?(w)
        # Stem the lowercased form: stemming the raw word first let "CATS"
        # and "cats" produce different tokens (Snowball-style stemmers are
        # documented to expect lowercase input). The stem may itself be a
        # stop word, hence the second check.
        word = @stemmer.stem(word).downcase
        next if stop_words.member?(word)
      end

      words << (block_given? ? yield(word) : word)
    end
  end

  words
end

#ignore_words ⇒ Object



39
40
41
# File 'lib/stuff-classifier/tokenizer.rb', line 39

# Stop words skipped during tokenization: the collection assigned through
# #ignore_words= when there is one, otherwise the :stop_word entry of the
# language properties.
def ignore_words
  return @ignore_words if @ignore_words

  @properties[:stop_word]
end

#ignore_words=(value) ⇒ Object



35
36
37
# File 'lib/stuff-classifier/tokenizer.rb', line 35

# Stores a custom stop-word collection that #ignore_words will return in
# place of the language default.
def ignore_words=(value)
  instance_variable_set(:@ignore_words, value)
end

#language ⇒ Object



23
24
25
# File 'lib/stuff-classifier/tokenizer.rb', line 23

# Language code this tokenizer was configured with (reads @language).
def language
  instance_variable_get(:@language)
end

#preprocessing_regexps ⇒ Object



31
32
33
# File 'lib/stuff-classifier/tokenizer.rb', line 31

# Regexp => replacement pairs applied to each line before segmentation: the
# value assigned through #preprocessing_regexps= when there is one,
# otherwise the :preprocessing_regexps entry of the language properties
# (which may be absent, giving nil).
def preprocessing_regexps
  custom = @preprocessing_regexps
  custom ? custom : @properties[:preprocessing_regexps]
end

#preprocessing_regexps=(value) ⇒ Object



27
28
29
# File 'lib/stuff-classifier/tokenizer.rb', line 27

# Stores custom preprocessing rules that #preprocessing_regexps will return
# in place of the language default.
def preprocessing_regexps=(value)
  instance_variable_set(:@preprocessing_regexps, value)
end

#stemming? ⇒ Boolean

Returns:

  • (Boolean)


43
44
45
# File 'lib/stuff-classifier/tokenizer.rb', line 43

# Whether stemming is enabled: the stored @stemming value when it is truthy,
# false otherwise (including when it was never set).
def stemming?
  @stemming ? @stemming : false
end