Class: Preprocessor

Inherits:
Object
  • Object
show all
Defined in:
lib/libsvm_preprocessor/preprocessor.rb

Constant Summary collapse

# Preset option sets, keyed by the integer passed as options[:numeric_type]
# (see #override_options). Every preset targets lang "it" and picks one
# combination of tokenization mode (:unigram, :bigram, :trichar), stemming
# and stopword flags.
#
# The outer table is frozen so presets cannot be added or removed at runtime.
# The inner hashes are deliberately left unfrozen: they are handed as-is to
# collaborators whose handling of the options is not visible here.
OPTIONS_MAP =
{
  0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
  1  => { lang: "it", mode: :bigram, stemming: false, stopword: false },
  2  => { lang: "it", mode: :unigram, stemming: true, stopword: false },
  3  => { lang: "it", mode: :bigram, stemming: true, stopword: false },
  4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
  5  => { lang: "it", mode: :bigram, stemming: false, stopword: true },
  6  => { lang: "it", mode: :unigram, stemming: true, stopword: true },
  7  => { lang: "it", mode: :bigram, stemming: true, stopword: true },
  8  => { lang: "it", mode: :trichar, stemming: true, stopword: true },
  9  => { lang: "it", mode: :trichar, stemming: true, stopword: false },
  10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
  11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Preprocessor



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 46

# Builds a preprocessor with empty training/testing state.
#
# @param options [Hash] configuration forwarded to Tokenizer and
#   FeatureGenerator. When options[:numeric_type] is set, the whole hash is
#   replaced by the result of #override_options before anything else uses it.
#   NOTE(review): that replacement drops any other keys the caller passed
#   (e.g. :output) -- confirm this is intended.
def initialize(options = {})
  options = override_options(options) if options[:numeric_type]

  @options   = options
  @tokenizer = Tokenizer.new(options)
  @generator = FeatureGenerator.new(options)

  # Running totals of non-zero features, per data set.
  @non_zero_features = { testing: 0, training: 0 }

  # Vectors produced so far, per data set.
  @instances = { testing: [], training: [] }

  # Category label => numeric id; ids are handed out starting after -1.
  @categories = {}
  @current_category_id = -1
end

Instance Attribute Details

#categories ⇒ Object (readonly)

Returns the value of attribute categories.



7
8
9
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 7

# Reader for @categories, the table the constructor creates as an empty Hash
# and #push fills with one entry per category it has seen (category => id).
#
# @return [Hash] the live table, not a copy
def categories
  @categories
end

#instances ⇒ Object (readonly)

Returns the value of attribute instances.



8
9
10
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 8

# Reader for @instances, a Hash with the keys :testing and :training, each
# holding the Array of vectors that #push has appended for that data set.
#
# @return [Hash] the live structure, not a copy
def instances
  @instances
end

#non_zero_features ⇒ Object (readonly)

Returns the value of attribute non_zero_features.



9
10
11
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 9

# Reader for @non_zero_features, a Hash with the keys :testing and :training.
# Each counter starts at 0 and #push adds the size of the last element of
# every vector it stores for that data set.
#
# @return [Hash] the live counters, not a copy
def non_zero_features
  @non_zero_features
end

Class Method Details

.options_map(key) ⇒ Object



38
39
40
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 38

# Renders one preset from OPTIONS_MAP as a single human-readable line of the
# form "name: value | name: value | ...".
#
# @param key [Integer] preset id, i.e. a key of OPTIONS_MAP
# @return [String] the preset's entries joined with " | "
# @raise [NoMethodError] if +key+ is not present in OPTIONS_MAP
def self.options_map(key)
  rendered_pairs = OPTIONS_MAP[key].map do |name, value|
    "#{name}: #{value}"
  end
  rendered_pairs.join(" | ")
end

.options_map_size ⇒ Object



34
35
36
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 34

# Number of presets available in OPTIONS_MAP.
#
# @return [Integer]
def self.options_map_size
  OPTIONS_MAP.length
end

Instance Method Details

#hash_of_ngrams ⇒ Object



26
27
28
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 26

# Delegates to the FeatureGenerator created in the constructor.
#
# @return [Object] whatever @generator.hash_of_ngrams returns; its shape is
#   defined by FeatureGenerator, which is not shown here
def hash_of_ngrams
  @generator.hash_of_ngrams
end

#nice_string(v) ⇒ Object

This method only stringifies the vector in exactly the same format that libsvm uses, so that a diff against libsvm's output stays clean.



93
94
95
96
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 93

# Stringifies a vector with the same spacing libsvm uses, so the output can
# be diffed against it without whitespace noise.
#
# @param v [Array] the vector; v[0] is emitted first and v[1] is compared
#   with the empty string
# @return [String] "<v[0]> " (one trailing space) when v[1] is "", otherwise
#   all elements of +v+ joined by two spaces
def nice_string(v)
  if v[1] == ""
    "#{v[0]} "
  else
    v.join("  ")
  end
end

#options ⇒ Object



42
43
44
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 42

# Reader for the options in effect: the hash given to the constructor, or the
# result of #override_options when :numeric_type was supplied.
#
# @return [Hash, nil] the live options object, not a copy
def options
  @options
end

#override_options(options) ⇒ Object



30
31
32
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 30

# Looks up the preset selected by options[:numeric_type].
#
# @param options [Hash] caller-supplied options; only :numeric_type is read
# @return [Hash, nil] the matching OPTIONS_MAP entry, or nil when the id is
#   not a key of OPTIONS_MAP
def override_options(options)
  preset_id = options[:numeric_type]
  OPTIONS_MAP[preset_id]
end

#push(data, testing: false) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 66

# Vectorizes one labelled example and records it in the chosen data set.
#
# @param data [Array] a two-element row: the category, then the text
# @param testing [Boolean] truthy stores the vector under :testing,
#   otherwise under :training
# @return [Object] the vector returned by #vectorize; its last element must
#   respond to #size
def push(data, testing: false)
  category, string = data

  # A category seen for the first time gets the next free id.
  @categories[category] ||= next_category_id

  vector = vectorize(category, string, testing: testing)

  bucket = testing ? :testing : :training
  @instances[bucket] << vector
  @non_zero_features[bucket] += vector.last.size

  vector
end

#toSVM(vector) ⇒ Object



83
84
85
86
87
88
89
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 83

# Formats a vector as one line of libsvm input.
#
# @param vector [Array] the label first and, last, an Array of
#   single-entry Hashes mapping a feature index to its value
# @return [String] "<label> " when there are no features, otherwise
#   "<label>  <index>:<value> <index>:<value> ..." (two spaces after the label)
def toSVM(vector)
  label    = vector.first
  features = vector.last

  # Label plus one trailing space keeps the diff against libshorttext clean.
  return "#{label} " if features.empty?

  rendered = features.map do |feature|
    index = feature.keys.first
    "#{index}:#{feature[index]}"
  end

  "#{label}  #{rendered.join(" ")}"
end

#use(input_path, testing: false) ⇒ Object



98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 98

# Reads a CSV file, pushes every row through #push and writes the resulting
# libsvm line (see #toSVM) either to the file named by @options[:output] or,
# when that option is absent, to standard output.
#
# @param input_path [String] path of the CSV file to read; it is parsed with
#   ::LibsvmPreprocessor::CSV_OPTIONS
# @param testing [Boolean] forwarded to #push to select the data set
# @return [nil]
def use(input_path, testing: false)
  # Splatted so the options reach CSV.foreach as keyword arguments; a
  # positional Hash would be bound to its +mode+ parameter on Ruby 3.
  csv_options = ::LibsvmPreprocessor::CSV_OPTIONS
  # @options is a Hash, so the path must be read with [] rather than a
  # method call.
  output_path = @options[:output]

  if output_path
    # Block form closes the file even if a row raises part-way through.
    File.open(output_path, "w") do |output_file|
      CSV.foreach(input_path, **csv_options) do |row|
        output_file.puts toSVM(push(row, testing: testing))
      end
    end
  else
    CSV.foreach(input_path, **csv_options) do |row|
      puts toSVM(push(row, testing: testing))
    end
  end
end