Class: Preprocessor
- Inherits:
-
Object
- Object
- Preprocessor
- Defined in:
- lib/libsvm_preprocessor/preprocessor.rb
Constant Summary collapse
# Preset configurations keyed by an integer code. Each entry carries the four
# settings :lang, :mode, :stemming and :stopword. The outer table is frozen so
# presets cannot be added or removed at runtime; the per-preset hashes are
# left unfrozen because this listing does not show whether consumers modify
# the hash they are handed.
OPTIONS_MAP = {
  0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
  1  => { lang: "it", mode: :bigram,  stemming: false, stopword: false },
  2  => { lang: "it", mode: :unigram, stemming: true,  stopword: false },
  3  => { lang: "it", mode: :bigram,  stemming: true,  stopword: false },
  4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
  5  => { lang: "it", mode: :bigram,  stemming: false, stopword: true },
  6  => { lang: "it", mode: :unigram, stemming: true,  stopword: true },
  7  => { lang: "it", mode: :bigram,  stemming: true,  stopword: true },
  8  => { lang: "it", mode: :trichar, stemming: true,  stopword: true },
  9  => { lang: "it", mode: :trichar, stemming: true,  stopword: false },
  10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
  11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
}.freeze
Instance Attribute Summary collapse
-
#categories ⇒ Object
readonly
Returns the value of attribute categories.
-
#instances ⇒ Object
readonly
Returns the value of attribute instances.
-
#non_zero_features ⇒ Object
readonly
Returns the value of attribute non_zero_features.
Class Method Summary collapse
Instance Method Summary collapse
- #hash_of_ngrams ⇒ Object
-
#initialize(options = {}) ⇒ Preprocessor
constructor
A new instance of Preprocessor.
-
#nice_string(v) ⇒ Object
This method is only meant to stringify the vector in the very same format as libsvm (so that diff output stays clean).
- #options ⇒ Object
- #override_options(options) ⇒ Object
- #push(data, testing: false) ⇒ Object
- #toSVM(vector) ⇒ Object
- #use(input_path, testing: false) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Preprocessor
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 46
# Builds a preprocessor with empty training/testing state.
#
# NOTE(review): every identifier containing "options" was missing from this
# listing; they are restored here from the documented signatures
# (#initialize(options = {}) and #override_options(options)) - confirm against
# the real source file.
#
# @param options [Hash] settings handed to Tokenizer.new and
#   FeatureGenerator.new (presumably Hash-like; only #[] is used here). When
#   options[:numeric_type] is set, the settings are replaced by whatever
#   override_options returns for them.
def initialize(options = {})
  options = override_options(options) if options[:numeric_type]
  @options = options
  @tokenizer = Tokenizer.new(options)
  @generator = FeatureGenerator.new(options)
  # Running totals and collected instances, split by data set.
  @non_zero_features = { testing: 0, training: 0 }
  @instances = { testing: [], training: [] }
  @categories = {}
  # Starts at -1; how the counter is advanced is not shown in this listing.
  @current_category_id = -1
end
Instance Attribute Details
#categories ⇒ Object (readonly)
Returns the value of attribute categories.
7 8 9 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 7
# Read-only accessor.
#
# @return [Object] the current value of @categories
def categories
  @categories
end
#instances ⇒ Object (readonly)
Returns the value of attribute instances.
8 9 10 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 8
# Read-only accessor.
#
# @return [Object] the current value of @instances
def instances
  @instances
end
#non_zero_features ⇒ Object (readonly)
Returns the value of attribute non_zero_features.
9 10 11 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 9
# Read-only accessor.
#
# @return [Object] the current value of @non_zero_features
def non_zero_features
  @non_zero_features
end
Class Method Details
.options_map(key) ⇒ Object
38 39 40 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 38
# Renders one OPTIONS_MAP preset as a single line of "name: value" pairs
# separated by " | ".
#
# NOTE(review): the method name was missing from this listing and is restored
# from the documented signature `.options_map(key)`.
#
# @param key [Object] a key of OPTIONS_MAP
# @return [String] the formatted preset
def self.options_map(key)
  OPTIONS_MAP[key].map { |k, v| "#{k}: #{v}" }.join(" | ")
end
.options_map_size ⇒ Object
34 35 36 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 34
# Number of presets in OPTIONS_MAP.
#
# NOTE(review): the method name was missing from this listing and is restored
# from the documented signature `.options_map_size`.
#
# @return [Integer] OPTIONS_MAP.size
def self.options_map_size
  OPTIONS_MAP.size
end
Instance Method Details
#hash_of_ngrams ⇒ Object
26 27 28 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 26
# Delegates to the feature generator held in @generator.
#
# @return [Object] whatever @generator.hash_of_ngrams returns
def hash_of_ngrams
  @generator.hash_of_ngrams
end
#nice_string(v) ⇒ Object
This method is only meant to stringify the vector in the very same format as libsvm (so that diff output stays clean).
93 94 95 96 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 93
# Stringifies a vector: its elements joined by single spaces. When the second
# element is the empty string the result is the first element followed by one
# trailing space.
#
# @param v [Array] vector to stringify (element 0 is presumably the label -
#   confirm against callers)
# @return [String]
def nice_string(v)
  return "#{v[0]} " if v[1] == ""

  v.join(" ")
end
#options ⇒ Object
42 43 44 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 42
# Returns the settings this preprocessor was configured with.
#
# NOTE(review): both the method name and its body were missing from this
# listing. The name comes from the documented signature `#options`; the body
# is presumed to read @options - confirm against the real source file.
#
# @return [Object] the current value of @options
def options
  @options
end
#override_options(options) ⇒ Object
30 31 32 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 30
# Looks up the OPTIONS_MAP preset selected by options[:numeric_type].
#
# NOTE(review): the method name and parameter were missing from this listing
# and are restored from the documented signature `#override_options(options)`.
#
# @param options [Hash] settings whose :numeric_type entry selects the preset
# @return [Hash, nil] the preset, or nil when OPTIONS_MAP has no such key
def override_options(options)
  OPTIONS_MAP[options[:numeric_type]]
end
#push(data, testing: false) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 66
# Vectorizes one labelled example and records it in the training or the
# testing set.
#
# @param data [Array] a two-element (category, string) pair
# @param testing [Boolean] when truthy the example goes to the testing set,
#   otherwise to the training set
# @return [Object] the vector returned by vectorize; its last element must
#   respond to #size
def push(data, testing: false)
  category, string = data
  # A category seen for the first time gets the next free id.
  @categories[category] ||= next_category_id
  v = vectorize(category, string, testing: testing)

  set = testing ? :testing : :training
  @instances[set] << v
  @non_zero_features[set] += v.last.size
  v
end
#toSVM(vector) ⇒ Object
83 84 85 86 87 88 89 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 83
# Formats a vector as one line: the first element, a space, then the
# space-separated "key:value" features.
#
# @param vector [Array] first element is printed as-is; last element is a
#   list of single-pair hashes (only the first pair of each hash is used)
# @return [String]
def toSVM(vector)
  # Keep the trailing space on feature-less rows so that a diff against
  # libshorttext output stays clean.
  return "#{vector.first} " if vector.last.empty?

  features = vector.last.map do |h|
    key, value = h.first
    "#{key}:#{value}"
  end
  "#{vector.first} #{features.join(' ')}"
end
#use(input_path, testing: false) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 98
# Reads input_path as CSV, pushes every row and writes one toSVM line per row,
# either to the file named by options[:output] or, when that is not set, to
# standard output.
#
# NOTE(review): the receiver in front of `[:output]` and `.output` was missing
# from this listing; it is presumed to be the settings object exposed by
# #options. Both reads now go through #[], which also works when the settings
# are a plain Hash - confirm against the real source file.
#
# @param input_path [String] path of the CSV file to read
# @param testing [Boolean] forwarded to push
def use(input_path, testing: false)
  convert = lambda do |sink|
    # CSV options are passed as keywords: a positional hash is taken for the
    # `mode` argument on Ruby 3. Assumes CSV_OPTIONS is a Hash with symbol
    # keys - TODO confirm.
    CSV.foreach(input_path, **::LibsvmPreprocessor::CSV_OPTIONS) do |row|
      sink.puts toSVM(push(row, testing: testing))
    end
  end

  output_path = options[:output]
  if output_path
    # Block form closes the file even when a row raises.
    File.open(output_path, "w") { |file| convert.call(file) }
  else
    convert.call($stdout)
  end
end