Class: Preprocessor
- Inherits:
-
Object
- Object
- Preprocessor
- Defined in:
- lib/libsvm_preprocessor/preprocessor.rb
Constant Summary collapse
# Preset option sets, keyed by an integer id. #override_options looks a
# preset up by the caller's :numeric_type value.
#
# Every preset uses lang "it" and differs in the n-gram mode
# (:unigram, :bigram, :trichar) and in whether stemming and stopword
# handling are switched on.
#
# The outer hash is frozen so the preset table cannot be modified by accident.
OPTIONS_MAP = {
  0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
  1  => { lang: "it", mode: :bigram,  stemming: false, stopword: false },
  2  => { lang: "it", mode: :unigram, stemming: true,  stopword: false },
  3  => { lang: "it", mode: :bigram,  stemming: true,  stopword: false },
  4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
  5  => { lang: "it", mode: :bigram,  stemming: false, stopword: true },
  6  => { lang: "it", mode: :unigram, stemming: true,  stopword: true },
  7  => { lang: "it", mode: :bigram,  stemming: true,  stopword: true },
  8  => { lang: "it", mode: :trichar, stemming: true,  stopword: true },
  9  => { lang: "it", mode: :trichar, stemming: true,  stopword: false },
  10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
  11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
}.freeze
Instance Attribute Summary collapse
-
#categories ⇒ Object
readonly
Returns the value of attribute categories.
-
#instances ⇒ Object
readonly
Returns the value of attribute instances.
-
#non_zero_features ⇒ Object
readonly
Returns the value of attribute non_zero_features.
Class Method Summary collapse
Instance Method Summary collapse
- #hash_of_ngrams ⇒ Object
-
#initialize(options = {}) ⇒ Preprocessor
constructor
A new instance of Preprocessor.
-
#nice_string(v) ⇒ Object
This method is only meant to stringify the vector in the very same format as libsvm (so that a diff against libsvm output stays clean).
- #options ⇒ Object
- #override_options(options) ⇒ Object
- #push(data, testing: false) ⇒ Object
- #toSVM(vector) ⇒ Object
- #use(input_path, output_file = nil, testing: false) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Preprocessor
Returns a new instance of Preprocessor.
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 47
# Builds a new Preprocessor.
#
# When options[:numeric_type] is set, the matching OPTIONS_MAP preset
# replaces the supplied options and only the caller's :output entry is
# carried over. Otherwise the supplied options are used unchanged.
#
# NOTE(review): the extracted listing had lost every identifier containing
# "options"; they are restored here from the documented signatures. The
# argument passed to Tokenizer.new / FeatureGenerator.new is presumed to be
# the resolved @options -- confirm against the original file.
#
# @param options [Hash] configuration; :numeric_type and :output are the
#   only keys read directly here
def initialize(options = {})
  if options[:numeric_type]
    @options = override_options(options)
    @options = @options.merge(output: options[:output])
  else
    @options = options
  end

  @tokenizer = Tokenizer.new(@options)
  @generator = FeatureGenerator.new(@options)

  # Per-split running totals and collected vectors, filled in by #push.
  @non_zero_features = {}
  @non_zero_features[:testing] = 0
  @non_zero_features[:training] = 0
  @instances = {}
  @instances[:testing] = []
  @instances[:training] = []

  # Category => id table; ids are handed out starting from this counter.
  @categories = {}
  @current_category_id = -1
end
Instance Attribute Details
#categories ⇒ Object (readonly)
Returns the value of attribute categories.
8 9 10 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 8
# Returns the value of attribute categories.
#
# @return [Object] whatever is currently stored in @categories
def categories
  @categories
end
#instances ⇒ Object (readonly)
Returns the value of attribute instances.
9 10 11 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 9
# Returns the value of attribute instances.
#
# @return [Object] whatever is currently stored in @instances
def instances
  @instances
end
#non_zero_features ⇒ Object (readonly)
Returns the value of attribute non_zero_features.
10 11 12 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 10
# Returns the value of attribute non_zero_features.
#
# @return [Object] whatever is currently stored in @non_zero_features
def non_zero_features
  @non_zero_features
end
Class Method Details
.options_map(key) ⇒ Object
39 40 41 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 39
# Renders the OPTIONS_MAP preset stored under +key+ as one line of
# "name: value" pairs separated by " | ".
#
# A key that is not present in OPTIONS_MAP makes the lookup return nil, so
# the call fails with NoMethodError.
#
# @param key [Object] key looked up in OPTIONS_MAP
# @return [String] the preset's pairs, in hash order, joined by " | "
def self.options_map(key)
  OPTIONS_MAP[key].map { |name, value| "#{name}: #{value}" }.join(" | ")
end
.options_map_size ⇒ Object
35 36 37 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 35
# Reports how many presets OPTIONS_MAP contains.
#
# @return [Integer] number of entries in OPTIONS_MAP
def self.options_map_size
  OPTIONS_MAP.size
end
Instance Method Details
#hash_of_ngrams ⇒ Object
27 28 29 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 27
# Delegates to the feature generator and returns its hash_of_ngrams.
#
# @return [Object] the result of @generator.hash_of_ngrams
def hash_of_ngrams
  @generator.hash_of_ngrams
end
#nice_string(v) ⇒ Object
This method is only meant to stringify the vector in the very same format as libsvm (so that a diff against libsvm output stays clean).
102 103 104 105 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 102
# Stringifies +v+ in the same layout libsvm uses, so that diffs against
# libsvm output stay clean.
#
# The elements are joined with single spaces. When the second element is
# the empty string, only the first element followed by one trailing space
# is returned.
#
# @param v [Array] vector to stringify; v[0] and v[1] are inspected
# @return [String] the space-separated representation
def nice_string(v)
  return v.join(" ") if v[1] != ""

  "#{v[0]} "
end
#options ⇒ Object
43 44 45 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 43
# Returns the options stored on this instance.
#
# NOTE(review): the extracted listing had lost both the method name and its
# body ("def end"); they are restored from the documented "#options"
# signature and are presumed to read @options -- confirm against the
# original file.
#
# @return [Object] whatever is currently stored in @options
def options
  @options
end
#override_options(options) ⇒ Object
31 32 33 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 31
# Looks up the OPTIONS_MAP preset selected by options[:numeric_type].
#
# @param options [Hash] caller-supplied options; only :numeric_type is read
# @return [Hash, nil] the matching preset, or nil when OPTIONS_MAP has no
#   entry for that value
def override_options(options)
  OPTIONS_MAP[options[:numeric_type]]
end
#push(data, testing: false) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 70
# Vectorizes one (category, string) pair and records the result.
#
# A category seen for the first time is given a fresh id via
# next_category_id. The vector is appended to the :testing or :training
# list of @instances, and the size of its last element is added to the
# matching @non_zero_features counter.
#
# @param data [Array] two-element pair: category first, then the text
# @param testing [Boolean] record under :testing instead of :training
# @return [Object] the vector produced by vectorize
def push(data, testing: false)
  category, string = data

  # If it is a new category I need to associate a new id
  @categories[category] ||= next_category_id

  vector = vectorize(category, string, testing: testing)

  bucket = testing ? :testing : :training
  @instances[bucket] << vector
  @non_zero_features[bucket] += vector.last.size

  vector
end
#toSVM(vector) ⇒ Object
87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 87
# Formats a vector as one line of libsvm text: the label followed by
# space-separated "feature:1" entries.
#
# @param vector [Array] the label is vector.first; vector.last is a list of
#   hashes, and the first key of each hash is used as the feature id
# @return [String] the libsvm line; a label with no features is rendered as
#   the label followed by a single trailing space
def toSVM(vector)
  label = vector.first

  # the following line is made to have clean diff with libshorttext
  return "#{label} " if vector.last.empty?

  # Every feature is written with the constant value 1. To take the stored
  # feature value into account instead, build each entry as
  #   "#{h.keys.first}:#{h[h.keys.first]}"
  features = vector.last.map { |h| "#{h.keys.first}:1" }.join(" ")

  "#{label} #{features}"
end
#use(input_path, output_file = nil, testing: false) ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 107
# Reads the CSV at +input_path+, pushes every row through #push and writes
# the resulting libsvm line (via #toSVM) for each one.
#
# Lines go to the file at +output_file+ when it is given (opened in "w"
# mode), otherwise to standard output.
#
# @param input_path [String] path of the CSV file to read
# @param output_file [String, nil] path of the file to write, or nil to
#   print to standard output
# @param testing [Boolean] forwarded to #push
def use(input_path, output_file = nil, testing: false)
  emit_rows = lambda do |io|
    # The options hash is splatted so it is passed as keywords; on Ruby 3 a
    # positional hash would be taken for CSV.foreach's mode argument.
    CSV.foreach(input_path, **::LibsvmPreprocessor::CSV_OPTIONS) do |row|
      io.puts toSVM(push(row, testing: testing))
    end
  end

  if output_file
    # Block form closes the file even when a row raises part-way through.
    File.open(output_file, "w", &emit_rows)
  else
    emit_rows.call($stdout)
  end
end