Class: Preprocessor

Inherits:
Object
  • Object
show all
Defined in:
lib/libsvm_preprocessor/preprocessor.rb

Constant Summary collapse

# Numbered presets selectable through the :numeric_type option.
# Presets 0-7 pair :unigram/:bigram with every stemming/stopword combination
# (even ids are unigram, odd ids are bigram); presets 8-11 cover the same
# combinations for :trichar, listed in a different order. Every preset uses
# lang "it".
# The table is frozen so this shared constant cannot be modified by accident.
OPTIONS_MAP =
{
  0  => { lang: "it", mode: :unigram, stemming: false, stopword: false },
  1  => { lang: "it", mode: :bigram, stemming: false, stopword: false },
  2  => { lang: "it", mode: :unigram, stemming: true, stopword: false },
  3  => { lang: "it", mode: :bigram, stemming: true, stopword: false },
  4  => { lang: "it", mode: :unigram, stemming: false, stopword: true },
  5  => { lang: "it", mode: :bigram, stemming: false, stopword: true },
  6  => { lang: "it", mode: :unigram, stemming: true, stopword: true },
  7  => { lang: "it", mode: :bigram, stemming: true, stopword: true },
  8  => { lang: "it", mode: :trichar, stemming: true, stopword: true },
  9  => { lang: "it", mode: :trichar, stemming: true, stopword: false },
  10 => { lang: "it", mode: :trichar, stemming: false, stopword: true },
  11 => { lang: "it", mode: :trichar, stemming: false, stopword: false },
}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Preprocessor

Returns a new instance of Preprocessor.



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 47

# Builds a preprocessor from an options hash.
#
# When options[:numeric_type] is given, the matching OPTIONS_MAP preset
# (looked up through #override_options) replaces the caller's settings and
# only the caller's :output value is carried over. Otherwise the hash is
# used exactly as passed.
#
# @param options [Hash] settings; also handed to Tokenizer and FeatureGenerator
# @raise [ArgumentError] if :numeric_type does not select a known preset
def initialize(options = {})
  preset_id = options[:numeric_type]
  # Only nil/false are falsy in Ruby, so preset 0 is still selected here.
  if preset_id
    preset = override_options(options)
    # Fail with a clear message instead of a NoMethodError on nil.merge.
    raise ArgumentError, "unknown numeric_type: #{preset_id.inspect}" unless preset

    @options = preset.merge(output: options[:output])
  else
    @options = options
  end

  @tokenizer = Tokenizer.new(@options)
  @generator = FeatureGenerator.new(@options)

  # Running totals of non-zero features, one counter per data split.
  @non_zero_features = { testing: 0, training: 0 }
  # Vectors collected by #push, one list per data split.
  @instances = { testing: [], training: [] }

  # Category => id table, filled lazily by #push.
  @categories = {}
  # NOTE(review): presumably advanced by next_category_id before first use,
  # so the first id handed out would be 0 — that helper is not visible here.
  @current_category_id = -1
end

Instance Attribute Details

#categories ⇒ Object (readonly)

Returns the value of attribute categories.



8
9
10
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 8

# Returns the category table: a Hash filled by #push that maps each
# category seen so far to the id obtained from next_category_id.
def categories
  @categories
end

#instances ⇒ Object (readonly)

Returns the value of attribute instances.



9
10
11
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 9

# Returns the Hash of collected vectors. It has two keys, :testing and
# :training, each holding an Array to which #push appends vectors.
def instances
  @instances
end

#non_zero_features ⇒ Object (readonly)

Returns the value of attribute non_zero_features.



10
11
12
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 10

# Returns the Hash of non-zero feature counters, keyed by :testing and
# :training. #push adds the size of each vector's last element to the
# counter of the matching split.
def non_zero_features
  @non_zero_features
end

Class Method Details

.options_map(key) ⇒ Object



39
40
41
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 39

# Renders one preset as a single human-readable line, e.g.
# "lang: it | mode: unigram | stemming: false | stopword: false".
#
# @param key [Integer] preset id, a key of OPTIONS_MAP
# @return [String] the preset's "name: value" pairs joined with " | "
def self.options_map(key)
  settings = OPTIONS_MAP[key]
  fragments = settings.map do |name, value|
    "#{name}: #{value}"
  end
  fragments.join(" | ")
end

.options_map_size ⇒ Object



35
36
37
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 35

# Reports how many presets OPTIONS_MAP defines.
#
# @return [Integer] number of entries in OPTIONS_MAP
def self.options_map_size
  OPTIONS_MAP.length
end

Instance Method Details

#hash_of_ngrams ⇒ Object



27
28
29
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 27

# Delegates to the feature generator (@generator, built in #initialize) and
# returns whatever its #hash_of_ngrams returns.
def hash_of_ngrams
  @generator.hash_of_ngrams
end

#nice_string(v) ⇒ Object

This method is only meant to stringify the vector in the very same format as libsvm, so that diffing the output against libsvm does not show spurious differences.



102
103
104
105
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 102

# Stringifies a vector in the same layout libsvm uses, so the output can be
# diffed against libsvm without spurious whitespace differences.
#
# @param v [Array] vector whose first element is the label and whose second
#   element is the already-formatted feature string
# @return [String] the label followed by one space when the feature string
#   is empty, otherwise all elements joined by two spaces
def nice_string(v)
  if v[1] == ""
    "#{v[0]} "
  else
    v.join("  ")
  end
end

#options ⇒ Object



43
44
45
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 43

# Returns the options in effect, as set by #initialize: either the hash
# passed by the caller, or the preset chosen via :numeric_type merged with
# the caller's :output value.
def options
  @options
end

#override_options(options) ⇒ Object



31
32
33
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 31

# Looks up the preset selected by options[:numeric_type].
#
# @param options [Hash] options hash carrying a :numeric_type entry
# @return [Hash, nil] the matching OPTIONS_MAP entry, or nil when none exists
def override_options(options)
  preset_id = options[:numeric_type]
  OPTIONS_MAP[preset_id]
end

#push(data, testing: false) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 70

# Vectorizes one labelled sample and records it in the chosen data split.
#
# @param data [Array] pair of (category, string)
# @param testing [Boolean] true stores the vector under :testing,
#   otherwise it goes under :training
# @return [Object] the vector produced by #vectorize
def push(data, testing: false)
  category, string = data
  # A category seen for the first time gets a fresh id.
  @categories[category] ||= next_category_id

  vector = vectorize(category, string, testing: testing)
  split = testing ? :testing : :training
  @instances[split] << vector
  @non_zero_features[split] += vector.last.size
  vector
end

#toSVM(vector) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 87

# Formats a vector as one libsvm input line.
#
# @param vector [Array] its first element is the label; its last element is
#   a list of single-entry hashes whose key is the feature index
# @return [String] the label, two spaces, then "index:1" pairs separated by
#   single spaces; just the label and one space when there are no features
def toSVM(vector)
  label = vector.first
  entries = vector.last
  # The lone trailing space keeps the output diff-clean against libshorttext.
  return "#{label} " if entries.empty?

  # Features are written as binary flags (always ":1"). To emit the stored
  # value instead, format each entry as "#{key}:#{entry[key]}" where key is
  # entry.keys.first.
  pairs = entries.map { |entry| "#{entry.keys.first}:1" }
  "#{label}  #{pairs.join(' ')}"
end

#use(input_path, output_file = nil, testing: false) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/libsvm_preprocessor/preprocessor.rb', line 107

# Reads a CSV file row by row, feeds every row to #push and writes the
# resulting vector, formatted by #toSVM, as one line per row.
#
# @param input_path [String] path of the CSV file to read
# @param output_file [String, nil] path of the file to (over)write; when nil
#   the lines are printed to standard output instead
# @param testing [Boolean] forwarded to #push to pick the data split
# @return [void]
def use(input_path, output_file = nil, testing: false)
  sink = output_file ? File.open(output_file, "w") : $stdout
  begin
    # The options are splatted as keywords: passed positionally, Ruby 3 would
    # take the hash for CSV.foreach's `mode` argument.
    # NOTE(review): assumes CSV_OPTIONS is a symbol-keyed Hash of CSV options;
    # it is defined outside this view.
    CSV.foreach(input_path, **::LibsvmPreprocessor::CSV_OPTIONS) do |row|
      sink.puts toSVM(push(row, testing: testing))
    end
  ensure
    # Close only a handle opened here, and do so even when a row raises.
    sink.close if output_file
  end
end