Class: VectorEmbed
- Inherits:
-
Object
- Object
- VectorEmbed
- Defined in:
- lib/vector_embed.rb,
lib/vector_embed/maker.rb,
lib/vector_embed/version.rb,
lib/vector_embed/stop_word.rb,
lib/vector_embed/maker/date.rb,
lib/vector_embed/maker/ngram.rb,
lib/vector_embed/maker/number.rb,
lib/vector_embed/maker/phrase.rb,
lib/vector_embed/maker/boolean.rb
Defined Under Namespace
Constant Summary collapse
- JUST_A_NUMBER =
/\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
- UGLY_FLOAT =
/\A\.\d+\z/
- BLANK =
/\A\s*\z/
- NULL =
/\Anull\z/i
- SLASH_N =
'\N'
- TRUE =
/\Atrue\z/i
- T =
/\At\z/i
- FALSE =
/\Afalse\z/i
- F =
/\Af\z/i
- NULL_BYTE =
"\x00"
- LABEL_MAKERS =
[Maker::Boolean, Maker::Number]
- FEATURE_MAKERS =
[Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
- VERSION =
'0.5.1'
Instance Attribute Summary collapse
-
#dict ⇒ Object
readonly
Returns the value of attribute dict.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#options ⇒ Object
readonly
Returns the value of attribute options.
Instance Method Summary collapse
- #index(parts) ⇒ Object
-
#initialize(options = {}) ⇒ VectorEmbed
constructor
A new instance of VectorEmbed.
- #line(label, features = {}) ⇒ Object
- #preprocess(v) ⇒ Object
- #stats_report ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ VectorEmbed
Returns a new instance of VectorEmbed.
29 30 31 32 33 34 35 36 37 |
# File 'lib/vector_embed.rb', line 29
#
# Sets up a new VectorEmbed from an options hash.
#
# The caller's hash is never mutated: a shallow copy is stored in @options.
# Recognised keys visible in this method:
#   :logger - used as-is when truthy; otherwise a Logger writing to $stderr
#             is created, at DEBUG level when ENV['VERBOSE'] == 'true' and at
#             INFO level otherwise. The key stays in @options.
#   :dict   - when truthy, it is removed from @options and a shallow copy is
#             stored in @dict.
#
# @param options [Hash] configuration; defaults to an empty hash
def initialize(options = {})
  @options = options.dup
  @mutex = Mutex.new
  @feature_makers = {}
  @logger = options[:logger] || begin
    fallback = Logger.new($stderr)
    fallback.level = ENV['VERBOSE'] == 'true' ? Logger::DEBUG : Logger::INFO
    fallback
  end
  # :dict is pulled out of the stored options and kept separately.
  supplied_dict = @options.delete(:dict)
  @dict = supplied_dict.dup if supplied_dict
end
Instance Attribute Details
#dict ⇒ Object (readonly)
Returns the value of attribute dict.
26 27 28 |
# File 'lib/vector_embed.rb', line 26
#
# Read-only accessor for the dict attribute.
#
# @return [Object] the value of @dict (nil when it was never assigned)
def dict
  @dict
end
#logger ⇒ Object
Returns the value of attribute logger.
25 26 27 |
# File 'lib/vector_embed.rb', line 25
#
# Reader for the logger attribute.
#
# @return [Object] the value of @logger
def logger
  @logger
end
#options ⇒ Object (readonly)
Returns the value of attribute options.
27 28 29 |
# File 'lib/vector_embed.rb', line 27
#
# Read-only accessor for the options attribute.
#
# @return [Object] the value of @options
def options
  @options
end
Instance Method Details
#index(parts) ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/vector_embed.rb', line 62
#
# Maps a list of parts to a numeric feature index.
#
# The parts are joined with NULL_BYTE to form a signature.
#
# * Without a dict, the signature is hashed with MurmurHash3::V32.str_hash
#   and the first seven characters of its decimal string are converted back
#   to an integer.
# * With a dict, the MD5 digest of the signature is the key. A hit is
#   returned directly. On a miss the lookup is repeated inside
#   @mutex.synchronize; if the key is still absent, the cardinality of the
#   feature maker stored under parts[0] is incremented and the key is
#   assigned dict.length + 1.
#
# Assumes a maker is already registered in @feature_makers under parts[0];
# that is not checked here.
#
# @param parts [Array] components of the feature; parts[0] is used as the
#   @feature_makers key
# @return [Object] the stored or computed index (an Integer when freshly
#   assigned or hashed)
def index(parts)
  sig = parts.join NULL_BYTE
  return MurmurHash3::V32.str_hash(sig).to_s[0..6].to_i unless dict

  sig = Digest::MD5.digest sig
  dict[sig] || @mutex.synchronize do
    # Re-check under the lock so concurrent callers cannot both assign a value.
    dict[sig] ||= begin
      @feature_makers[parts[0]].cardinality += 1
      dict.length + 1
    end
  end
end
#line(label, features = {}) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/vector_embed.rb', line 39
#
# Builds one space-separated output line from a label and a feature hash.
#
# For every feature the matching maker (from feature_maker) is asked for its
# pairs. When a feature value is an Array, each element is treated as its own
# feature, keyed by the feature name and the element position joined with
# NULL_BYTE. The collected pairs have nils removed, are sorted by their first
# element, and are each joined with ':'. The label value (from label_maker)
# comes first, followed by the encoded pairs.
#
# @param label [Object] passed to label_maker and to the maker's #value
# @param features [Hash] feature name => value (or Array of values)
# @return [String] the label value and "key:value" pairs joined by spaces
def line(label, features = {})
  collected = features.each_with_object([]) do |(name, value), memo|
    case value
    when Array
      value.each_with_index do |element, position|
        memo.concat feature_maker([name, position].join(NULL_BYTE), element).pairs(element)
      end
    else
      memo.concat feature_maker(name, value).pairs(value)
    end
  end
  encoded = collected.compact.sort_by { |k_value, _| k_value }.map { |pair| pair.join ':' }
  ([label_maker(label).value(label)] + encoded).join ' '
end
#preprocess(v) ⇒ Object
58 59 60 |
# File 'lib/vector_embed.rb', line 58
#
# Passes a value through StopWord.remove together with this instance's
# stop_words.
#
# @param v [Object] the value to preprocess (presumably text — confirm
#   against StopWord.remove)
# @return [Object] whatever StopWord.remove returns
def preprocess(v)
  StopWord.remove(stop_words, v)
end
#stats_report ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/vector_embed.rb', line 78
#
# Renders a plain-text table of the registered feature makers.
#
# One row per entry in @feature_makers shows the feature key, the maker's
# class, and the maker's cardinality. The layout is: a header row, a dashed
# rule, the data rows, the right-aligned cardinality total, and a trailing
# newline. The first two columns are left-aligned, the last is right-aligned,
# and columns are separated by ' | '.
#
# @return [String] the formatted report, ending in "\n"
def stats_report
  rows = @feature_makers.map do |feature, maker|
    [feature, maker.class, maker.cardinality]
  end
  # Total is taken before the header row is added.
  total_cardinality = rows.inject(0) { |sum, row| sum + row[2] }
  rows.unshift %w{ Feature Class Cardinality }

  # Widest cell per column, header included.
  widths = (0..2).map do |col|
    rows.map { |row| row[col].to_s.length }.max
  end

  lines = rows.map do |row|
    [
      row[0].to_s.ljust(widths[0]),
      row[1].to_s.ljust(widths[1]),
      row[2].to_s.rjust(widths[2]),
    ].join(' | ')
  end

  total_width = lines.first.length
  lines.insert(1, '-' * total_width)
  lines.push(total_cardinality.to_s.rjust(total_width))
  # The empty last element makes the joined string end with a newline.
  lines.push('').join("\n")
end