Class: Statlysis::Similar
- Inherits:
- Object
- Inheritance chain: Object → Statlysis::Similar
- Includes:
- Common
- Defined in:
- lib/statlysis/similar.rb
Instance Attribute Summary collapse
-
#corpus ⇒ Object
Returns the value of attribute corpus.
-
#id_to_similar_ids ⇒ Object
Returns the value of attribute id_to_similar_ids.
-
#id_to_text_hash_proc ⇒ Object
Returns the value of attribute id_to_text_hash_proc.
-
#matrix ⇒ Object
Returns the value of attribute matrix.
Instance Method Summary collapse
-
#initialize(model_name, id_to_text_hash_proc) ⇒ Similar
constructor
A new instance of Similar.
- #process ⇒ Object (also: #run)
- #summary(doc_id) ⇒ Object
Methods included from Common
Constructor Details
#initialize(model_name, id_to_text_hash_proc) ⇒ Similar
Returns a new instance of Similar.
9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/statlysis/similar.rb', line 9
# Sets up a similarity job for +model_name+.
#
# Stores the text-loading proc on +cron+, derives the stat table name from
# +Statlysis.tablename_default_pre+, the literal "similar" and +model_name+
# (nil parts are dropped, the rest joined with "_"), hands that name to
# +Utils.setup_pattern_table_and_model+, and starts +id_to_similar_ids+ as an
# empty hash.
#
# @param model_name [Object] last component of the stat table name
# @param id_to_text_hash_proc [#call] callable that #process invokes to obtain
#   the id => text pairs to compare
def initialize(model_name, id_to_text_hash_proc)
  # Initialize the data source.
  cron.id_to_text_hash_proc = id_to_text_hash_proc

  # Initialize the table and the model.
  name_parts = [Statlysis.tablename_default_pre, "similar", model_name]
  cron.stat_table_name = name_parts.compact.join("_")
  Utils.setup_pattern_table_and_model(cron.stat_table_name)
  cron.id_to_similar_ids = {}

  cron
end
Instance Attribute Details
#corpus ⇒ Object
Returns the value of attribute corpus.
6 7 8 |
# File 'lib/statlysis/similar.rb', line 6
# Reader for the +corpus+ attribute.
#
# @return [Object] the current value of +@corpus+ (nil until assigned)
def corpus
  @corpus
end
#id_to_similar_ids ⇒ Object
Returns the value of attribute id_to_similar_ids.
6 7 8 |
# File 'lib/statlysis/similar.rb', line 6
# Reader for the +id_to_similar_ids+ attribute.
#
# @return [Object] the current value of +@id_to_similar_ids+ (nil until assigned)
def id_to_similar_ids
  @id_to_similar_ids
end
#id_to_text_hash_proc ⇒ Object
Returns the value of attribute id_to_text_hash_proc.
6 7 8 |
# File 'lib/statlysis/similar.rb', line 6
# Reader for the +id_to_text_hash_proc+ attribute.
#
# @return [Object] the current value of +@id_to_text_hash_proc+ (nil until assigned)
def id_to_text_hash_proc
  @id_to_text_hash_proc
end
#matrix ⇒ Object
Returns the value of attribute matrix.
6 7 8 |
# File 'lib/statlysis/similar.rb', line 6
# Reader for the +matrix+ attribute.
#
# @return [Object] the current value of +@matrix+ (nil until assigned)
def matrix
  @matrix
end
Instance Method Details
#process ⇒ Object Also known as: run
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/statlysis/similar.rb', line 21
# Computes TF-IDF similarity between the texts returned by
# +cron.id_to_text_hash_proc+ and records, for each document id, the ids of
# its most similar documents.
#
# The id => text pairs are processed in batches of +batch_size+. One
# similarity matrix is built per batch, so a document is only ever compared
# with the other documents of its own batch. For every document the 100
# best-scoring ids (best first, the document itself excluded, NaN scores
# treated as 0.0) are stored in +cron.id_to_similar_ids+ and written as JSON
# to the +result+ attribute of the +cron.stat_model+ record whose +pattern+ is
# the document id.
#
# Only the documents of the batch just computed are persisted after each
# batch; results of earlier batches were already written and are not
# rewritten.
#
# @param batch_size [Integer] number of documents compared together; must be
#   positive (+each_slice+ raises ArgumentError otherwise)
# @return [true]
def process(batch_size = 1200)
  logger.info "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
  # Required on first use rather than at file load time.
  require 'gsl'
  require 'tf-idf-similarity'

  # Load the documents.
  logger.info "开始取出 cron.id_to_text_hash_proc"
  @id_to_text_hash = cron.id_to_text_hash_proc.call
  logger.info "开始把@id_to_text_hash转化为数组"
  as = @id_to_text_hash.to_a
  logger.info "开始把as slice为#{batch_size}每次"

  as.each_slice(batch_size) do |a|
    logger.info "开始跑 #{a.size} 个条目的相似性"
    cron.corpus = TfIdfSimilarity::Collection.new
    a.each do |id, text|
      cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
    end

    cron.matrix = cron.corpus.similarity_matrix
    matrix_array = cron.matrix.to_a

    documents = cron.corpus.documents
    # Position in the matrix (row or column) -> real item id.
    matrix_idx_to_item_id = documents.map(&:id)

    # For each item, rank the other items by descending similarity and keep them.
    documents.each_with_index do |document, idx1|
      item_id_to_score = {}
      matrix_array[idx1].each_with_index do |num, idx2|
        item_id_to_score[matrix_idx_to_item_id[idx2]] = (num.nan? ? 0.0 : num)
      end
      # A document is not a candidate for its own "similar" list.
      item_id_to_score.delete document.id

      logger.info "对比文档:"
      logger.info "#{document.id} # #{summary(document.id)}"
      logger.info "相关文档:"
      ranked = item_id_to_score.sort { |a1, b1| b1[1] <=> a1[1] }
      ranked.first(10).each do |item_id, score|
        logger.info "#{score} # #{summary(item_id)}"
      end
      cron.id_to_similar_ids[document.id] = ranked.first(100).map(&:first)
      logger.info
    end

    # Save this batch's results to the database. Iterating the whole
    # accumulated hash here would re-write every earlier batch each time.
    documents.each do |document|
      record = cron.stat_model.find_or_create(:pattern => document.id)
      record.update :result => cron.id_to_similar_ids[document.id].to_json
    end
  end

  true
end
#summary(doc_id) ⇒ Object
84 85 86 |
# File 'lib/statlysis/similar.rb', line 84
# Short single-line preview of a document's text, used in log output.
#
# Looks the text up in +@id_to_text_hash+, keeps the slice +[0..41]+ of its
# +mb_chars+ view (presumably ActiveSupport's multibyte proxy, i.e. the first
# 42 characters — confirm), and removes the newlines inside that slice.
#
# @param doc_id [Object] key into +@id_to_text_hash+
# @return [String] the preview with "\n" removed
def summary(doc_id)
  text = @id_to_text_hash[doc_id]
  head = text.mb_chars[0..41]
  lines = head.split("\n")
  lines.join
end