Class: Statlysis::Similar

Inherits:
Object
  • Object
show all
Includes:
Common
Defined in:
lib/statlysis/similar.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Common

#cron

Constructor Details

#initialize(model_name, id_to_text_hash_proc) ⇒ Similar

Returns a new instance of Similar.



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/statlysis/similar.rb', line 9

# Builds a Similar job for one model.
#
# @param model_name [#to_s] last segment of the statistics table name
# @param id_to_text_hash_proc [#call] stored on cron; #process later invokes
#   it and treats the result as an id => text mapping
def initialize(model_name, id_to_text_hash_proc)
  # Store the data source.
  cron.id_to_text_hash_proc = id_to_text_hash_proc

  # Set up the table and model. The table name is the optional default
  # prefix, "similar" and the model name joined by "_"; nil parts are dropped.
  table_name_parts = [Statlysis.tablename_default_pre, "similar", model_name]
  cron.stat_table_name = table_name_parts.compact.join("_")
  Utils.setup_pattern_table_and_model(cron.stat_table_name)

  # Start with an empty id => similar ids result map.
  cron.id_to_similar_ids = {}
  cron
end

Instance Attribute Details

#corpus ⇒ Object

Returns the value of attribute corpus.



6
7
8
# File 'lib/statlysis/similar.rb', line 6

# Reader for the @corpus instance variable.
#
# NOTE(review): #process assigns a TfIdfSimilarity::Collection through
# cron.corpus=; presumably cron returns this object, so that value ends up
# here — confirm in Common.
#
# @return [Object, nil] current value of @corpus (nil until assigned)
def corpus
  @corpus
end

#id_to_similar_ids ⇒ Object

Returns the value of attribute id_to_similar_ids.



6
7
8
# File 'lib/statlysis/similar.rb', line 6

# Reader for the @id_to_similar_ids instance variable.
#
# NOTE(review): #initialize sets cron.id_to_similar_ids to an empty Hash and
# #process fills it with document id => array of similar ids; presumably cron
# returns this object — confirm in Common.
#
# @return [Object, nil] current value of @id_to_similar_ids (nil until assigned)
def id_to_similar_ids
  @id_to_similar_ids
end

#id_to_text_hash_proc ⇒ Object

Returns the value of attribute id_to_text_hash_proc.



6
7
8
# File 'lib/statlysis/similar.rb', line 6

# Reader for the @id_to_text_hash_proc instance variable.
#
# NOTE(review): #initialize stores its second argument through
# cron.id_to_text_hash_proc= and #process invokes it with #call; presumably
# cron returns this object — confirm in Common.
#
# @return [Object, nil] current value of @id_to_text_hash_proc (nil until assigned)
def id_to_text_hash_proc
  @id_to_text_hash_proc
end

#matrix ⇒ Object

Returns the value of attribute matrix.



6
7
8
# File 'lib/statlysis/similar.rb', line 6

# Reader for the @matrix instance variable.
#
# NOTE(review): #process assigns cron.corpus.similarity_matrix through
# cron.matrix= once per slice; presumably cron returns this object, so this
# holds the matrix of the most recently processed slice — confirm in Common.
#
# @return [Object, nil] current value of @matrix (nil until assigned)
def matrix
  @matrix
end

Instance Method Details

#process ⇒ Object Also known as: run



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/statlysis/similar.rb', line 21

# Computes, for every id => text pair returned by cron.id_to_text_hash_proc,
# the ids of the most similar documents and stores them via cron.stat_model.
#
# The pairs are handled in slices of 1200. A fresh
# TfIdfSimilarity::Collection is built for each slice, so similarity is only
# measured between documents that fall into the same slice.
#
# For each document the top 10 matches are logged and the top 100 ids are
# kept in cron.id_to_similar_ids and written as JSON to the :result column
# of the row whose :pattern is the document id.
#
# @return [true]
def process
  logger.info "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
  # Required at call time rather than at file load.
  require 'gsl'
  require 'tf-idf-similarity'

  # Load the documents.
  logger.info "开始取出 cron.id_to_text_hash_proc"
  @id_to_text_hash = cron.id_to_text_hash_proc.call

  logger.info "开始把@id_to_text_hash转化为数组"
  as = @id_to_text_hash.to_a

  logger.info "开始把as slice为1200每次"
  as.each_slice(1200) do |a|
    logger.info "开始跑 #{a.size}  个条目的相似性"
    cron.corpus = TfIdfSimilarity::Collection.new
    a.each do |id, text|
      cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
    end

    cron.matrix = cron.corpus.similarity_matrix
    matrix_array = cron.matrix.to_a

    # Map matrix row/column indexes back to the real item ids.
    matrix_idx_to_item_id_hash = {}
    cron.corpus.documents.each_with_index do |document, idx1|
      matrix_idx_to_item_id_hash[idx1] = document.id
    end

    # For each item, rank the other items of this slice by descending
    # similarity and remember the best ones.
    cron.corpus.documents.each_with_index do |document, idx1|
      _item_id_to_score = {}
      matrix_array[idx1].each_with_index do |num, idx2|
        # NaN scores are treated as "no similarity" so the sort below stays well-defined.
        _item_id_to_score[matrix_idx_to_item_id_hash[idx2]] = (num.nan? ? 0.0 : num)
      end
      # A document is not a candidate for its own similar list.
      _item_id_to_score.delete document.id
      logger.info "对比文档:"
      logger.info "#{document.id} # #{summary(document.id)}"
      logger.info "相关文档:"
      _item_ids = _item_id_to_score.sort { |a1, b1| b1[1] <=> a1[1] }
      _item_ids.first(10).each do |item_id, score|
        logger.info "#{score} #  #{summary(item_id)}"
      end
      cron.id_to_similar_ids[document.id] = _item_ids.first(100).map(&:first)
      # Blank separator line; a bare `logger.info` would log "nil" with the
      # standard library Logger.
      logger.info ""
    end

    # Save this slice's results to the database. Only the documents of the
    # current slice are written: cron.id_to_similar_ids accumulates across
    # slices, and iterating all of it here would rewrite every earlier
    # slice's rows again on each pass.
    cron.corpus.documents.each do |document|
      s = cron.stat_model.find_or_create(:pattern => document.id)
      s.update :result => cron.id_to_similar_ids[document.id].to_json
    end
  end # as.each_slice(1200) do |a|

  true
end

#summary(doc_id) ⇒ Object



84
85
86
# File 'lib/statlysis/similar.rb', line 84

# Short single-line preview of a document's text, used in log output.
#
# Looks the text up in @id_to_text_hash (set by #process), keeps its first
# 42 characters (range 0..41) and removes line breaks from that excerpt.
#
# @param doc_id [Object] a key of @id_to_text_hash
# @return [String] the excerpt with "\n" removed
def summary(doc_id)
  text = @id_to_text_hash[doc_id]
  excerpt = text.mb_chars[0..41]
  lines = excerpt.split("\n")
  lines.join
end