Class: AnalyZ::HTML::WordVal

Inherits:
Object
  • Object
show all
Defined in:
lib/analy_z/html/word_val.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ WordVal

Returns a new instance of WordVal.



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/analy_z/html/word_val.rb', line 15

# Parses every HTML file matching +html_path+, writes the extracted text of
# all documents into a corpus file under tmp/, and then computes the word
# statistics through #analyze_words.
#
# @param html_path [String] glob pattern handed to Dir.glob
# @param selector [String] CSS selector for the part of each document to analyze
# @param type_ary [Array<String>] parts of speech to keep, forwarded to #analyze_words
def initialize(html_path, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob(html_path).each do |path|
    print '.'
    fragment = Nokogiri::HTML.parse(File.read(path), nil, nil).css(selector).to_html
    @sentences[path] = parse_html(fragment)
  end

  puts "\n=== creating sentences file ==="
  # One chunk per document, each terminated by the marker #calc_idf splits on.
  corpus = @sentences.each_value.map do |sentence_ary|
    print '.'
    sentence_ary.map { |s| s[0] }.join + '/=== EOS ===/'
  end.join

  # mkdir_p is a no-op when the directory already exists.
  FileUtils.mkdir_p('tmp')
  text_file_path = "tmp/#{DateTime.now}.txt"
  File.write(text_file_path, corpus)

  puts "\n=== analyzing... ==="
  # Forward type_ary: it used to be accepted here but never passed on, so the
  # analysis always ran with #analyze_words' default part-of-speech list.
  analyze_words(@sentences, text_file_path, type_ary)
end

Instance Attribute Details

#hse_tf_idf ⇒ Object

Returns the value of attribute hse_tf_idf.



10
11
12
# File 'lib/analy_z/html/word_val.rb', line 10

# Reader for @hse_tf_idf, which #analyze_words assigns from
# calc_hse_tf_idf(@tf_idf, @hse).
def hse_tf_idf
  @hse_tf_idf
end

#idf ⇒ Object

Returns the value of attribute idf.



8
9
10
# File 'lib/analy_z/html/word_val.rb', line 8

# Reader for @idf, the hash #analyze_words fills with one #calc_idf result
# per key of the sentences hash it was given.
def idf
  @idf
end

#sentences ⇒ Object

Returns the value of attribute sentences.



13
14
15
# File 'lib/analy_z/html/word_val.rb', line 13

# Reader for @sentences, the hash #initialize builds: each path returned by
# Dir.glob maps to the #parse_html result for that file. #calc_idf uses its
# length as the document count.
def sentences
  @sentences
end

#texts ⇒ Object

Returns the value of attribute texts.



12
13
14
# File 'lib/analy_z/html/word_val.rb', line 12

# Reader for @texts.
# NOTE(review): none of the methods shown on this page assign @texts, so this
# presumably returns nil unless it is set elsewhere - confirm.
def texts
  @texts
end

#tf ⇒ Object

Returns the value of attribute tf.



7
8
9
# File 'lib/analy_z/html/word_val.rb', line 7

# Reader for @tf, the hash #analyze_words fills with one #calc_tf result per
# key of the sentences hash it was given.
def tf
  @tf
end

#tf_idf ⇒ Object

Returns the value of attribute tf_idf.



9
10
11
# File 'lib/analy_z/html/word_val.rb', line 9

# Reader for @tf_idf, which #analyze_words assigns from
# calc_tf_idf(@tf, @idf).
def tf_idf
  @tf_idf
end

#words ⇒ Object

Returns the value of attribute words.



11
12
13
# File 'lib/analy_z/html/word_val.rb', line 11

# Reader for @words, the hash #analyze_words fills with one #parse_by_natto
# result per key of the sentences hash it was given.
def words
  @words
end

Instance Method Details

#analyze_words(sentences, text_file_path, type_ary = ['名詞']) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/analy_z/html/word_val.rb', line 37

# Computes the word statistics for every document and stores them in
# @words, @tf, @idf, @hse, @tf_idf and @hse_tf_idf.
#
# @param sentences [Hash] document key => array of [text, rate] pairs
# @param text_file_path [String] corpus file handed to #calc_idf
# @param type_ary [Array<String>] parts of speech handed to #parse_by_natto
# @return [Object] the value assigned to @hse_tf_idf
def analyze_words(sentences, text_file_path, type_ary = ['名詞'])
  @words = {}
  @tf = {}
  @idf = {}
  @hse = {}

  puts "=== calculating tf and idf and hse ==="
  sentences.each do |doc_key, doc_sentences|
    print '.'
    # The document text is the concatenation of every sentence's text part.
    doc_words = parse_by_natto(doc_sentences.map { |pair| pair[0] }.join, type_ary)
    @words[doc_key] = doc_words
    @tf[doc_key] = calc_tf(doc_words)
    @idf[doc_key] = calc_idf(doc_words, text_file_path)
    @hse[doc_key] = calc_hse(doc_words, doc_sentences)
  end

  puts "\n=== calculating tf idf ==="
  @tf_idf = calc_tf_idf(@tf, @idf)

  puts "=== calculating hse tf idf ==="
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end

#calc_hse(words, sentence_ary) ⇒ Object



130
131
132
133
134
135
136
137
138
139
# File 'lib/analy_z/html/word_val.rb', line 130

# Pairs every word with the rate of the weighted sentence that contains it.
#
# Sentences whose rate is 1 are ignored. When several weighted sentences
# contain the word, the last one in +sentence_ary+ wins; a word found in
# none of them gets rate 1. Duplicate [word, rate] pairs are removed.
#
# @param words [Array<String>] words of one document
# @param sentence_ary [Array<Array>] [text, rate] pairs of the same document
# @return [Array<Array>] unique [word, rate] pairs
def calc_hse(words, sentence_ary)
  weighted = sentence_ary.reject { |sentence| sentence[1] == 1 }
  words.map do |word|
    # Match on the whole word. The previous word[0] took only the first
    # character of a String, so unrelated words sharing that character were
    # boosted. Array() keeps [word, ...] style entries working as before.
    needle = Array(word).first
    rate = 1
    weighted.each do |sentence|
      rate = sentence[1] if sentence[0].include?(needle)
    end
    [word, rate]
  end.uniq
end

#calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/analy_z/html/word_val.rb', line 157

# Multiplies every tf-idf score by the rate recorded for that word.
#
# Entries with rate 1 are ignored and words without an entry keep their
# score (factor 1). The +hse+ argument is left untouched; the previous
# version filtered it in place, which stripped the rate-1 entries from the
# caller's hash. A document key that is absent from +hse+ is treated as
# having no rates instead of raising NoMethodError.
#
# @param tf_idf_list_hash [Hash] document key => array of [word, tf_idf] pairs
# @param hse [Hash] document key => array of [word, rate] pairs
# @return [Hash] document key => array of [word, tf_idf * rate] pairs
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  tf_idf_list_hash.each_with_object({}) do |(key, tf_idf_list), weighted|
    boosts = (hse[key] || []).reject { |entry| entry[1] == 1 }
    weighted[key] = tf_idf_list.map do |tf_idf|
      boost = boosts.assoc(tf_idf[0])
      [tf_idf[0], tf_idf[1] * (boost ? boost[1] : 1)]
    end
  end
end

#calc_idf(words, text_file_path) ⇒ Object



119
120
121
122
123
124
125
126
127
128
# File 'lib/analy_z/html/word_val.rb', line 119

# Computes an inverse document frequency for every entry of +words+.
#
# The corpus file is split on the '/=== EOS ===/' marker into documents; a
# word's value is log(sentences.length / number of documents containing it),
# where +sentences+ is this object's reader.
#
# @param words [Array<String>] words to score (duplicates are scored again)
# @param text_file_path [String] path of the corpus file
# @return [Array<Array>] [word, idf] pairs in the order of +words+
def calc_idf(words, text_file_path)
  documents = File.read(text_file_path).split('/=== EOS ===/')
  words.map do |word|
    hits = documents.count { |document| document.include?(word) }
    [word, Math.log(sentences.length / hits.to_f)]
  end
end

#calc_tf(words) ⇒ Object



97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/analy_z/html/word_val.rb', line 97

# Computes the term frequency of each distinct word.
#
# @param words [Array<String>] all words of one document, duplicates included
# @return [Array<Array>] [word, count / total word count] pairs, ordered by
#   the reverse of an ascending sort on the count
def calc_tf(words)
  counts = words.each_with_object(Hash.new(0)) { |word, seen| seen[word] += 1 }
  ascending = counts.sort_by { |_word, count| count }
  ascending.reverse.map { |word, count| [word, count / words.length.to_f] }
end

#calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/analy_z/html/word_val.rb', line 141

# Combines term frequencies and inverse document frequencies per document.
#
# For every key of +tf_list_hash+, each [word, idf] pair of the matching idf
# list is multiplied by that word's tf; the pairs are then sorted by score
# in descending order and de-duplicated.
#
# @param tf_list_hash [Hash] document key => array of [word, tf] pairs
# @param idf_list_hash [Hash] document key => array of [word, idf] pairs
# @return [Hash] document key => array of [word, tf * idf] pairs
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_list_hash.each_with_object({}) do |(doc_key, tf_list), scores|
    products = idf_list_hash[doc_key].map do |idf_pair|
      [idf_pair[0], idf_pair[1] * tf_list.assoc(idf_pair[0])[1]]
    end
    scores[doc_key] = products.sort { |left, right| right[1] <=> left[1] }.uniq
  end
end

#parse_by_natto(text, type_ary) ⇒ Object



86
87
88
89
90
91
92
93
94
95
# File 'lib/analy_z/html/word_val.rb', line 86

# Runs +text+ through Natto::MeCab and collects the words whose part of
# speech is listed in +type_ary+.
#
# Each output row is split on tabs and commas: the first field is taken as
# the word and the second as its part of speech.
#
# @param text [String] text to analyze
# @param type_ary [Array<String>] parts of speech to keep
# @return [Array<String>] matching words in output order, duplicates kept
def parse_by_natto(text, type_ary)
  rows = Natto::MeCab.new.parse(text).split(/\n/)
  rows.each_with_object([]) do |line, picked|
    fields = line.split(/\t|,/)
    picked << fields[0] if type_ary.include?(fields[1])
  end
end

#parse_html(html) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/analy_z/html/word_val.rb', line 59

# Splits an HTML fragment into text pieces and gives the pieces that belong
# to an h1-h4 heading a higher rate.
#
# Two defects of the earlier version are fixed here:
# * heading tag names are lower-cased before the rate lookup, so <H1> no
#   longer produces a nil rate;
# * headings with empty text are skipped, because include?("") is true for
#   every string and would have boosted every piece.
#
# @param html [String] HTML markup
# @return [Array<Array>] [text, rate] pairs; rate is 1 unless the text
#   contains, or is contained in, a heading's text
def parse_html(html)
  tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
  h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2    * 1.75,
    'h2' => 1.5  * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  # [heading text with tags stripped, lower-cased tag name]
  headings = html.scan(h_tag_reg).map do |m|
    [m.gsub(tag_rep, ''), m.match(/[hH][1-4]/)[0].downcase]
  end
  headings = headings.reject { |heading| heading[0].empty? }

  # String#split also returns the regex's capture group, which is why short
  # fragments are dropped here. The old `el != ['h1', ...]` guard compared a
  # String with an Array and was always true, so it is omitted.
  # NOTE(review): every piece containing whitespace is discarded, not only
  # blank ones - kept as is, confirm this is intended.
  sentences = html.gsub(/\"/, '')
                  .split(tag_rep)
                  .reject { |el| el =~ /\s/ || el.length <= 1 }
                  .map { |m| [m, 1] }

  sentences.each do |sentence|
    headings.each do |heading|
      related = sentence[0].include?(heading[0]) || heading[0].include?(sentence[0])
      # The last related heading wins, as before.
      sentence[1] = heading_rates[heading[1]] if related
    end
  end

  sentences
end

#standardization_tf(tf_ary_list, ave_word_num) ⇒ Object



111
112
113
114
115
116
117
# File 'lib/analy_z/html/word_val.rb', line 111

# Scales every tf value by the size of its list relative to +ave_word_num+.
#
# @param tf_ary_list [Array<Array>] one array of [word, tf] pairs per document
# @param ave_word_num [Numeric] value each list's length is divided by
# @return [Array<Array>] per document, [word, tf * ratio, ratio] triples with
#   ratio = list length / ave_word_num
def standardization_tf(tf_ary_list, ave_word_num)
  tf_ary_list.map do |tf_ary|
    tf_ary.map do |tf|
      ratio = tf_ary.length / ave_word_num.to_f
      [tf[0], tf[1] * ratio, ratio]
    end
  end
end