Class: AnalyZ::HTML::WordVal
- Inherits:
-
Object
- Object
- AnalyZ::HTML::WordVal
- Defined in:
- lib/analy_z/html/word_val.rb
Instance Attribute Summary collapse
-
#hse_tf_idf ⇒ Object
Returns the value of attribute hse_tf_idf.
-
#idf ⇒ Object
Returns the value of attribute idf.
-
#sentences ⇒ Object
Returns the value of attribute sentences.
-
#texts ⇒ Object
Returns the value of attribute texts.
-
#tf ⇒ Object
Returns the value of attribute tf.
-
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
-
#words ⇒ Object
Returns the value of attribute words.
Instance Method Summary collapse
- #analyze_words(sentences, text_file_path, type_ary = ['名詞']) ⇒ Object
- #calc_hse(words, sentence_ary) ⇒ Object
- #calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object
- #calc_idf(words, text_file_path) ⇒ Object
- #calc_tf(words) ⇒ Object
- #calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object
-
#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ WordVal
constructor
A new instance of WordVal.
- #parse_by_natto(text, type_ary) ⇒ Object
- #parse_html(html) ⇒ Object
- #standardization_tf(tf_ary_list, ave_word_num) ⇒ Object
Constructor Details
#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ WordVal
Returns a new instance of WordVal.
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/analy_z/html/word_val.rb', line 15
#
# Parses every HTML file matching +html_path+, writes the concatenated
# document texts to a timestamped file under +tmp/+, and runs the word
# analysis over the parsed documents.
#
# @param html_path [String] glob pattern handed to +Dir.glob+
# @param selector [String] CSS selector whose matched markup is analysed
# @param type_ary [Array<String>] parts of speech to keep; forwarded to
#   +analyze_words+
def initialize(html_path, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob(html_path).each do |f|
    print '.'
    html = Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html
    @sentences[f] = parse_html(html)
  end

  puts "\n=== creating sentences file ==="
  # One segment per document, each terminated by the marker that calc_idf
  # later splits on.
  txt = @sentences.map do |_path, sentences|
    print '.'
    sentences.map { |s| s[0] }.join + '/=== EOS ===/'
  end.join

  FileUtils.mkdir_p('tmp') # no-op when the directory already exists
  text_file_path = "tmp/#{DateTime.now}.txt"
  File.write(text_file_path, txt)

  puts "\n=== analyzing... ==="
  # Forward the caller's part-of-speech filter; it used to be dropped here,
  # so analyze_words always fell back to its own default.
  analyze_words(@sentences, text_file_path, type_ary)
end
Instance Attribute Details
#hse_tf_idf ⇒ Object
Returns the value of attribute hse_tf_idf.
10 11 12 |
# File 'lib/analy_z/html/word_val.rb', line 10
#
# Reader for +@hse_tf_idf+, which +analyze_words+ assigns from the result of
# +calc_hse_tf_idf+.
#
# @return [Object] the stored value (nil until +analyze_words+ has run)
def hse_tf_idf
  @hse_tf_idf
end
#idf ⇒ Object
Returns the value of attribute idf.
8 9 10 |
# File 'lib/analy_z/html/word_val.rb', line 8
#
# Reader for +@idf+, which +analyze_words+ fills per document key with the
# result of +calc_idf+.
#
# @return [Object] the stored value (nil until +analyze_words+ has run)
def idf
  @idf
end
#sentences ⇒ Object
Returns the value of attribute sentences.
13 14 15 |
# File 'lib/analy_z/html/word_val.rb', line 13
#
# Reader for +@sentences+, which the constructor fills with one entry per
# globbed file: the file path mapped to the +parse_html+ result.
#
# @return [Object] the stored value
def sentences
  @sentences
end
#texts ⇒ Object
Returns the value of attribute texts.
12 13 14 |
# File 'lib/analy_z/html/word_val.rb', line 12
#
# Reader for +@texts+.
# NOTE(review): none of the methods listed on this page assigns +@texts+,
# so this presumably returns nil unless it is set elsewhere - confirm.
#
# @return [Object] the stored value
def texts
  @texts
end
#tf ⇒ Object
Returns the value of attribute tf.
7 8 9 |
# File 'lib/analy_z/html/word_val.rb', line 7
#
# Reader for +@tf+, which +analyze_words+ fills per document key with the
# result of +calc_tf+.
#
# @return [Object] the stored value (nil until +analyze_words+ has run)
def tf
  @tf
end
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
9 10 11 |
# File 'lib/analy_z/html/word_val.rb', line 9
#
# Reader for +@tf_idf+, which +analyze_words+ assigns from the result of
# +calc_tf_idf+.
#
# @return [Object] the stored value (nil until +analyze_words+ has run)
def tf_idf
  @tf_idf
end
#words ⇒ Object
Returns the value of attribute words.
11 12 13 |
# File 'lib/analy_z/html/word_val.rb', line 11
#
# Reader for +@words+, which +analyze_words+ fills per document key with the
# result of +parse_by_natto+.
#
# @return [Object] the stored value (nil until +analyze_words+ has run)
def words
  @words
end
Instance Method Details
#analyze_words(sentences, text_file_path, type_ary = ['名詞']) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/analy_z/html/word_val.rb', line 37
#
# Runs the per-document pipeline (word extraction, tf, idf, heading rates)
# and then combines the per-document tables into tf-idf and heading-weighted
# tf-idf. Results are stored in @words, @tf, @idf, @hse, @tf_idf and
# @hse_tf_idf.
#
# @param sentences [Hash] document key => list of [text, rate] pairs
# @param text_file_path [String] path of the corpus file read by calc_idf
# @param type_ary [Array<String>] parts of speech passed to parse_by_natto
# @return [Object] the value returned by calc_hse_tf_idf
def analyze_words(sentences, text_file_path, type_ary = ['名詞'])
  @words = {}
  @tf = {}
  @idf = {}
  @hse = {}

  puts "=== calculating tf and idf and hse ==="
  sentences.each do |key, sentence_ary|
    print '.'
    document_text = sentence_ary.map(&:first).join
    extracted = parse_by_natto(document_text, type_ary)

    @words[key] = extracted
    @tf[key] = calc_tf(extracted)
    @idf[key] = calc_idf(extracted, text_file_path)
    @hse[key] = calc_hse(extracted, sentence_ary)
  end

  puts "\n=== calculating tf idf ==="
  @tf_idf = calc_tf_idf(@tf, @idf)

  puts "=== calculating hse tf idf ==="
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end
#calc_hse(words, sentence_ary) ⇒ Object
130 131 132 133 134 135 136 137 138 139 |
# File 'lib/analy_z/html/word_val.rb', line 130
#
# Pairs every word with the rate of the heading sentence that contains it.
# Sentences whose rate is 1 are ignored; a word found in no remaining
# sentence gets rate 1. When several sentences contain the word, the last
# one in +sentence_ary+ wins. Duplicate [word, rate] pairs are removed.
#
# @param words [Array<String>] words of one document
# @param sentence_ary [Array<Array>] [text, rate] pairs of that document
# @return [Array<Array>] unique [word, rate] pairs
def calc_hse(words, sentence_ary)
  boosted = sentence_ary.reject { |sentence| sentence[1] == 1 }

  words.map do |word|
    # Match on the whole word. Indexing a String with [0] yields only its
    # first character, which made any heading sharing that character count
    # as a hit. Array() keeps [word, ...]-shaped entries working as before.
    term = Array(word).first
    hit = boosted.reverse_each.find { |sentence| sentence[0].include?(term) }
    [word, hit ? hit[1] : 1]
  end.uniq
end
#calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/analy_z/html/word_val.rb', line 157
#
# Multiplies every tf-idf score by the heading rate recorded for that word
# in the same document. Words without a recorded rate (or with rate 1) keep
# their score unchanged.
#
# @param tf_idf_list_hash [Hash] document key => list of [word, score]
# @param hse [Hash] document key => list of [word, rate]; not modified
# @return [Hash] document key => list of [word, score * rate]
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  tf_idf_list_hash.each_with_object({}) do |(key, tf_idf_list), weighted|
    # Filter into a local list instead of writing back into the caller's
    # hash; a document without heading data simply has no boosts.
    boosts = (hse[key] || []).reject { |_word, rate| rate == 1 }

    weighted[key] = tf_idf_list.map do |word, score|
      match = boosts.assoc(word) # first recorded rate for the word wins
      [word, score * (match ? match[1] : 1)]
    end
  end
end
#calc_idf(words, text_file_path) ⇒ Object
119 120 121 122 123 124 125 126 127 128 |
# File 'lib/analy_z/html/word_val.rb', line 119
#
# Computes an inverse document frequency for each entry of +words+. The
# corpus file is split on the '/=== EOS ===/' marker, and a document counts
# as containing a word when its text includes the word as a substring. The
# numerator is the number of entries in +sentences+.
#
# @param words [Array<String>] words to score (duplicates are kept)
# @param text_file_path [String] path of the corpus file
# @return [Array<Array>] [word, idf] pairs in the order of +words+
def calc_idf(words, text_file_path)
  documents = File.read(text_file_path).split('/=== EOS ===/')

  words.map do |word|
    hits = documents.count { |document| document.include?(word) }
    [word, Math.log(sentences.length / hits.to_f)]
  end
end
#calc_tf(words) ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/analy_z/html/word_val.rb', line 97
#
# Computes the relative frequency of each distinct word: occurrences divided
# by the total number of entries in +words+. Pairs are ordered from most to
# least frequent.
#
# @param words [Array<String>] words of one document, duplicates included
# @return [Array<Array>] [word, frequency] pairs
def calc_tf(words)
  total = words.length.to_f
  counts = words.each_with_object(Hash.new(0)) { |word, seen| seen[word] += 1 }

  ascending = counts.sort_by { |_word, count| count }
  ascending.reverse.map { |word, count| [word, count / total] }
end
#calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/analy_z/html/word_val.rb', line 141
#
# Multiplies, per document, every idf entry by the tf value recorded for the
# same word, then sorts the products in descending order and drops duplicate
# pairs.
#
# @param tf_list_hash [Hash] document key => list of [word, tf]
# @param idf_list_hash [Hash] document key => list of [word, idf]
# @return [Hash] document key => list of [word, tf * idf]
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_list_hash.each_with_object({}) do |(key, tf_list), result|
    scored = idf_list_hash[key].map do |word, idf_value|
      [word, idf_value * tf_list.assoc(word)[1]]
    end

    result[key] = scored.sort { |a, b| b[1] <=> a[1] }.uniq
  end
end
#parse_by_natto(text, type_ary) ⇒ Object
86 87 88 89 90 91 92 93 94 95 |
# File 'lib/analy_z/html/word_val.rb', line 86
#
# Runs +text+ through Natto::MeCab and collects the leading field of every
# output row whose second field is listed in +type_ary+. Rows are split on
# tabs and commas; the first field is the word and the second its part of
# speech.
#
# @param text [String] text to analyse
# @param type_ary [Array<String>] parts of speech to keep
# @return [Array<String>] matching words in output order, duplicates kept
def parse_by_natto(text, type_ary)
  rows = Natto::MeCab.new.parse(text).split(/\n/)

  rows.each_with_object([]) do |row, picked|
    surface, part_of_speech = row.split(/\t|,/)
    picked << surface if type_ary.include?(part_of_speech)
  end
end
#parse_html(html) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/analy_z/html/word_val.rb', line 59
#
# Splits an HTML fragment into text pieces and assigns each piece a rate:
# 1 by default, or a heading rate when the piece and the text of an
# h1-h4 element contain one another. When several headings match, the last
# one found in the markup wins.
#
# @param html [String] HTML markup
# @return [Array<Array>] [text, rate] pairs in document order
def parse_html(html)
  tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
  h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2 * 1.75,
    'h2' => 1.5 * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  # [heading text, tag name] pairs. The tag name is lower-cased so that
  # <H1> gets the same rate as <h1> instead of no rate at all.
  headings = html.scan(h_tag_reg).map do |markup|
    [markup.gsub(tag_rep, ''), markup.match(/[hH][1-4]/)[0].downcase]
  end
  # An empty heading would match every sentence, because any string
  # includes the empty string.
  headings = headings.reject { |text, _tag| text.empty? }

  # tag_rep contains a capture group, so split also returns the captured
  # tag fragment; dropping pieces of length <= 1 removes the
  # single-character ones. Pieces containing whitespace are dropped too.
  sentences = html.gsub(/\"/, '')
                  .split(tag_rep)
                  .reject { |el| el =~ /\s/ || el.length <= 1 }
                  .map { |el| [el, 1] }

  sentences.each do |sentence|
    headings.each do |text, tag|
      next unless sentence[0].include?(text) || text.include?(sentence[0])

      sentence[1] = heading_rates[tag]
    end
  end

  sentences
end
#standardization_tf(tf_ary_list, ave_word_num) ⇒ Object
111 112 113 114 115 116 117 |
# File 'lib/analy_z/html/word_val.rb', line 111
#
# Scales every tf value by the ratio of its list's length to
# +ave_word_num+ and appends that ratio to each entry.
#
# @param tf_ary_list [Array<Array>] lists of [word, tf] pairs
# @param ave_word_num [Numeric] divisor for the list length (converted with to_f)
# @return [Array<Array>] lists of [word, tf * ratio, ratio] triples
def standardization_tf(tf_ary_list, ave_word_num)
  tf_ary_list.map do |tf_ary|
    ratio = tf_ary.length / ave_word_num.to_f
    tf_ary.map { |word, value| [word, value * ratio, ratio] }
  end
end