Class: AnalyZ::Analyzer
- Inherits:
-
Object
- Object
- AnalyZ::Analyzer
- Defined in:
- lib/analy_z.rb
Instance Attribute Summary collapse
-
#hse_tf_idf ⇒ Object
Returns the value of attribute hse_tf_idf.
-
#idf ⇒ Object
Returns the value of attribute idf.
-
#sentences ⇒ Object
Returns the value of attribute sentences.
-
#texts ⇒ Object
Returns the value of attribute texts.
-
#tf ⇒ Object
Returns the value of attribute tf.
-
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
-
#words ⇒ Object
Returns the value of attribute words.
Instance Method Summary collapse
- #analyze_words(sentences, type_ary = ['名詞']) ⇒ Object
- #calc_hse(words, sentence_ary) ⇒ Object
- #calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object
- #calc_idf(sentences, words) ⇒ Object
- #calc_tf(words) ⇒ Object
- #calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object
-
#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ Analyzer
constructor
A new instance of Analyzer.
- #parse_by_natto(text, type_ary) ⇒ Object
- #parse_html(html) ⇒ Object
- #standardization_tf(tf_ary_list, ave_word_num) ⇒ Object
Constructor Details
#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ Analyzer
Returns a new instance of Analyzer.
19 20 21 22 23 24 25 |
# File 'lib/analy_z.rb', line 19

# Builds an analyzer from every file matched by +html_path+.
#
# Each matched file is read, parsed with Nokogiri, narrowed to +selector+,
# converted to weighted sentences by #parse_html and stored in @sentences
# under the file path. The collected sentences are then passed to
# #analyze_words.
#
# @param html_path [String] glob pattern handed to Dir.glob
# @param selector [String] CSS selector choosing the part of each page to analyze
# @param type_ary [Array<String>] part-of-speech labels to keep (default: nouns)
def initialize(html_path, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob(html_path).each do |path|
    fragment = Nokogiri::HTML.parse(File.read(path), nil, nil).css(selector).to_html
    @sentences[path] = parse_html(fragment)
  end
  # Forward type_ary so a caller-supplied part-of-speech filter is honoured.
  analyze_words(@sentences, type_ary)
end
Instance Attribute Details
#hse_tf_idf ⇒ Object
Returns the value of attribute hse_tf_idf.
14 15 16 |
# File 'lib/analy_z.rb', line 14

# @return [Object] the current value of @hse_tf_idf (assigned by #analyze_words)
def hse_tf_idf
  @hse_tf_idf
end
#idf ⇒ Object
Returns the value of attribute idf.
12 13 14 |
# File 'lib/analy_z.rb', line 12

# @return [Object] the current value of @idf (assigned by #analyze_words)
def idf
  @idf
end
#sentences ⇒ Object
Returns the value of attribute sentences.
17 18 19 |
# File 'lib/analy_z.rb', line 17

# @return [Object] the current value of @sentences (filled in by #initialize)
def sentences
  @sentences
end
#texts ⇒ Object
Returns the value of attribute texts.
16 17 18 |
# File 'lib/analy_z.rb', line 16

# @return [Object] the current value of @texts (assigned by #analyze_words)
def texts
  @texts
end
#tf ⇒ Object
Returns the value of attribute tf.
11 12 13 |
# File 'lib/analy_z.rb', line 11

# @return [Object] the current value of @tf (assigned by #analyze_words)
def tf
  @tf
end
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
13 14 15 |
# File 'lib/analy_z.rb', line 13

# @return [Object] the current value of @tf_idf (assigned by #analyze_words)
def tf_idf
  @tf_idf
end
#words ⇒ Object
Returns the value of attribute words.
15 16 17 |
# File 'lib/analy_z.rb', line 15

# @return [Object] the current value of @words (assigned by #analyze_words)
def words
  @words
end
Instance Method Details
#analyze_words(sentences, type_ary = ['名詞']) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/analy_z.rb', line 27

# Runs the whole scoring pipeline over +sentences+ and stores every
# intermediate result in an instance variable.
#
# Populates @texts, @words, @tf, @idf and @hse per document key, then
# @tf_idf and @hse_tf_idf across all documents.
#
# @param sentences [Hash] document key => list of [text, weight] pairs
# @param type_ary [Array<String>] part-of-speech labels passed to #parse_by_natto
# @return [Object] the value assigned to @hse_tf_idf
def analyze_words(sentences, type_ary = ['名詞'])
  @texts = {}
  @words = {}
  @tf = {}
  @idf = {}
  @hse = {}

  # Every document's full text must exist before any IDF is computed,
  # because #calc_idf looks across the whole @texts collection.
  sentences.each do |doc_key, pairs|
    @texts[doc_key] = pairs.map { |pair| pair[0] }.join
  end

  sentences.each do |doc_key, pairs|
    doc_words = parse_by_natto(@texts[doc_key], type_ary)
    @words[doc_key] = doc_words
    @tf[doc_key] = calc_tf(doc_words)
    @idf[doc_key] = calc_idf(@texts, doc_words)
    @hse[doc_key] = calc_hse(doc_words, pairs)
  end

  @tf_idf = calc_tf_idf(@tf, @idf)
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end
#calc_hse(words, sentence_ary) ⇒ Object
114 115 116 117 118 119 120 121 122 123 |
# File 'lib/analy_z.rb', line 114

# Pairs every distinct word with the weight of the emphasised sentence it
# appears in.
#
# Only sentences whose weight differs from 1 are considered. A word found
# in none of them keeps weight 1; when several match, the last one in
# +sentence_ary+ wins.
#
# @param words [Array<String>] words to rate (duplicates allowed)
# @param sentence_ary [Array<Array>] [text, weight] pairs
# @return [Array<Array>] unique [word, weight] pairs in first-occurrence order
def calc_hse(words, sentence_ary)
  emphasised = sentence_ary.reject { |sentence| sentence[1] == 1 }

  # The weight depends only on the word, so rate each distinct word once.
  words.uniq.map do |word|
    rate = 1
    emphasised.each do |text, weight|
      # Match the whole word; indexing a String with [0] would compare
      # only its first character.
      rate = weight if text.include?(word)
    end
    [word, rate]
  end
end
#calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/analy_z.rb', line 141

# Multiplies each TF-IDF score by the emphasis weight recorded for its word.
#
# Words with no entry in +hse+, or whose weight is 1, keep their score.
# +hse+ is only read and never modified.
#
# @param tf_idf_list_hash [Hash] document key => list of [word, score] pairs
# @param hse [Hash] document key => list of [word, weight] pairs
# @return [Hash] document key => list of [word, weighted score] pairs
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  tf_idf_list_hash.each_with_object({}) do |(key, tf_idf_list), boosted|
    # One lookup table per document; the first entry for a word wins,
    # matching Array#assoc semantics.
    rates = hse.fetch(key, []).each_with_object({}) do |(word, rate), lookup|
      lookup[word] = rate unless rate == 1 || lookup.key?(word)
    end

    boosted[key] = tf_idf_list.map do |word, score|
      [word, score * rates.fetch(word, 1)]
    end
  end
end
#calc_idf(sentences, words) ⇒ Object
106 107 108 109 110 111 112 |
# File 'lib/analy_z.rb', line 106

# Computes the inverse document frequency of every entry in +words+.
#
# A document counts as containing a word when its value in +sentences+
# responds true to include?(word). The result has one pair per element of
# +words+, duplicates included, in the same order.
#
# @param sentences [Hash] document key => document text
# @param words [Array<String>] words to score
# @return [Array<Array>] [word, idf] pairs; idf is log(document count / matching documents)
def calc_idf(sentences, words)
  total = sentences.length
  cache = {}

  words.map do |word|
    # Repeated words share one document scan.
    unless cache.key?(word)
      hits = sentences.count { |_key, text| text.include?(word) }
      # A word found in no document would divide by zero and yield
      # Infinity or NaN; give it a neutral 0.0 instead.
      cache[word] = hits.zero? ? 0.0 : Math.log(total / hits.to_f)
    end
    [word, cache[word]]
  end
end
#calc_tf(words) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/analy_z.rb', line 84

# Computes the relative frequency of each distinct word.
#
# @param words [Array<String>] every word occurrence in a document
# @return [Array<Array>] [word, occurrences / total] pairs, most frequent first
def calc_tf(words)
  total = words.length.to_f
  counts = Hash.new(0)
  words.each { |word| counts[word] += 1 }

  counts.sort_by { |_word, count| count }
        .reverse
        .map { |word, count| [word, count / total] }
end
#calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/analy_z.rb', line 125

# Combines per-document TF and IDF lists into ranked TF-IDF lists.
#
# For each document key in +tf_list_hash+, every distinct [word, idf] pair
# is multiplied by that word's TF. A word with no TF entry is treated as
# frequency 0.0, and a document with no IDF list yields an empty result.
#
# @param tf_list_hash [Hash] document key => list of [word, tf] pairs
# @param idf_list_hash [Hash] document key => list of [word, idf] pairs
# @return [Hash] document key => unique [word, tf * idf] pairs, highest score
#   first; equal scores keep their first-appearance order
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_list_hash.each_with_object({}) do |(key, tf_list), result|
    # Hash lookup replaces a linear Array#assoc scan per word; the first
    # entry for a word wins, as assoc would.
    tf_of = tf_list.each_with_object({}) do |(word, tf), lookup|
      lookup[word] = tf unless lookup.key?(word)
    end

    # De-duplicate before scoring so repeated words are multiplied once.
    scored = idf_list_hash.fetch(key, []).uniq.map do |word, idf|
      [word, idf * tf_of.fetch(word, 0.0)]
    end

    # The index tie-breaker makes the ordering deterministic.
    result[key] = scored.sort_by.with_index { |(_word, score), index| [-score, index] }
  end
end
#parse_by_natto(text, type_ary) ⇒ Object
73 74 75 76 77 78 79 80 81 82 |
# File 'lib/analy_z.rb', line 73

# Tokenises +text+ with Natto::MeCab and keeps the tokens whose part of
# speech is listed in +type_ary+.
#
# Each output row is expected to look like "surface<TAB>feature,feature,...";
# rows without a tab (such as the trailing EOS marker) are ignored.
#
# @param text [String] text to tokenise
# @param type_ary [Array<String>] part-of-speech labels to keep
# @return [Array<String>] surface forms of the matching tokens, in order
def parse_by_natto(text, type_ary)
  Natto::MeCab.new.parse(text).lines.each_with_object([]) do |row, words|
    # Split the surface from the features at the first tab only, so a
    # surface form that itself contains a comma stays intact.
    surface, features = row.chomp.split("\t", 2)
    next if features.nil?

    # The first feature field is the part of speech.
    words << surface if type_ary.include?(features.split(',').first)
  end
end
#parse_html(html) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/analy_z.rb', line 46

# Splits an HTML fragment into text pieces and gives each a weight.
#
# Pieces that overlap the text of an <h1>..<h4> heading (either string
# containing the other) receive that heading's weight; all others get 1.
# When several headings overlap a piece, the last one in document order wins.
#
# NOTE(review): any piece containing whitespace is dropped, as is any piece
# of one character or less. That suits unspaced text but discards
# space-separated sentences; confirm this is intended.
#
# @param html [String] HTML markup
# @return [Array<Array>] [text, weight] pairs in document order
def parse_html(html)
  # Non-capturing group: String#split would otherwise insert the captured
  # text (e.g. a single-quoted attribute value) into its result.
  tag_pattern = /<(?:".*?"|'.*?'|[^'"])*?>/
  heading_pattern = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2 * 1.75,
    'h2' => 1.5 * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  headings = html.scan(heading_pattern).map do |markup|
    # Lower-case the tag name so <H1> maps to the same rate as <h1>.
    [markup.gsub(tag_pattern, ''), markup[/[hH][1-4]/].downcase]
  end
  # An empty heading is a substring of every piece and would boost them all.
  headings = headings.reject { |text, _level| text.empty? }

  sentences = html.delete('"')
                  .split(tag_pattern)
                  .reject { |piece| piece.length <= 1 || piece =~ /\s/ }
                  .map { |piece| [piece, 1] }

  sentences.each do |sentence|
    text = sentence[0]
    headings.each do |heading_text, level|
      next unless text.include?(heading_text) || heading_text.include?(text)

      sentence[1] = heading_rates[level]
    end
  end

  sentences
end
#standardization_tf(tf_ary_list, ave_word_num) ⇒ Object
98 99 100 101 102 103 104 |
# File 'lib/analy_z.rb', line 98

# Scales each TF list by how its length compares with +ave_word_num+.
#
# For every list the ratio is (number of entries / ave_word_num). Each
# [word, tf] entry becomes [word, tf * ratio, ratio].
#
# @param tf_ary_list [Array<Array<Array>>] one list of [word, tf] pairs per document
# @param ave_word_num [Numeric] average word count used as the divisor
# @return [Array<Array<Array>>] lists of [word, scaled tf, ratio] triples
def standardization_tf(tf_ary_list, ave_word_num)
  average = ave_word_num.to_f

  # No explicit `return` here: a do/end block after `return expr` has
  # fragile binding and is easy to misread.
  tf_ary_list.map do |tf_ary|
    ratio = tf_ary.length / average
    tf_ary.map { |word, tf| [word, tf * ratio, ratio] }
  end
end