Class: AnalyZ::Analyzer

Inherits:

Object

Object
AnalyZ::Analyzer

show all

Defined in: lib/analy_z.rb

Instance Attribute Summary collapse

#hse_tf_idf ⇒ Object

Returns the value of attribute hse_tf_idf.
#idf ⇒ Object

Returns the value of attribute idf.
#sentences ⇒ Object

Returns the value of attribute sentences.
#texts ⇒ Object

Returns the value of attribute texts.
#tf ⇒ Object

Returns the value of attribute tf.
#tf_idf ⇒ Object

Returns the value of attribute tf_idf.
#words ⇒ Object

Returns the value of attribute words.

Instance Method Summary collapse

Constructor Details

#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ `Analyzer`

Returns a new instance of Analyzer.

# File 'lib/analy_z.rb', line 19

# Builds an analyzer from every file matched by +html_path+.
#
# Each matched file is read, parsed with Nokogiri, narrowed to +selector+,
# and split into weighted sentences by #parse_html. The resulting hash
# (file path => sentence list) is then handed to #analyze_words.
#
# @param html_path [String] glob pattern passed to Dir.glob
# @param selector [String] CSS selector choosing the part of each document to analyze
# @param type_ary [Array<String>] part-of-speech labels to keep when extracting words
def initialize(html_path, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob(html_path).each do |f|
    @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
  end
  # Forward the caller's part-of-speech filter; previously it was dropped and
  # analyze_words always fell back to its own default.
  analyze_words(@sentences, type_ary)
end

Instance Attribute Details

#hse_tf_idf ⇒ `Object`

Returns the value of attribute hse_tf_idf.



14
15
16

# File 'lib/analy_z.rb', line 14

# Reader for @hse_tf_idf, which analyze_words assigns from
# calc_hse_tf_idf(@tf_idf, @hse).
def hse_tf_idf
  @hse_tf_idf
end

#idf ⇒ `Object`

Returns the value of attribute idf.



12
13
14

# File 'lib/analy_z.rb', line 12

# Reader for @idf, a hash that analyze_words fills with the calc_idf result
# for each key of the sentences hash.
def idf
  @idf
end

#sentences ⇒ `Object`

Returns the value of attribute sentences.



17
18
19

# File 'lib/analy_z.rb', line 17

# Reader for @sentences, the hash built in initialize that maps each globbed
# file path to the parse_html output for that file.
def sentences
  @sentences
end

#texts ⇒ `Object`

Returns the value of attribute texts.



16
17
18

# File 'lib/analy_z.rb', line 16

# Reader for @texts, a hash that analyze_words fills with, per key, the
# concatenation of the first element of every sentence entry.
def texts
  @texts
end

#tf ⇒ `Object`

Returns the value of attribute tf.



11
12
13

# File 'lib/analy_z.rb', line 11

# Reader for @tf, a hash that analyze_words fills with the calc_tf result
# for each key of the sentences hash.
def tf
  @tf
end

#tf_idf ⇒ `Object`

Returns the value of attribute tf_idf.



13
14
15

# File 'lib/analy_z.rb', line 13

# Reader for @tf_idf, which analyze_words assigns from calc_tf_idf(@tf, @idf).
def tf_idf
  @tf_idf
end

#words ⇒ `Object`

Returns the value of attribute words.



15
16
17

# File 'lib/analy_z.rb', line 15

# Reader for @words, a hash that analyze_words fills with the parse_by_natto
# result for each key of the sentences hash.
def words
  @words
end

Instance Method Details

#analyze_words(sentences, type_ary = ['名詞']) ⇒ `Object`

# File 'lib/analy_z.rb', line 27

# Runs the whole word analysis over a set of parsed documents and stores the
# results in @texts, @words, @tf, @idf, @hse, @tf_idf and @hse_tf_idf.
#
# @param sentences [Hash] document key => list of [text, rate] entries
#   (the shape produced by #parse_html)
# @param type_ary [Array<String>] part-of-speech labels forwarded to #parse_by_natto
# @return [Object] the value assigned to @hse_tf_idf
def analyze_words(sentences, type_ary = ['名詞'])
  @texts, @words, @tf, @idf, @hse = {}, {}, {}, {}, {}

  # Every document's text has to exist before the main loop, because
  # calc_idf looks each word up across the complete set of texts.
  sentences.each do |key, sentence_ary|
    @texts[key] = sentence_ary.map { |s| s[0] }.join
  end

  sentences.each do |key, sentence_ary|
    # Reuse the text joined above instead of rebuilding the same string.
    @words[key] = parse_by_natto(@texts[key], type_ary)
    @tf[key]    = calc_tf(@words[key])
    @idf[key]   = calc_idf(@texts, @words[key])
    @hse[key]   = calc_hse(@words[key], sentence_ary)
  end

  @tf_idf = calc_tf_idf(@tf, @idf)
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end

#calc_hse(words, sentence_ary) ⇒ `Object`

# File 'lib/analy_z.rb', line 114

# Pairs every word with the rate of the weighted sentence that contains it.
#
# Only sentences whose rate differs from 1 are considered. When several such
# sentences contain the word, the last one in +sentence_ary+ wins. Words found
# in none of them get the rate 1. Duplicate [word, rate] pairs are removed.
#
# @param words [Array] words to look up (Strings; an Array entry is matched
#   by its first element)
# @param sentence_ary [Array<Array>] [text, rate] entries
# @return [Array<Array>] unique [word, rate] pairs in first-occurrence order
def calc_hse(words, sentence_ary)
  weighted = sentence_ary.reject { |sentence| sentence[1] == 1 }

  words.map do |word|
    # Match on the whole word. Indexing a String with [0] yields only its
    # first character, which made any heading sharing that character count.
    needle = word.is_a?(Array) ? word[0] : word
    rate = 1
    weighted.each do |sentence|
      rate = sentence[1] if sentence[0].include?(needle)
    end
    [word, rate]
  end.uniq
end

#calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ `Object`

# File 'lib/analy_z.rb', line 141

# Multiplies each TF-IDF score by the word's rate from +hse+.
#
# Only rates different from 1 are applied; a word without such an entry, or a
# document key that is absent from +hse+, keeps its score (multiplied by 1).
# The +hse+ argument is left untouched.
#
# @param tf_idf_list_hash [Hash] document key => list of [word, score] pairs
# @param hse [Hash] document key => list of [word, rate] pairs
# @return [Hash] document key => list of [word, weighted score] pairs
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  tf_idf_list_hash.each_with_object({}) do |(key, tf_idf_list), weighted|
    # Filter into a local list rather than writing back into the caller's hash.
    boosted = (hse[key] || []).reject { |pair| pair[1] == 1 }

    weighted[key] = tf_idf_list.map do |tf_idf|
      match = boosted.assoc(tf_idf[0])
      [tf_idf[0], tf_idf[1] * (match ? match[1] : 1)]
    end
  end
end

#calc_idf(sentences, words) ⇒ `Object`

# File 'lib/analy_z.rb', line 106

# Computes an inverse document frequency for every entry of +words+.
#
# For each word, counts the values of +sentences+ that contain it and returns
# the natural log of (number of entries / that count). Repeated words produce
# repeated pairs.
#
# @param sentences [Hash] document key => text
# @param words [Array<String>] words to score
# @return [Array<Array>] [word, idf] pairs, one per element of +words+
def calc_idf(sentences, words)
  doc_total = sentences.length
  words.map do |word|
    hits = sentences.count { |_key, text| text.include?(word) }
    [word, Math.log(doc_total / hits.to_f)]
  end
end

#calc_tf(words) ⇒ `Object`

# File 'lib/analy_z.rb', line 84

# Computes the relative frequency of each distinct word.
#
# @param words [Array<String>] every word occurrence of one document
# @return [Array<Array>] [word, occurrences / total words] pairs, ordered from
#   the most to the least frequent word
def calc_tf(words)
  total = words.length.to_f
  counts = words.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }

  counts.sort_by { |_word, count| count }
        .reverse
        .map { |word, count| [word, count / total] }
end

#calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ `Object`

# File 'lib/analy_z.rb', line 125

# Combines term frequencies and inverse document frequencies per document.
#
# For every key of +tf_list_hash+, each [word, idf] pair found under the same
# key in +idf_list_hash+ is multiplied by that word's tf. The products are
# sorted by descending score and duplicate pairs are dropped.
#
# @param tf_list_hash [Hash] document key => list of [word, tf] pairs
# @param idf_list_hash [Hash] document key => list of [word, idf] pairs
# @return [Hash] document key => list of [word, tf * idf] pairs
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_list_hash.each_with_object({}) do |(key, tf_list), scores|
    products = idf_list_hash[key].map do |idf_pair|
      word = idf_pair[0]
      [word, idf_pair[1] * tf_list.assoc(word)[1]]
    end
    scores[key] = products.sort { |a, b| b[1] <=> a[1] }.uniq
  end
end

#parse_by_natto(text, type_ary) ⇒ `Object`

# File 'lib/analy_z.rb', line 73

# Extracts the words of +text+ whose part of speech is listed in +type_ary+.
#
# The text is parsed by a new Natto::MeCab instance. Each output line is split
# on tabs and commas; the first field is taken as the word and the second as
# its part of speech.
#
# @param text [String] text to analyze
# @param type_ary [Array<String>] part-of-speech labels to keep
# @return [Array<String>] matching words in output order, duplicates included
def parse_by_natto(text, type_ary)
  rows = Natto::MeCab.new.parse(text).split(/\n/)

  rows.each_with_object([]) do |row, picked|
    surface, part_of_speech = row.split(/\t|,/)
    picked << surface if type_ary.include?(part_of_speech)
  end
end

#parse_html(html) ⇒ `Object`

# File 'lib/analy_z.rb', line 46

# Splits an HTML fragment into text pieces and weights those that belong to
# h1-h4 headings.
#
# Double quotes are removed, the markup is split on tags, and pieces that
# contain any whitespace or are at most one character long are discarded.
# Every remaining piece starts with the rate 1. A piece that contains a
# heading's text, or is contained in it, takes that heading's rate; when
# several headings match, the last one in document order wins.
#
# @param html [String] HTML markup
# @return [Array<Array>] [text, rate] entries in document order
def parse_html(html)
  tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
  h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2    * 1.75,
    'h2' => 1.5  * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  headings = html.scan(h_tag_reg).map do |markup|
    # The regex accepts <H1> as well as <h1>; normalise the case so that
    # uppercase tags get a rate instead of nil.
    tag_name = markup.match(/[hH][1-4]/)[0].downcase
    [markup.gsub(tag_rep, ''), heading_rates[tag_name]]
  end
  # An empty heading would match every sentence, since include?('') is
  # always true, so it is ignored.
  headings = headings.reject { |text, _rate| text.empty? }

  # split also returns the regex's capture group (a fragment of each tag);
  # those are typically a single character and are removed by the length
  # check below.
  sentences = html.delete('"')
                  .split(tag_rep)
                  .reject { |el| el =~ /\s/ || el.length <= 1 }
                  .map { |el| [el, 1] }

  sentences.each do |sentence|
    headings.each do |text, rate|
      sentence[1] = rate if sentence[0].include?(text) || text.include?(sentence[0])
    end
  end

  sentences
end

#standardization_tf(tf_ary_list, ave_word_num) ⇒ `Object`

# File 'lib/analy_z.rb', line 98

# Scales each document's term frequencies by the ratio of that document's
# list length to +ave_word_num+.
#
# @param tf_ary_list [Array<Array>] one list of [word, tf] pairs per document
# @param ave_word_num [Numeric] divisor for each list's length
# @return [Array<Array>] per document, [word, tf * ratio, ratio] triples
def standardization_tf(tf_ary_list, ave_word_num)
  divisor = ave_word_num.to_f

  tf_ary_list.map do |tf_ary|
    ratio = tf_ary.length / divisor
    tf_ary.map { |tf| [tf[0], tf[1] * ratio, ratio] }
  end
end

Class: AnalyZ::Analyzer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html_path, selector = 'body', type_ary = ['名詞']) ⇒ Analyzer

Instance Attribute Details

#hse_tf_idf ⇒ Object

#idf ⇒ Object

#sentences ⇒ Object

#texts ⇒ Object

#tf ⇒ Object

#tf_idf ⇒ Object

#words ⇒ Object

Instance Method Details

#analyze_words(sentences, type_ary = ['名詞']) ⇒ Object

#calc_hse(words, sentence_ary) ⇒ Object

#calc_hse_tf_idf(tf_idf_list_hash, hse) ⇒ Object

#calc_idf(sentences, words) ⇒ Object

#calc_tf(words) ⇒ Object

#calc_tf_idf(tf_list_hash, idf_list_hash) ⇒ Object

#parse_by_natto(text, type_ary) ⇒ Object

#parse_html(html) ⇒ Object

#standardization_tf(tf_ary_list, ave_word_num) ⇒ Object