Class: Linkterm::Base
- Inherits:
-
Object
- Object
- Linkterm::Base
- Defined in:
- lib/linkterm.rb
Instance Method Summary collapse
- #idf(word) ⇒ Object
-
#initialize(options = {}) ⇒ Base
constructor
A new instance of Base.
- #rule_table ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Base
Returns a new instance of Base.
72 73 74 75 76 |
# File 'lib/linkterm.rb', line 72
# Builds the document collection from the entries of a directory.
#
# Entries whose name starts with a dot (including "." and "..") are skipped;
# every remaining entry name is prefixed with "<doc_dir>/" and the resulting
# list is handed to Documents.new.
#
# @param options [Hash] settings; +options[:doc_dir]+ is the directory to scan
# @return [Base] a new instance of Base
def initialize(options = {})
  @doc_dir = options[:doc_dir]
  # Dir.entries reads the listing and releases the directory handle itself,
  # whereas a bare Dir.open without a block leaves the handle open.
  visible_entries = Dir.entries(@doc_dir).reject { |f| /^\./ =~ f }
  @documents = Documents.new(visible_entries.map { |f| "#{@doc_dir}/" + f })
end
Instance Method Details
#idf(word) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/linkterm.rb', line 78
# Returns the inverse document frequency of +word+:
# log(number of document filenames / number of documents containing +word+).
#
# The per-word document counts are computed on the first call and cached in
# @documents_count. A term is counted for a document when a node has category
# '名詞' (Japanese for "noun"), its surface is longer than one character, and
# the surface is neither all non-word characters nor all digits. Terms are
# lower-cased, and each term counts at most once per document.
#
# @param word [String] the term to look up; the table stores lower-cased
#   surfaces, so callers are presumably expected to pass lower-case terms
# @return [Float] the IDF value; a word found in no document has a count of 0,
#   so the floating-point division yields Infinity
def idf(word)
  # Build the table only when it has not been built yet. The previous
  # `if defined?` test was inverted, so the table was never created and the
  # lookup below failed on nil.
  unless defined? @documents_count
    all_count = Hash.new(0)
    counter = Counter.new(@documents.filenames.length, 'idf')
    @documents.each do |document|
      counter.display!
      words = Set.new
      document.each do |paragraph|
        paragraph.each do |node|
          surface = node.surface
          next unless node.category == '名詞' && surface.length > 1
          next if /^\W+$/ =~ surface || /^\d+$/ =~ surface

          words << surface.downcase
        end
      end
      # The block variable is deliberately not called `word`, so the method
      # argument cannot be shadowed (or, on Ruby 1.8, overwritten).
      words.each { |seen| all_count[seen] += 1 }
    end
    @documents_count = all_count
  end
  Math.log(@documents.filenames.length.to_f / @documents_count[word])
end
#rule_table ⇒ Object
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# File 'lib/linkterm.rb', line 101
# Induces co-occurrence rules between terms that appear in the same paragraph.
#
# Step 1: for every paragraph, count the lower-cased surfaces of nodes whose
# category is '名詞' (Japanese for "noun"), whose surface is longer than two
# characters and is neither all non-word characters nor all digits. The three
# terms with the highest count * idf(term) are kept, sorted, and appended to
# the table as one row (empty rows are skipped).
#
# Step 2: each row is split with `partitions` into groups keyed :eng, :long
# or :short and padded with empty arrays to at least three columns.
# NOTE(review): `partitions`, `parm`, `to_hash` on an Array and `lisc_do` are
# not core Ruby; they are presumably extensions defined elsewhere in the
# project, so the column order produced by `partitions` should be confirmed.
#
# Step 3: for every column pair (c1, c2) produced by `parm`, and every pair of
# non-empty values (a1, a2) seen in those columns, the support is the share of
# rows with u[c1] == a1 that also have u[c2] == a2.
#
# @return [Array<Hash>] hashes of the form {:a1 => ..., :a2 => ..., :spt => Float}
#   whose support is at least 0.5; an empty array when no paragraph yields terms
def rule_table
  table = []
  counter = Counter.new(@documents.filenames.size, 'trans to table')
  # Debugging alternative kept from the original: @documents.each(0..3)
  @documents.each do |document|
    counter.display!
    document.each do |paragraph|
      hash = Hash.new(0)
      paragraph.each do |node|
        surface = node.surface
        next unless node.category == '名詞' && surface.length > 2
        next if /^\W+$/ =~ surface || /^\d+$/ =~ surface

        hash[surface.downcase] += 1
      end
      # Take only the terms with the highest TF-IDF values.
      # (Original alternative: keep every term with count * idf(word) > 1.5.)
      word_count = hash.sort_by { |word, count| count * idf(word) }.reverse[0...3].to_hash
      terms = word_count.map { |word, _| word }.sort
      table << terms unless terms.empty?
    end
  end

  # Without any rows there is nothing to induce, and table.first would be nil.
  return [] if table.empty?

  # rule induction
  table = table.map do |line|
    arr = line.partitions do |term|
      if /^[a-z_]+$/ =~ term
        :eng
      elsif term.length > 10
        :long
      else
        :short
      end
    end
    arr << [] until arr.size >= 3
    arr
  end

  array = (0...table.first.length).to_a.parm
  counter = Counter.new(array.length, 'induce rule')
  rules = array.map do |c1, c2|
    counter.display!
    # The expression string reads the locals `table`, `c1` and `c2` through
    # the binding, so those names must not be renamed.
    pattern = '[ [x,y] | x <- table.map{|u| u[c1] }.to_set, y <- table.map{|u| u[c2] }.to_set, !x.empty?, !y.empty? ]'.lisc_do binding
    pattern.map do |a1, a2|
      with_a1 = table.select { |u| u[c1] == a1 }
      with_both = with_a1.select { |u| u[c2] == a2 }
      # Float division: integer division would truncate the ratio to 0 or 1.
      spt = with_both.length.to_f / with_a1.length
      { :a1 => a1, :a2 => a2, :spt => spt }
    end
  end
  # Compare against 0.5; the integer expression 1/2 evaluates to 0 and would
  # let every rule through.
  rules.flatten.select { |r| r[:spt] >= 0.5 }
end