Class: JDict::DictIndex
- Inherits:
-
Object
- Object
- JDict::DictIndex
- Defined in:
- lib/index.rb
Constant Summary collapse
- LANGUAGE_DEFAULT =
JDict::JMDictConstants::Languages::ENGLISH
- NUM_ENTRIES_TO_INDEX =
50
- ENTITY_REGEX =
/<!ENTITY\s([^ ]*)\s\"(.*)">/
Instance Attribute Summary collapse
-
#path ⇒ Object
readonly
Returns the value of attribute path.
Instance Method Summary collapse
-
#build_index(overwrite = false, dictionary_path = nil) ⇒ Integer
Builds the full-text search index.
-
#build_pos_hash ⇒ Object
Creates the hash of part-of-speech symbols to full definitions from the dictionary.
- #built? ⇒ Boolean
-
#create_schema ⇒ Object
Creates the SQL schema for the Amalgalite database.
-
#get_pos(pos) ⇒ String
Retrieves the definition of a part-of-speech from its abbreviation.
-
#initialize(path) ⇒ DictIndex
constructor
Initialize a full-text search index backend for JMdict.
- #make_query(term, exact) ⇒ Object
-
#open_reader(dictionary_path) ⇒ XML::Reader
Creates an XML::Reader object for the given path.
-
#pos_to_sym(entity) ⇒ Symbol
Converts a part-of-speech entity reference string into a symbol.
- #rebuild_index ⇒ Object
-
#search(term, exact = false, language = LANGUAGE_DEFAULT) ⇒ Array(Entry)
Returns the search results as an array of
Entry.
Constructor Details
#initialize(path) ⇒ DictIndex
Initialize a full-text search index backend for JMdict
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/index.rb', line 27

# Initialize a full-text search index backend for JMdict.
#
# Opens (or creates) the Amalgalite database "fts5.db" in the directory that
# contains the dictionary file, makes sure the search table exists, indexes
# the dictionary when the table is still empty, and loads the part-of-speech
# lookup hash.
#
# @param path [String] path to the JMdict dictionary file
# @raise [RuntimeError] if no file exists at +path+
def initialize(path)
  # Back the public +path+ reader, which was otherwise never assigned.
  @path = path
  @dictionary_path = path
  @index_path = File.dirname(@dictionary_path)
  @pos_hash = {}

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  db_file = File.join(@index_path, "fts5.db")

  # In debug mode always start from a fresh database file.
  File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  build_index unless built?

  # Make the hash from abbreviated parts of speech to full definitions.
  # A plain assignment is required here: @pos_hash already holds {} (which is
  # truthy), so "||=" would never call build_pos_hash.
  @pos_hash = build_pos_hash
end
Instance Attribute Details
#path ⇒ Object (readonly)
Returns the value of attribute path.
23 24 25 |
# File 'lib/index.rb', line 23

# Reader for the +path+ attribute.
#
# @return [Object] the current value of the @path instance variable
def path
  @path
end
Instance Method Details
#build_index(overwrite = false, dictionary_path = nil) ⇒ Integer
Builds the full-text search index
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/index.rb', line 115

# Builds the full-text search index.
#
# Streams the dictionary through an XML::Reader, collecting the sequence
# number, kanji, kana and senses of each entry, and inserts every entry into
# the +search+ table inside a single database transaction.
#
# @param overwrite [Boolean] accepted for interface compatibility; the
#   current implementation does not consult it
# @param dictionary_path [String, nil] dictionary to index; when nil the
#   previously stored dictionary path is used
# @return [Integer] the number of entries added to the index
# @raise [RuntimeError] if no dictionary path is known, the dictionary file
#   does not exist, or an entry without any kana is encountered
def build_index(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)

  puts "Building index..."

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  # components of an entry
  entry_sequence_num, kanji, kana, senses = 0, [], [], []
  glosses = {}
  parts_of_speech = []

  entries_added = 0

  insert_sql = "INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );"

  begin
    @index.transaction do |db_transaction|
      # Prepare the INSERT once and re-execute it for every entry instead of
      # preparing a new statement per entry.
      db_transaction.prepare(insert_sql) do |stmt|
        # read until the end
        while reader.read

          # check what type of node we're currently on
          case reader.node_type

          # start-of-element node
          when XML::Reader::TYPE_ELEMENT
            case reader.name
            when JDict::JMDictConstants::Elements::SEQUENCE
              entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
            when JDict::JMDictConstants::Elements::KANJI
              text = reader.next_text
              kanji << text unless text.empty?

            when JDict::JMDictConstants::Elements::KANA
              text = reader.next_text
              kana << text unless text.empty?

            when JDict::JMDictConstants::Elements::GLOSS
              language = reader.node.lang || LANGUAGE_DEFAULT
              language = language.intern
              text = reader.next_text
              unless text.empty?
                (glosses[language] ||= []) << text
              end

            when JDict::JMDictConstants::Elements::CROSSREFERENCE
              # Cross-references are not indexed. The call is kept because
              # next_text presumably advances the reader -- TODO confirm.
              reader.next_text
            end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
          when XML::Reader::TYPE_ENTITY_REFERENCE
            if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
              text = reader.name
              parts_of_speech << text unless text.empty?
            end

          # end-of-element node
          when XML::Reader::TYPE_END_ELEMENT
            case reader.name

            when JDict::JMDictConstants::Elements::SENSE
              # build sense
              senses << Sense.new(parts_of_speech, glosses)
              # glosses.each do |language, texts|
              #   senses << Sense.new(parts_of_speech,
              #                       texts.join(', ').strip,
              #                       language)
              # end

              # clear data for the next sense
              glosses = {}
              parts_of_speech = []

            # we're at the end of the entry element, so index it
            when JDict::JMDictConstants::Elements::ENTRY
              raise "No kana found for this entry!" if kana.empty?

              # index
              insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql
              stmt.execute( insert_data )

              # clear data for the next entry
              kanji, kana, senses = [], [], []

              entries_added += 1
            end
          end
        end
      end
    end
  ensure
    # Done reading & indexing; release the reader even when indexing fails.
    reader.close
  end

  # puts "#{@index.size} entries indexed"
  # @index.close

  entries_added
end
#build_pos_hash ⇒ Object
Creates the hash of part-of-speech symbols to full definitions from the dictionary
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
# File 'lib/index.rb', line 242

# Creates the hash of part-of-speech symbols to full definitions from the
# dictionary.
#
# Reads the dictionary until the document type node is reached, extracts
# every <!ENTITY name "definition"> declaration from it with ENTITY_REGEX and
# maps the symbolised entity name to its definition. Reading stops at the
# document type node, at the first element node, or at the end of the input.
#
# @return [Hash{Symbol => String}] entity symbol => full definition; empty
#   when no document type node precedes the first element
def build_pos_hash
  pos_hash = {}
  reader = open_reader(@dictionary_path)

  begin
    # Stop as soon as read reports that nothing more is available, so that an
    # input with neither a doctype nor an element cannot loop forever.
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # segfaults when attempting this:
        # cs.each do |child|
        #   p child.to_s
        # end
        doctype_string = reader.node.to_s
        doctype_string.scan(ENTITY_REGEX).each do |abbrev, full|
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # The doctype can no longer appear once the first element is reached.
        break
      end
    end
  ensure
    # This reader is private to this method, so always release it.
    reader.close
  end

  pos_hash
end
#built? ⇒ Boolean
64 65 66 |
# File 'lib/index.rb', line 64

# Reports whether the search table already holds rows.
#
# @return [Boolean] true when the row count of the +search+ table is not 0
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  row_count != 0
end
#create_schema ⇒ Object
Creates the SQL schema for the Amalgalite database
49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/index.rb', line 49

# Creates the SQL schema for the Amalgalite database.
#
# Does nothing when the schema already lists a +search+ table; otherwise
# creates the FTS5 virtual table and reloads the cached schema.
#
# @return [Object, nil] nil when the table already exists, otherwise the
#   result of reloading the schema
def create_schema
  return if @index.schema.tables['search']

  @index.execute_batch <<-SQL
  CREATE VIRTUAL TABLE search USING fts5(
      sequence_number,
      kanji,
      kana,
      senses
  );
  SQL
  @index.reload_schema!
end
#get_pos(pos) ⇒ String
Retrieves the definition of a part-of-speech from its abbreviation
280 281 282 283 |
# File 'lib/index.rb', line 280

# Retrieves the definition of a part-of-speech from its abbreviation.
#
# The lookup hash is built on first use and stored in @pos_hash, so later
# calls reuse it.
#
# @param pos [String] part-of-speech abbreviation, e.g. an entity name
# @return [String, nil] the full definition, or nil when the abbreviation is
#   not present in the hash
def get_pos(pos)
  # Store the built hash: calling build_pos_hash without assigning its
  # result leaves @pos_hash empty and every lookup returns nil.
  @pos_hash = build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
  @pos_hash[pos_to_sym(pos)]
end
#make_query(term, exact) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/index.rb', line 68

# Builds the FTS5 MATCH expression for a search term.
#
# A term starting with "seq:" is matched against the sequence_number column;
# any other term is matched as a phrase against the kanji, kana and senses
# columns, as a prefix query unless +exact+ is set.
#
# @param term [String] the search term; an unfrozen string is converted from
#   katakana to hiragana IN PLACE (search compares results against +term+
#   after calling this method), a frozen string is converted on a copy
# @param exact [Boolean] when false, "*" is appended to make a prefix query
# @return [String] the MATCH expression
def make_query(term, exact)
  # convert full-width katakana to hiragana
  # TODO: convert half-width katakana to hiragana
  if term.frozen?
    # tr! would raise FrozenError; fall back to a converted copy.
    term = term.tr('ァ-ン', 'ぁ-ん')
  else
    term.tr!('ァ-ン', 'ぁ-ん')
  end

  if term.start_with?('seq:')
    # Inside an FTS5 string a double quote is escaped by doubling it.
    phrase = term[4..-1].gsub('"', '""')
    "sequence_number : \"#{phrase}\""
  else
    phrase = term.gsub('"', '""')
    query = "{kanji kana senses} : \"#{phrase}\""
    query += "*" unless exact
    query
  end
end
#open_reader(dictionary_path) ⇒ XML::Reader
Creates an XML::Reader object for the given path
230 231 232 233 234 235 236 237 238 239 |
# File 'lib/index.rb', line 230

# Creates an XML::Reader object for the given path.
#
# @param dictionary_path [String] path of the dictionary file to open
# @return [XML::Reader] a reader for the file, opened with UTF-8 encoding
# @raise [RuntimeError] if XML::Reader.file returns nil
def open_reader(dictionary_path)
  # The former Dir.chdir(Dir.pwd) wrapper changed into the directory the
  # process was already in; it is dropped because chdir blocks act on
  # process-global state without altering how the path resolves.
  jmdict_path = File.join(dictionary_path)

  # create a reader for JMdict
  reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8)
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

  reader
end
#pos_to_sym(entity) ⇒ Symbol
Converts a part-of-speech entity reference string into a symbol
273 274 275 |
# File 'lib/index.rb', line 273

# Converts a part-of-speech entity reference string into a symbol.
#
# Every hyphen in the entity name is replaced by an underscore before the
# string is interned.
#
# @param entity [String] entity name, e.g. "adj-i"
# @return [Symbol] the symbolised name, e.g. :adj_i
def pos_to_sym(entity)
  underscored = entity.tr('-', '_')
  underscored.intern
end
#rebuild_index ⇒ Object
222 223 224 225 |
# File 'lib/index.rb', line 222

# Builds the index, refusing to do so when something already exists at the
# index path.
#
# @return [Object] whatever build_index returns
# @raise [RuntimeError] if a file or directory exists at @index_path
def rebuild_index
  # NOTE(review): the constructor sets @index_path to the directory that
  # holds the dictionary, so this guard looks like it fires on every call --
  # confirm the intended check.
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise "Index already exists at path #{@index_path}" if File.exist?(@index_path)
  build_index
end
#search(term, exact = false, language = LANGUAGE_DEFAULT) ⇒ Array(Entry)
Returns the search results as an array of Entry
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/index.rb', line 87

# Returns the search results as an array of Entry.
#
# Runs the FTS5 query produced by make_query, limited to
# JDict.config.num_results rows, and ranks entries whose kanji or kana equal
# the term ahead of all other hits.
#
# @param term [String] the search term (make_query may convert it in place)
# @param exact [Boolean] when true, only entries whose kanji or kana equal
#   the term are returned
# @param language [Object] accepted for interface compatibility; the current
#   implementation does not consult it
# @return [Array<Entry>] matching entries, exact matches first, otherwise in
#   the order the database returned them
# @raise [RuntimeError] if nothing exists at @index_path
def search(term, exact=false, language=LANGUAGE_DEFAULT)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

  results = []

  query = make_query(term, exact)

  @index.execute("SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?", query, JDict.config.num_results) do |row|
    entry = Entry.from_sql(row)

    # Array() accepts kanji stored either as one string or as a list of
    # strings (presumably a list, like kana -- confirm against Entry).
    is_exact_match = Array(entry.kanji).include?(term) ||
                     entry.kana.any? { |k| k == term }
    score = is_exact_match ? 1.0 : 0.0

    # add the result
    results << [score, entry] if !exact || is_exact_match
  end

  # Sort by score, highest first, and return only the entries. The original
  # comparator compared an element with itself, so nothing was ever ranked.
  # The row position is the tie-breaker because Ruby's sort is not stable.
  results.each_with_index
         .sort_by { |(score, _entry), position| [-score, position] }
         .map { |(_score, entry), _position| entry }
end