Class: JDict::DictIndex
- Inherits:
-
Object
- Object
- JDict::DictIndex
- Defined in:
- lib/index.rb
Constant Summary collapse
- LANGUAGE_DEFAULT =
JDict::JMDictConstants::Languages::ENGLISH
- NUM_ENTRIES_TO_INDEX =
50
- ENTITY_REGEX =
/<!ENTITY\s([^ ]*)\s\"(.*)">/
Instance Attribute Summary collapse
-
#path ⇒ Object
readonly
Returns the value of attribute path.
Instance Method Summary collapse
-
#build(overwrite = false, dictionary_path = nil) ⇒ Integer
Builds the full-text search index.
-
#build_pos_hash ⇒ Object
Creates the hash of part-of-speech symbols to full definitions from the dictionary.
- #built? ⇒ Boolean
-
#create_schema ⇒ Object
Creates the SQL schema for the Amalgalite database.
-
#get_pos(pos) ⇒ String
Retrieves the definition of a part-of-speech from its abbreviation.
-
#initialize(index_path, dictionary_path = nil, lazy_loading = JDict.configuration.lazy_index_loading) ⇒ DictIndex
constructor
Initialize a full-text search index backend for JMdict.
-
#open_reader(dictionary_path) ⇒ XML::Reader
Creates an XML::Reader object for the given path.
-
#pos_to_sym(entity) ⇒ Symbol
Converts a part-of-speech entity reference string into a symbol.
- #rebuild ⇒ Object
-
#search(term, language = LANGUAGE_DEFAULT, exact = false) ⇒ Array(Entry)
Returns the search results as an array of Entry.
Constructor Details
#initialize(index_path, dictionary_path = nil, lazy_loading = JDict.configuration.lazy_index_loading) ⇒ DictIndex
Initialize a full-text search index backend for JMdict
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/index.rb', line 32
#
# Initialize a full-text search index backend for JMdict.
#
# Creates the index directory if needed, opens the database file "fts5.db"
# inside it, makes sure the search table exists, optionally builds the index,
# and finally calls #build_pos_hash.
#
# @param index_path [String] directory holding the index database (created if missing)
# @param dictionary_path [String, nil] path to the JMdict XML file
# @param lazy_loading [Boolean] when truthy, the constructor never builds the index
# @raise [RuntimeError] if +index_path+ is nil, or if +dictionary_path+ is given
#   but no file exists there
def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
  raise "Index path was nil" if index_path.nil?

  # File.exist? is used because the File.exists? alias is gone in Ruby 3.2+.
  if !dictionary_path.nil? && !File.exist?(dictionary_path)
    raise "Dictionary not found at path #{dictionary_path}"
  end

  @path = index_path
  @dictionary_path = dictionary_path
  @pos_hash = {}

  # create path if nonexistent
  FileUtils.mkdir_p(@path)
  db_file = File.join(@path, "fts5.db")

  # in debug mode every run starts from a fresh database file
  File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  # remember whether the search table already holds rows before deciding to build
  already_built = built?

  # build right away unless lazy loading is on, or the index is already
  # populated and we are not in debug mode
  build unless lazy_loading || (already_built && !JDict.configuration.debug)

  # make the hash from abbreviated parts of speech to full definitions
  build_pos_hash
end
Instance Attribute Details
#path ⇒ Object (readonly)
Returns the value of attribute path.
27 28 29 |
# File 'lib/index.rb', line 27
#
# Returns the value of attribute path.
#
# @return [Object] whatever is currently stored in @path
def path
  @path
end
Instance Method Details
#build(overwrite = false, dictionary_path = nil) ⇒ Integer
Builds the full-text search index
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 |
# File 'lib/index.rb', line 140
#
# Builds the full-text search index.
#
# Streams the dictionary XML through the reader returned by #open_reader and,
# inside one database transaction, inserts one row into the +search+ table for
# every completed entry element.
#
# @param overwrite [Boolean] accepted for compatibility; not consulted by this method
# @param dictionary_path [String, nil] when given, replaces the stored dictionary path
# @return [Integer] number of entries inserted
# @raise [RuntimeError] if no dictionary path is known, the dictionary file does
#   not exist, or an entry without any kana reading is encountered
def build(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exist? is used because the File.exists? alias is gone in Ruby 3.2+.
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)
  entries_added = 0

  begin
    puts "Building index..."

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    # components of the entry / sense currently being assembled
    kanji, kana, senses = [], [], []
    glosses = {}
    parts_of_speech = []

    @index.transaction do |db_transaction|
      # read until the end
      while reader.read
        case reader.node_type

        # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SEQUENCE
            # TODO: add the entry sequence number to the entry; the text is
            # still read here, as before, but its value is not used yet
            reader.next_text

          # TODO: Raise an exception if reader.next_text.empty? inside the when's
          # JMdict shouldn't have any empty elements, I believe.
          when JDict::JMDictConstants::Elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when JDict::JMDictConstants::Elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when JDict::JMDictConstants::Elements::GLOSS
            # the language is taken from the gloss node before its text is read
            language = (reader.node.lang || LANGUAGE_DEFAULT).intern
            text = reader.next_text
            (glosses[language] ||= []) << text unless text.empty?

          when JDict::JMDictConstants::Elements::CROSSREFERENCE
            # cross references are read but not indexed
            reader.next_text
          end

        # XML entity references are treated as a different node type
        # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

        # end-of-element node
        when XML::Reader::TYPE_END_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SENSE
            senses << Sense.new(parts_of_speech, glosses)

            # clear data for the next sense
            glosses = {}
            parts_of_speech = []

          # we're at the end of the entry element, so index it
          when JDict::JMDictConstants::Elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            insert_data = Entry.new(kanji, kana, senses).to_sql
            db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
              stmt.execute( insert_data )
            end

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1

            # debug runs only index the first NUM_ENTRIES_TO_INDEX entries
            break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
          end
        end
      end
    end
  ensure
    # Done reading & indexing; close the reader even when an error aborted the build
    reader.close
  end

  entries_added
end
#build_pos_hash ⇒ Object
Creates the hash of part-of-speech symbols to full definitions from the dictionary
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 |
# File 'lib/index.rb', line 279
#
# Creates the hash of part-of-speech symbols to full definitions from the
# dictionary.
#
# The <!ENTITY ...> declarations are scanned out of the dictionary's DOCTYPE
# node with ENTITY_REGEX. Each abbreviation is converted with #pos_to_sym and
# mapped to its full text. Reading stops at the DOCTYPE node, at the first
# element node, or at the end of the document, whichever comes first.
#
# @return [Hash{Symbol => String}] the part-of-speech table, also stored in @pos_hash
def build_pos_hash
  # An empty hash counts as "not built yet". A plain `@pos_hash ||= ...` would
  # never run when @pos_hash has been pre-seeded with {} (an empty hash is truthy).
  return @pos_hash unless @pos_hash.nil? || @pos_hash.empty?

  # Without a dictionary there is nothing to parse; keep an empty table.
  return (@pos_hash ||= {}) if @dictionary_path.nil?

  pos_hash = {}
  reader = open_reader(@dictionary_path)

  begin
    # Stop when read reports no more nodes instead of looping forever.
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # The entities are scanned from the node's string form; an earlier
        # attempt at iterating the node's children segfaulted randomly.
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # Reached the first element without finding a DOCTYPE node; give up.
        break
      end
    end
  ensure
    reader.close
  end

  @pos_hash = pos_hash
end
#built? ⇒ Boolean
134 |
# File 'lib/index.rb', line 134
#
# Tells whether the search table already contains rows.
#
# @return [Boolean] true when the row count reported by the database is not 0
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  row_count != 0
end
#create_schema ⇒ Object
Creates the SQL schema for the Amalgalite database
65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/index.rb', line 65
#
# Creates the SQL schema for the Amalgalite database.
#
# Does nothing when the schema already lists a +search+ table. Otherwise it
# creates the FTS5 virtual table with the columns kanji, kana and senses, then
# reloads the schema.
#
# @return [Object, nil] nil when the table already exists, otherwise the result
#   of reloading the schema
def create_schema
  return if @index.schema.tables['search']

  # The SQL is a heredoc; its terminator and the statements after it must stay
  # outside the string so that only the CREATE statement is sent to the database.
  @index.execute_batch <<-SQL
    CREATE VIRTUAL TABLE search USING fts5(
        kanji,
        kana,
        senses
    );
  SQL
  @index.reload_schema!
end
#get_pos(pos) ⇒ String
Retrieves the definition of a part-of-speech from its abbreviation
319 320 321 322 |
# File 'lib/index.rb', line 319
#
# Retrieves the definition of a part-of-speech from its abbreviation.
#
# Calls #build_pos_hash first when the table is still empty, then looks the
# abbreviation up under the key produced by #pos_to_sym.
#
# @param pos [String] part-of-speech abbreviation
# @return [String, nil] the stored definition, or nil when the key is absent
def get_pos(pos)
  table_is_empty = @pos_hash.empty?
  build_pos_hash if table_is_empty

  lookup_key = pos_to_sym(pos)
  @pos_hash[lookup_key]
end
#open_reader(dictionary_path) ⇒ XML::Reader
Creates an XML::Reader object for the given path
267 268 269 270 271 272 273 274 275 276 |
# File 'lib/index.rb', line 267
#
# Creates an XML::Reader object for the given path.
#
# The reader is opened directly. An earlier version wrapped this in
# `Dir.chdir(Dir.pwd) { ... }`, which never changed the directory but could
# still conflict with a chdir block that was already active.
#
# @param dictionary_path [String] path of the dictionary file (passed through File.join)
# @return [XML::Reader] a reader opened with UTF-8 encoding
# @raise [RuntimeError] if XML::Reader.file returns nil
def open_reader(dictionary_path)
  jmdict_path = File.join(dictionary_path)

  # create a reader for JMdict
  reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8)
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

  reader
end
#pos_to_sym(entity) ⇒ Symbol
Converts a part-of-speech entity reference string into a symbol
312 313 314 |
# File 'lib/index.rb', line 312
#
# Converts a part-of-speech entity reference string into a symbol.
#
# Every hyphen is replaced by an underscore before the conversion.
#
# @param entity [String] entity reference name, e.g. "v5k-s"
# @return [Symbol] the symbol form, e.g. :v5k_s
def pos_to_sym(entity)
  # tr is the idiomatic choice for a one-character-for-one-character swap
  entity.tr('-', '_').to_sym
end
#rebuild ⇒ Object
259 260 261 262 |
# File 'lib/index.rb', line 259
#
# Runs #build, refusing to do so when something already exists at the index path.
#
# NOTE(review): the guard raises when @path exists. Confirm that this is the
# intended condition for a method named "rebuild".
#
# @return [Object] whatever #build returns
# @raise [RuntimeError] if a file or directory exists at @path
def rebuild
  # File.exist? is used because the File.exists? alias is gone in Ruby 3.2+.
  raise "Index already exists at path #{@path}" if File.exist?(@path)

  build
end
#search(term, language = LANGUAGE_DEFAULT, exact = false) ⇒ Array(Entry)
Returns the search results as an array of Entry
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/index.rb', line 83
#
# Returns the search results as an array of Entry.
#
# Full-width katakana in the term is converted to hiragana, and the term is
# then matched as a quoted FTS5 string against the kanji, kana and senses
# columns. Rows are ordered by their bm25 score in SQL, so the LIMIT keeps the
# best-ranked rows and the entries come back in that order.
#
# @param term [String] the search term; the caller's string is left untouched
# @param language [Symbol] accepted for compatibility; not used by the query
# @param exact [Boolean] when falsy, "*" is appended to the quoted term
# @return [Array<Entry>] entries built from the matching rows via Entry.from_sql
# @raise [RuntimeError] if nothing exists at the index path
def search(term, language=LANGUAGE_DEFAULT, exact=false)
  # File.exist? is used because the File.exists? alias is gone in Ruby 3.2+.
  raise "Index not found at path #{@path}" unless File.exist?(@path)

  # legacy cache slot; still reset on every search, nothing in this method reads it
  @entries_cache = []

  # convert full-width katakana to hiragana
  # TODO: convert half-width katakana to hiragana
  # (non-destructive tr: the argument may be frozen or shared with the caller)
  hiragana_term = term.tr('ァ-ン','ぁ-ん')

  # inside an FTS5 string a double quote is written as two double quotes
  quoted_term = hiragana_term.gsub('"', '""')

  query = "{kanji kana senses} : \"#{quoted_term}\""
  query += "*" unless exact

  results = []
  @index.execute("SELECT kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? ORDER BY score LIMIT ?", query, JDict.configuration.num_results) do |row|
    results << Entry.from_sql(row)
  end

  results
end