Class: JDict::DictIndex

Inherits:
Object
  • Object
show all
Defined in:
lib/index.rb

Constant Summary collapse

LANGUAGE_DEFAULT =
JDict::JMDictConstants::Languages::ENGLISH
NUM_ENTRIES_TO_INDEX =
50
ENTITY_REGEX =
/<!ENTITY\s([^ ]*)\s\"(.*)">/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(index_path, dictionary_path = nil, lazy_loading = JDict.configuration.lazy_index_loading) ⇒ DictIndex

Initialize a full-text search index backend for JMdict

Parameters:

  • index_path (String)

    desired filesystem path where you’d like the *search index* stored

  • dictionary_path (String) (defaults to: nil)

    filesystem path of the JMdict dictionary file to index; when given, the file must already exist or an error is raised

  • lazy_loading (Boolean) (defaults to: JDict.configuration.lazy_index_loading)

    lazily load the index just when it’s needed, instead of building it ahead of time



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/index.rb', line 32

# Initialize a full-text search index backend for JMdict.
#
# Opens (creating it if necessary) the SQLite database "fts5.db" inside
# +index_path+, makes sure the +search+ table exists and, unless lazy
# loading is requested or the index already holds rows, builds the index.
#
# @param index_path [String] directory in which the search index is stored;
#   created if it does not exist
# @param dictionary_path [String, nil] path to the dictionary file; when
#   given, the file must already exist
# @param lazy_loading [Boolean] skip building the index in the constructor
# @raise [RuntimeError] if +index_path+ is nil or +dictionary_path+ is given
#   but no file exists there
def initialize(index_path, dictionary_path=nil, lazy_loading=JDict.configuration.lazy_index_loading)
  raise "Index path was nil" if index_path.nil?

  # File.exist? (not the removed File.exists? alias) keeps this working on Ruby >= 3.2
  if !dictionary_path.nil? && !File.exist?(dictionary_path)
    raise "Dictionary not found at path #{dictionary_path}"
  end

  @path = index_path
  @dictionary_path = dictionary_path
  @pos_hash = {}

  # create path if nonexistent
  FileUtils.mkdir_p(@path)
  db_file = File.join(@path, "fts5.db")

  # in debug mode always start from a fresh database file
  File.unlink(db_file) if JDict.configuration.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  # check whether the search table already contains rows
  already_built = built?

  # build the index right now if "lazy loading" isn't on and the index is
  # empty (debug mode always rebuilds)
  build unless lazy_loading || (already_built && !JDict.configuration.debug)

  # make the hash from abbreviated parts of speech to full definitions
  build_pos_hash
end

Instance Attribute Details

#path ⇒ Object (readonly)

Returns the value of attribute path.



27
28
29
# File 'lib/index.rb', line 27

# Reader for the @path instance variable.
def path; @path; end

Instance Method Details

#build(overwrite = false, dictionary_path = nil) ⇒ Integer

Builds the full-text search index

Parameters:

  • overwrite (Boolean) (defaults to: false)

    intended to force a build even if the index path already exists (the current implementation does not consult this flag)

  • dictionary_path (String) (defaults to: nil)

    path to the dictionary file

Returns:

  • (Integer)

    the number of indexed entries



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/index.rb', line 140

# Builds the full-text search index.
#
# Streams the dictionary XML through the reader returned by open_reader and
# inserts one row into the +search+ table for every entry element, all
# inside a single database transaction. When JDict.configuration.debug is
# set, reading stops after NUM_ENTRIES_TO_INDEX entries.
#
# @param overwrite [Boolean] accepted for backward compatibility; this
#   method does not consult it
# @param dictionary_path [String, nil] path to the dictionary file; when
#   given it replaces the path stored on the instance
# @return [Integer] the number of entries inserted by this call
# @raise [RuntimeError] if no dictionary path is known, the dictionary file
#   does not exist, or an entry without any kana reading is encountered
def build(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exist? (not the removed File.exists? alias) keeps this working on Ruby >= 3.2
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)

  puts "Building index..."

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  # components of an entry
  kanji, kana, senses = [], [], []
  glosses = {}
  parts_of_speech = []

  entries_added = 0

  begin
    @index.transaction do |db_transaction|

      # read until the end
      while reader.read

        # check what type of node we're currently on
        case reader.node_type

          # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SEQUENCE
            # TODO: add the sequence number to the entry. The value is not
            #       used yet; the call is kept so the reader is driven
            #       exactly as before.
            reader.next_text

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
          when JDict::JMDictConstants::Elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when JDict::JMDictConstants::Elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when JDict::JMDictConstants::Elements::GLOSS
            # the language must be read from the element node before moving
            # on to its text
            language = (reader.node.lang || LANGUAGE_DEFAULT).intern
            text = reader.next_text
            (glosses[language] ||= []) << text unless text.empty?

          when JDict::JMDictConstants::Elements::CROSSREFERENCE
            # cross references are not indexed; the text is discarded
            reader.next_text
          end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

          # end-of-element node
        when XML::Reader::TYPE_END_ELEMENT
          case reader.name

          when JDict::JMDictConstants::Elements::SENSE
            # build sense
            senses << Sense.new(parts_of_speech, glosses)

            # clear data for the next sense
            glosses = {}
            parts_of_speech = []

            # we're at the end of the entry element, so index it
          when JDict::JMDictConstants::Elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            insert_data = Entry.new(kanji, kana, senses).to_sql

            db_transaction.prepare("INSERT INTO search( kanji, kana, senses ) VALUES( :kanji, :kana, :senses );") do |stmt|
              stmt.execute( insert_data )
            end

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1

            # in debug mode only a small sample of the dictionary is indexed
            break if JDict.configuration.debug && entries_added >= NUM_ENTRIES_TO_INDEX
          end
        end
      end
    end
  ensure
    # Done reading & indexing; also release the reader when an error
    # aborted the transaction
    reader.close
  end

  entries_added
end

#build_pos_hash ⇒ Object

Creates the hash of part-of-speech symbols to full definitions from the dictionary



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/index.rb', line 279

# Creates the hash of part-of-speech symbols to full definitions from the
# <!ENTITY ...> declarations in the dictionary's DOCTYPE.
#
# The hash is built once: a later call returns the stored hash when it is
# already populated. A plain `@pos_hash ||= ...` cannot be used for this
# because an empty hash is truthy and would be treated as "already built".
#
# @return [Hash{Symbol => String}] part-of-speech symbol => description;
#   empty when no dictionary path is known or no entities were found
def build_pos_hash
  return @pos_hash unless @pos_hash.nil? || @pos_hash.empty?

  # nothing to read from yet; leave an empty hash so a later call can retry
  return (@pos_hash ||= {}) if @dictionary_path.nil?

  pos_hash = {}
  reader = open_reader(@dictionary_path)
  begin
    # stop at end of input instead of looping forever on a document that
    # has neither a DOCTYPE nor any element
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # the DOCTYPE is scanned as text with ENTITY_REGEX rather than by
        # walking its child nodes (the original author noted a random
        # segfault when iterating the children)
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # reached the first element: no DOCTYPE can follow
        break
      end
    end
  ensure
    reader.close
  end

  @pos_hash = pos_hash
end

#built? ⇒ Boolean

Returns:

  • (Boolean)


134
# File 'lib/index.rb', line 134

# Tells whether the search table already holds at least one row.
#
# @return [Boolean] true when the row count reported by the database is not 0
def built?
  row_count = @index.first_value_from("SELECT count(*) from search")
  row_count != 0
end

#create_schema ⇒ Object

Creates the SQL schema for the Amalgalite database



65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/index.rb', line 65

# Creates the SQL schema for the Amalgalite database.
#
# Adds the FTS5 virtual table +search+ (columns kanji, kana, senses) when
# the database does not have it yet, then reloads the cached schema. Does
# nothing when the table is already present.
def create_schema
  schema = @index.schema
  unless schema.tables['search']
    @index.execute_batch <<-SQL
    CREATE VIRTUAL TABLE search USING fts5(
        kanji,
        kana,
        senses
    );
    SQL
    @index.reload_schema!
  end
end

#get_pos(pos) ⇒ String

Retrieves the definition of a part-of-speech from its abbreviation

Parameters:

  • pos (String)

    the abbreviation for the part-of-speech

Returns:

  • (String)

    the full description of the part-of-speech



319
320
321
322
# File 'lib/index.rb', line 319

# Retrieves the definition of a part-of-speech from its abbreviation.
#
# Asks build_pos_hash to populate the lookup table first when it is empty.
#
# @param pos [String] the abbreviation for the part-of-speech
# @return [String, nil] the stored description, or nil when the abbreviation
#   is not in the table
def get_pos(pos)
  @pos_hash.empty? && build_pos_hash
  lookup_key = pos_to_sym(pos)
  @pos_hash[lookup_key]
end

#open_reader(dictionary_path) ⇒ XML::Reader

Creates an XML::Reader object for the given path

Parameters:

  • dictionary_path (String)

    path to the dictionary file

Returns:

  • (XML::Reader)

    the reader for the given dictionary



267
268
269
270
271
272
273
274
275
276
# File 'lib/index.rb', line 267

# Creates an XML::Reader object for the given path.
#
# The former `Dir.chdir(Dir.pwd) { ... }` wrapper was dropped: changing to
# the current directory has no effect on how the path is resolved, yet a
# chdir block is process-wide and conflicts with chdir blocks running in
# other threads.
#
# @param dictionary_path [String] path to the dictionary file
# @return [XML::Reader] the reader for the given dictionary
# @raise [RuntimeError] if XML::Reader.file returns nil
def open_reader(dictionary_path)
  jmdict_path = File.join(dictionary_path)
  # create a reader for JMdict
  reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8)
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?
  reader
end

#pos_to_sym(entity) ⇒ Symbol

Converts a part-of-speech entity reference string into a symbol

Parameters:

  • entity (String)

    the entity reference string

Returns:

  • (Symbol)

    the part-of-speech symbol



312
313
314
# File 'lib/index.rb', line 312

# Converts a part-of-speech entity reference string into a symbol.
#
# Every hyphen is replaced with an underscore before the conversion.
#
# @param entity [String] the entity reference string
# @return [Symbol] the part-of-speech symbol
def pos_to_sym(entity)
  underscored = entity.tr('-', '_')
  underscored.to_sym
end

#rebuild ⇒ Object



259
260
261
262
# File 'lib/index.rb', line 259

# Builds the index, refusing to run when something already exists at the
# index path.
#
# NOTE(review): despite its name this method raises whenever @path exists
# rather than replacing the existing index — confirm that this guard is the
# intended behaviour.
#
# @return [Object] whatever #build returns
# @raise [RuntimeError] if a file or directory already exists at @path
def rebuild
  # File.exist? (not the removed File.exists? alias) keeps this working on Ruby >= 3.2
  raise "Index already exists at path #{@path}" if File.exist? @path
  build
end

#search(term, language = LANGUAGE_DEFAULT, exact = false) ⇒ Array(Entry)

Returns the search results as an array of Entry

Parameters:

  • term (String)

    the search string

  • language (Symbol) (defaults to: LANGUAGE_DEFAULT)

    the language to return results in (accepted, but not currently used by the implementation)

Returns:

  • (Array(Entry))

    the results of the search



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/index.rb', line 83

# Returns the search results as an array of Entry, best match first.
#
# Full-width katakana in the term is converted to hiragana, then the term
# is matched as a quoted FTS5 string against the kanji, kana and senses
# columns. Unless +exact+ is set, the term is treated as a prefix.
#
# @param term [String] the search string; it is not modified
# @param language [Symbol] accepted for backward compatibility; this method
#   does not use it
# @param exact [Boolean] when false a trailing "*" turns the term into a
#   prefix query
# @return [Array<Entry>] at most JDict.configuration.num_results entries,
#   ordered by bm25 relevance
# @raise [RuntimeError] if nothing exists at the index path
def search(term, language=LANGUAGE_DEFAULT, exact=false)
  # File.exist? (not the removed File.exists? alias) keeps this working on Ruby >= 3.2
  raise "Index not found at path #{@path}" unless File.exist? @path

  @entries_cache = []

  # convert full-width katakana to hiragana; tr (not tr!) leaves the
  # caller's string untouched and also works on frozen strings
  # TODO: convert half-width katakana to hiragana
  normalized = term.tr('ァ-ン','ぁ-ん')

  # inside an FTS5 string a double quote is escaped by doubling it;
  # without this a term containing `"` produces a malformed query
  quoted = normalized.gsub('"', '""')

  query = "{kanji kana senses} : \"#{quoted}\""
  query += "*" unless exact

  # ORDER BY bm25() makes LIMIT keep the most relevant rows (bm25 returns
  # smaller values for better matches) instead of an arbitrary subset
  sql = "SELECT kanji, kana, senses, bm25(search) as score FROM search " \
        "WHERE search MATCH ? ORDER BY bm25(search) LIMIT ?"

  results = []
  @index.execute(sql, query, JDict.configuration.num_results) do |row|
    results << Entry.from_sql(row)
  end
  results
end