Class: JDict::DictIndex

Inherits:
Object
  • Object
show all
Defined in:
lib/index.rb

Constant Summary collapse

LANGUAGE_DEFAULT =
JDict::JMDictConstants::Languages::ENGLISH
NUM_ENTRIES_TO_INDEX =
50
ENTITY_REGEX =
/<!ENTITY\s([^ ]*)\s\"(.*)">/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path) ⇒ DictIndex

Initialize a full-text search index backend for JMdict

Parameters:

  • path (String)

    path to the dictionary



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/index.rb', line 27

# Initialize a full-text search index backend for JMdict.
#
# Opens the Amalgalite database "fts5.db" located in the same directory as
# the dictionary, makes sure the schema exists, builds the index when the
# search table is empty, and loads the part-of-speech lookup table.
#
# @param path [String] path to the dictionary
# @raise [RuntimeError] if no file exists at +path+
def initialize(path)
  # Record the path under @path as well, so the +path+ reader does not
  # return nil.
  @path = path
  @dictionary_path = path
  @index_path = File.dirname(@dictionary_path)
  @pos_hash = {}

  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  raise "No dictionary found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  db_file = File.join(@index_path, "fts5.db")

  # In debug mode, always start from a fresh database file.
  File.unlink(db_file) if JDict.config.debug && File.exist?(db_file)

  @index = Amalgalite::Database.new(db_file)

  create_schema

  build_index unless built?

  # Make the hash from abbreviated parts of speech to full definitions.
  # A plain assignment is required here: `||=` would never run because
  # @pos_hash already holds an (empty, but truthy) hash.
  @pos_hash = build_pos_hash
end

Instance Attribute Details

#path ⇒ Object (readonly)

Returns the value of attribute path.



23
24
25
# File 'lib/index.rb', line 23

# Reader for the +path+ attribute.
#
# @return [Object] whatever is currently stored in @path
def path
  instance_variable_get(:@path)
end

Instance Method Details

#build_index(overwrite = false, dictionary_path = nil) ⇒ Integer

Builds the full-text search index

Parameters:

  • overwrite (Boolean) (defaults to: false)

    force a build even if the index path already exists

  • dictionary_path (String) (defaults to: nil)

    path to the dictionary file

Returns:

  • (Integer)

    the number of indexed entries



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/index.rb', line 115

# Builds the full-text search index.
#
# Streams the dictionary XML with an XML::Reader, collecting the sequence
# number, kanji, kana and senses of each entry, and inserts one row into the
# +search+ table per entry. All inserts run inside a single transaction.
#
# NOTE(review): +overwrite+ is accepted but not consulted anywhere in this
# method; existing rows are never removed before inserting.
#
# @param overwrite [Boolean] force a build even if the index path already exists
# @param dictionary_path [String] path to the dictionary file
# @return [Integer] the number of indexed entries
# @raise [RuntimeError] if no dictionary path is known, the dictionary file
#   does not exist, or an entry has no kana reading
def build_index(overwrite=false, dictionary_path=nil)
  @dictionary_path = dictionary_path unless dictionary_path.nil?
  raise "No dictionary path was provided" if @dictionary_path.nil?
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  raise "Dictionary not found at path #{@dictionary_path}" unless File.exist?(@dictionary_path)

  reader = open_reader(@dictionary_path)
  entries_added = 0

  begin
    puts "Building index..."

    # whenever there is a reader error, print its block parameters
    XML::Error.set_handler { |*args| p args }

    # components of an entry
    entry_sequence_num, kanji, kana, senses = 0, [], [], []
    glosses = {}
    parts_of_speech = []

    @index.transaction do |db_transaction|

      # read until the end
      while reader.read

        # check what type of node we're currently on
        case reader.node_type

          # start-of-element node
        when XML::Reader::TYPE_ELEMENT
          case reader.name
          when JDict::JMDictConstants::Elements::SEQUENCE
            entry_sequence_num = reader.next_text.to_i

            # TODO: Raise an exception if reader.next_text.empty? inside the when's
            #       JMdict shouldn't have any empty elements, I believe.
          when JDict::JMDictConstants::Elements::KANJI
            text = reader.next_text
            kanji << text unless text.empty?

          when JDict::JMDictConstants::Elements::KANA
            text = reader.next_text
            kana << text unless text.empty?

          when JDict::JMDictConstants::Elements::GLOSS
            # glosses are grouped by language; untagged glosses use the default
            language = reader.node.lang || LANGUAGE_DEFAULT
            language = language.intern
            text = reader.next_text
            unless text.empty?
              (glosses[language] ||= []) << text
            end

          when JDict::JMDictConstants::Elements::CROSSREFERENCE
            # Cross-references are not indexed. The call is kept because the
            # original made it; presumably it advances the reader - TODO confirm.
            reader.next_text
          end

          # XML entity references are treated as a different node type
          # the parent node of the entity reference itself has the actual tag name
        when XML::Reader::TYPE_ENTITY_REFERENCE
          if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
            text = reader.name
            parts_of_speech << text unless text.empty?
          end

          # end-of-element node
        when XML::Reader::TYPE_END_ELEMENT
          case reader.name

          when JDict::JMDictConstants::Elements::SENSE
            # build sense
            senses << Sense.new(parts_of_speech, glosses)

            # clear data for the next sense
            glosses = {}
            parts_of_speech = []

            # we're at the end of the entry element, so index it
          when JDict::JMDictConstants::Elements::ENTRY
            raise "No kana found for this entry!" if kana.empty?

            #index
            insert_data = Entry.new(entry_sequence_num, kanji, kana, senses).to_sql

            db_transaction.prepare("INSERT INTO search( sequence_number, kanji, kana, senses ) VALUES( :sequence_number, :kanji, :kana, :senses );") do |stmt|
              stmt.execute( insert_data )
            end

            # clear data for the next entry
            kanji, kana, senses = [], [], []

            entries_added += 1
          end
        end
      end
    end
  ensure
    # Done reading & indexing; release the reader even when indexing failed.
    reader.close
  end

  # Return the documented entry count rather than the result of reader.close.
  entries_added
end

#build_pos_hash ⇒ Object

Creates the hash of part-of-speech symbols to full definitions from the dictionary



242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/index.rb', line 242

# Creates the hash of part-of-speech symbols to full definitions from the
# dictionary.
#
# Reads the dictionary until the document type node is reached, then scans
# its text with ENTITY_REGEX; each entity name is converted with pos_to_sym
# and mapped to the entity's replacement text. Reading also stops at the
# first element node or at the end of the input.
#
# @return [Hash{Symbol => String}] part-of-speech symbol to full description;
#   empty when no document type node precedes the first element
def build_pos_hash
  pos_hash = {}
  reader = open_reader(@dictionary_path)
  begin
    # Looping on the result of #read ends the scan at end of input; the
    # previous `until done` loop never terminated in that case.
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # The entities are parsed from the node's string form because
        # iterating over its children segfaults.
        reader.node.to_s.scan(ENTITY_REGEX).each do |abbrev, full|
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # The document body has started, so there is no DOCTYPE left to find.
        break
      end
    end
  ensure
    # The reader is only needed for this scan; do not leave it open.
    reader.close
  end
  pos_hash
end

#built? ⇒ Boolean

Returns:

  • (Boolean)


64
65
66
# File 'lib/index.rb', line 64

# Tells whether the +search+ table already holds rows.
#
# @return [Boolean] false when the row count reported by the database is 0,
#   true otherwise
def built?
  row_count = @index.first_value_from( "SELECT count(*) from search" )
  !(row_count == 0)
end

#create_schema ⇒ Object

Creates the SQL schema for the Amalgalite database



49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/index.rb', line 49

# Creates the SQL schema for the Amalgalite database.
#
# Does nothing when the schema already lists a +search+ table; otherwise
# creates the FTS5 virtual table and reloads the cached schema.
def create_schema
  return if @index.schema.tables['search']

  ddl = <<-SQL
    CREATE VIRTUAL TABLE search USING fts5(
        sequence_number,
        kanji,
        kana,
        senses
    );
  SQL
  @index.execute_batch(ddl)
  @index.reload_schema!
end

#get_pos(pos) ⇒ String

Retrieves the definition of a part-of-speech from its abbreviation

Parameters:

  • pos (String)

    the abbreviation for the part-of-speech

Returns:

  • (String)

    the full description of the part-of-speech



280
281
282
283
# File 'lib/index.rb', line 280

# Retrieves the definition of a part-of-speech from its abbreviation.
#
# @param pos [String] the abbreviation for the part-of-speech
# @return [String, nil] the full description of the part-of-speech, or nil
#   when the abbreviation is not in the table
def get_pos(pos)
  # Store the lazily built table; previously the result of build_pos_hash was
  # thrown away, so the lookup always ran against the empty hash.
  @pos_hash = build_pos_hash if @pos_hash.nil? || @pos_hash.empty?
  @pos_hash[pos_to_sym(pos)]
end

#make_query(term, exact) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/index.rb', line 68

# Builds the FTS5 MATCH expression for a search term.
#
# Full-width katakana in the term is converted to hiragana first. A term
# starting with "seq:" is matched against the sequence_number column only;
# any other term is matched against the kanji, kana and senses columns, as a
# prefix query unless +exact+ is set.
#
# @param term [String] the search string; converted in place when it is not
#   frozen, as before
# @param exact [Boolean] when false, a trailing "*" makes it a prefix query
# @return [String] the FTS5 query string
def make_query(term, exact)
  # convert full-width katakana to hiragana
  # TODO: convert half-width katakana to hiragana
  normalized = term.tr('ァ-ン','ぁ-ん')
  # Keep the historical in-place conversion for mutable strings, but do not
  # blow up with FrozenError when the caller passes a frozen string.
  term.replace(normalized) unless term.frozen?

  if normalized.start_with?('seq:')
    "sequence_number : \"#{escape_fts5_string(normalized[4..-1])}\""
  else
    query = "{kanji kana senses} : \"#{escape_fts5_string(normalized)}\""
    query += "*" unless exact
    query
  end
end

# Escapes a value for use inside a double-quoted FTS5 string: embedded double
# quotes are doubled so they cannot terminate the string early.
def escape_fts5_string(text)
  text.gsub('"', '""')
end

#open_reader(dictionary_path) ⇒ XML::Reader

Creates an XML::Reader object for the given path

Parameters:

  • dictionary_path (String)

    path to the dictionary file

Returns:

  • (XML::Reader)

    the reader for the given dictionary



230
231
232
233
234
235
236
237
238
239
# File 'lib/index.rb', line 230

# Creates an XML::Reader object for the given path.
#
# @param dictionary_path [String] path to the dictionary file
# @return [XML::Reader] the reader for the given dictionary
# @raise [RuntimeError] if XML::Reader.file returns nil
def open_reader(dictionary_path)
  # The former Dir.chdir(Dir.pwd) wrapper changed to the directory the process
  # was already in; it only added a failure mode when another thread was
  # inside its own chdir block, so the reader is now created directly.
  jmdict_path = File.path(dictionary_path)
  reader = XML::Reader.file(jmdict_path, :encoding => XML::Encoding::UTF_8) # create a reader for JMdict
  raise "Failed to create XML::Reader for #{dictionary_path}!" if reader.nil?

  reader
end

#pos_to_sym(entity) ⇒ Symbol

Converts a part-of-speech entity reference string into a symbol

Parameters:

  • entity (String)

    the entity reference string

Returns:

  • (Symbol)

    the part-of-speech symbol



273
274
275
# File 'lib/index.rb', line 273

# Converts a part-of-speech entity reference string into a symbol.
#
# Hyphens are replaced by underscores before the string is interned.
#
# @param entity [String] the entity reference string
# @return [Symbol] the part-of-speech symbol
def pos_to_sym(entity)
  underscored = entity.tr('-', '_')
  underscored.intern
end

#rebuild_index ⇒ Object



222
223
224
225
# File 'lib/index.rb', line 222

# Builds the index, refusing to do so when something already exists at the
# index path.
#
# NOTE(review): the constructor sets @index_path to the directory containing
# the dictionary, which normally exists, so this guard looks like it fires on
# every call - confirm the intended check.
#
# @return [Object] whatever build_index returns
# @raise [RuntimeError] if @index_path exists
def rebuild_index
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  raise "Index already exists at path #{@index_path}" if File.exist?(@index_path)

  build_index
end

#search(term, exact = false, language = LANGUAGE_DEFAULT) ⇒ Array(Entry)

Returns the search results as an array of Entry

Parameters:

  • term (String)

    the search string

  • language (Symbol) (defaults to: LANGUAGE_DEFAULT)

    the language to return results in

Returns:

  • (Array(Entry))

    the results of the search



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/index.rb', line 87

# Returns the search results as an array of Entry.
#
# Runs an FTS5 MATCH query built by make_query, limited to
# JDict.config.num_results rows. An entry counts as an exact match when one
# of its kanji or kana forms equals +term+. With +exact+ set only exact
# matches are returned; otherwise exact matches are listed first and the
# remaining entries keep the order in which the database returned them.
#
# NOTE(review): +language+ is accepted but not used by this method.
#
# @param term [String] the search string
# @param exact [Boolean] return only entries whose kanji or kana equals +term+
# @param language [Symbol] the language to return results in
# @return [Array(Entry)] the results of the search
# @raise [RuntimeError] if nothing exists at the index path
def search(term, exact=false, language=LANGUAGE_DEFAULT)
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
  raise "Index not found at path #{@index_path}" unless File.exist?(@index_path)

  query = make_query(term, exact)
  sql = "SELECT sequence_number, kanji, kana, senses, bm25(search) as score FROM search WHERE search MATCH ? LIMIT ?"

  scored = []
  @index.execute(sql, query, JDict.config.num_results) do |row|
    entry = Entry.from_sql(row)

    # Array() accepts both a single string and a list of forms; comparing a
    # list of kanji directly against the term could never be true.
    is_exact_match = Array(entry.kanji).include?(term) || Array(entry.kana).include?(term)
    next if exact && !is_exact_match

    scored << [is_exact_match ? 1.0 : 0.0, entry]
  end

  # Sort by score, best first. The original comparator compared each pair's
  # first element with itself, so it never reordered anything. The row
  # position is used as a tie-breaker to keep the sort stable.
  scored.each_with_index
        .sort_by { |(score, _entry), position| [-score, position] }
        .map { |(_score, entry), _position| entry }
end