Class: Score

Inherits:
Object show all
Defined in:
lib/selecta.rb

Constant Summary collapse

BOUNDARY_CHARS =

A word boundary character is any ASCII character that’s not alphanumeric. This isn’t strictly correct: characters like ZERO WIDTH NON-JOINER, non-Latin punctuation, etc. will be incorrectly treated as non-boundary characters. This is necessary for performance: even building a Set of boundary characters based only on the input text is prohibitively slow (2-3 seconds for 80,000 input paths on a 2014 MacBook Pro).

(0..127).map(&:chr).select do |char|
  char !~ /[A-Za-z0-9_]/
end.to_set

Class Method Summary collapse

Class Method Details

.each_index_of_char_in_string(string, char) ⇒ Object

Find all occurrences of the character in the string, returning their indexes.



433
434
435
436
437
438
439
440
441
442
# File 'lib/selecta.rb', line 433

def each_index_of_char_in_string(string, char)
  index = 0
  while index
    index = string.index(char, index)
    if index
      yield index
      index += 1
    end
  end
end

.find_end_of_match(string, chars, score, first_index) ⇒ Object

Find each of the characters in the string, moving strictly left to right.



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
# File 'lib/selecta.rb', line 445

def find_end_of_match(string, chars, score, first_index)
  last_index = first_index

  # Remember the type of the last character match for special scoring.
  last_type = nil

  chars.each do |this_char|
    # Where's the next occurrence of this character? The optimal algorithm
    # would consider all instances of query character, but that's slower
    # than this eager method.
    index = string.index(this_char, last_index + 1)

    # This character doesn't occur in the string, so this can't be a match.
    return [nil, nil] unless index

    if index == last_index + 1
      # This matching character immediately follows the last matching
      # character. The first two sequential characters score; subsequent
      # ones don't.
      if last_type != :sequential
        last_type = :sequential
        score += 1
      end
    # This character follows a boundary character.
    elsif BOUNDARY_CHARS.include?(string[index - 1])
      if last_type != :boundary
        last_type = :boundary
        score += 1
      end
    # This character isn't special.
    else
      last_type = :normal
      score += index - last_index
    end

    last_index = index
  end

  [score, last_index]
end

.score(string, query_chars) ⇒ Object



405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/selecta.rb', line 405

def score(string, query_chars)
  query_chars -= [" "] # Change by Jonas Müller
  first_char, *rest = query_chars

  # Keep track of the best match that we've seen. This is uglier than
  # building a list of matches and then sorting them, but it's faster.
  best_score = Float::INFINITY
  best_range = nil

  # Iterate over each instance of the first query character. E.g., if we're
  # querying the string "axbx" for "x", we'll start at index 1 and index 3.
  each_index_of_char_in_string(string, first_char) do |first_index|
    score = 1

    # Find the best score starting at this index.
    score, last_index = find_end_of_match(string, rest, score, first_index)

    # Did we do better than we have for the best starting point so far?
    if last_index && score < best_score
      best_score = score
      best_range = (first_index..last_index)
    end
  end

  [best_score, best_range]
end