Class: Clip::Tokenizer

Inherits:

Object

Object
Clip::Tokenizer

show all

Defined in: lib/clip/tokenizer.rb

Constant Summary collapse

INPUT_VECTOR_SIZE =

Instance Method Summary collapse

Constructor Details

#initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz") ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/clip/tokenizer.rb', line 8

# Builds the tokenizer state from a gzip-compressed BPE merges file.
#
# Sets up the byte <-> character tables, the token <-> id vocabulary, the
# merge-rank table, the BPE result cache and the pre-tokenisation regexp.
#
# @param bpe_path [String] path to the gzip-compressed merges file; defaults
#   to the vocabulary file located one directory above this source file
# @raise [SystemCallError] if the file cannot be opened
# @raise [Zlib::Error] if the file is not valid gzip data
def initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz")
  @byte_encoder = bytes_to_unicode
  @byte_decoder = @byte_encoder.invert

  # Block form guarantees the gzip stream (and its file handle) is closed
  # as soon as the contents have been read, even if reading raises.
  raw_merges = Zlib::GzipReader.open(bpe_path) { |gz| gz.read }

  # Skip the first line of the file and keep the following
  # 49152 - 256 - 2 merge rules.
  merges = raw_merges.split("\n")[1..(49152 - 256 - 2)]
  merges = merges.map { |merge| merge.split(" ") }

  # Vocabulary order: single byte characters, the same characters with the
  # end-of-word suffix, one entry per merge rule, then the two markers.
  vocab = @byte_encoder.values
  vocab += vocab.map { |v| "#{v}</w>" }
  merges.each { |merge| vocab << merge.join }
  vocab += [ "<|startoftext|>", "<|endoftext|>" ]

  @encoder = Hash[vocab.zip(0...vocab.size)]
  @decoder = @encoder.invert
  @bpe_ranks = Hash[merges.zip(0...merges.size)]

  # The markers are pre-seeded so #bpe returns them untouched.
  @cache = { "<|startoftext|>" => "<|startoftext|>", "<|endoftext|>" => "<|endoftext|>" }
  @pattern = Regexp.new("<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|\\p{L}+|\\p{N}|[^\\s\\p{L}\\p{N}]+", Regexp::IGNORECASE)
end

Instance Method Details

#basic_clean(text) ⇒ `Object`



56
57
58

# File 'lib/clip/tokenizer.rb', line 56

# Pre-cleaning hook that #encode applies before whitespace normalisation.
#
# Currently a pass-through: the argument is returned unchanged.
#
# @param text [String] raw input text (presumably a String, since #encode
#   passes the result on to #whitespace_clean)
# @return [Object] the very same object that was passed in
def basic_clean(text)
  text
end

#bpe(token) ⇒ `Object`

# File 'lib/clip/tokenizer.rb', line 64

# Runs the byte-pair merge loop on a single pre-token.
#
# The token is split into characters, the last one tagged with "</w>", and
# the best-ranked adjacent pair (lowest value in @bpe_ranks) is merged
# repeatedly until no known pair is left or one symbol remains. Results are
# memoised in @cache.
#
# @param token [String] pre-token already mapped through the byte encoder
# @return [String] the resulting symbols joined with single spaces
def bpe(token)
  return @cache[token] if @cache.key?(token)

  symbols = token.chars
  tail = symbols.pop
  symbols << "#{tail}</w>"

  loop do
    candidates = get_pairs(symbols)
    break if candidates.empty?

    best = candidates.min_by { |pair| @bpe_ranks.fetch(pair, Float::INFINITY) }
    # If even the best candidate is unranked, there is nothing left to merge.
    break unless @bpe_ranks.key?(best)

    left, right = best
    merged = []
    idx = 0
    # Left-to-right scan; occurrences are consumed greedily so they never overlap.
    while idx < symbols.size
      if symbols[idx] == left && symbols[idx + 1] == right
        merged << "#{left}#{right}"
        idx += 2
      else
        merged << symbols[idx]
        idx += 1
      end
    end

    symbols = merged
    break if symbols.size == 1
  end

  @cache[token] = symbols.join(" ")
end

#bytes_to_unicode ⇒ `Object`

# File 'lib/clip/tokenizer.rb', line 24

# Builds the table that maps every byte value (0-255) to a one-character
# UTF-8 string.
#
# Bytes in 33..126, 161..172 and 174..255 map to the character with the same
# codepoint. Every other byte is assigned, in ascending byte order, the next
# free codepoint starting at 256.
#
# @return [Hash{Integer => String}] 256 entries; the listed ranges come
#   first, followed by the remapped bytes
def bytes_to_unicode
  direct = [ *33..126, *161..172, *174..255 ]
  remapped = (0..255).to_a - direct

  byte_values = direct + remapped
  codepoints = direct + Array.new(remapped.size) { |offset| 256 + offset }

  byte_values.zip(codepoints.map { |cp| cp.chr(Encoding::UTF_8) }).to_h
end

#decode(tokens) ⇒ `Object`

# File 'lib/clip/tokenizer.rb', line 128

# Converts a sequence of token ids back into text.
#
# Each id is looked up in @decoder, the pieces are concatenated, every
# character is mapped back to its byte through @byte_decoder, and the
# end-of-word marker "</w>" is turned into a space.
#
# Ids missing from @decoder and characters missing from @byte_decoder are
# skipped silently.
#
# @param tokens [Array<Integer>] token ids
# @return [String] decoded text tagged as UTF-8 (each word is followed by a
#   space because of the "</w>" replacement)
def decode(tokens)
  text = tokens.map { |token| @decoder[token] }.join

  decoded_bytes = text.each_char.map { |c| @byte_decoder[c] }.compact

  # "</w>" must be replaced AFTER the byte lookup: a plain space is not a key
  # of @byte_decoder, so substituting first would make the lookup return nil
  # for every space and glue the words together. The replacement runs on the
  # binary string, so it is safe even if the bytes are not valid UTF-8.
  decoded_bytes.pack("C*").gsub("</w>", " ").force_encoding("utf-8")
end

#encode(text) ⇒ `Object`

# File 'lib/clip/tokenizer.rb', line 107

# Turns text into a fixed-length array of token ids.
#
# The text is cleaned (#basic_clean, #whitespace_clean), lower-cased, split
# with @pattern, byte-encoded, run through #bpe and mapped to ids via
# @encoder. The ids are wrapped in the start/end marker ids and handed to
# #pad_array.
#
# NOTE(review): #pad_array also truncates, so for very long inputs the end
# marker can be cut off — confirm whether callers rely on it being present.
#
# @param text [String] input text
# @return [Array<Integer>] token ids as returned by #pad_array
def encode(text)
  cleaned_text = whitespace_clean(basic_clean(text)).downcase

  # The marker ids are added directly instead of concatenating the marker
  # strings onto the text: when the text ends in punctuation, the catch-all
  # alternative of @pattern would otherwise swallow the leading "<|" of
  # "<|endoftext|>" (e.g. "cat." + marker scans as ".<|", "endoftext", "|>"),
  # and the end marker would never be emitted.
  bpe_tokens = [ @encoder["<|startoftext|>"] ]

  cleaned_text.scan(@pattern) do |token|
    # Map each UTF-8 byte to its stand-in character before applying BPE.
    encoded = token.encode("utf-8").bytes.map { |b| @byte_encoder[b] }.join
    bpe(encoded).split(" ").each { |subtok| bpe_tokens << @encoder[subtok] }
  end

  bpe_tokens << @encoder["<|endoftext|>"]
  pad_array(bpe_tokens)
end

#get_pairs(word) ⇒ `Object`

# File 'lib/clip/tokenizer.rb', line 46

# Collects every pair of adjacent symbols in a word.
#
# @param word [Array<String>] symbols of the word, in order
# @return [Set<Array(String, String)>] unique [left, right] pairs in order of
#   first appearance; empty when the word has fewer than two symbols
def get_pairs(word)
  # each_cons yields nothing for arrays shorter than two elements, so an
  # empty word gives an empty set instead of raising on word[1..-1] being nil.
  Set.new(word.each_cons(2))
end

#pad_array(array) ⇒ `Object`



139
140
141

# File 'lib/clip/tokenizer.rb', line 139

# Returns a copy of the array that is exactly INPUT_VECTOR_SIZE long.
#
# Longer arrays are truncated; shorter ones are right-padded with 0. The
# argument itself is left untouched (so frozen arrays are accepted too).
#
# @param array [Array] values to fit
# @return [Array] new array of length INPUT_VECTOR_SIZE
def pad_array(array)
  kept = array.first(INPUT_VECTOR_SIZE)
  kept + Array.new(INPUT_VECTOR_SIZE - kept.size, 0)
end

#whitespace_clean(text) ⇒ `Object`



60
61
62

# File 'lib/clip/tokenizer.rb', line 60

# Normalises whitespace: every run of whitespace becomes a single space and
# leading/trailing whitespace is removed.
#
# @param text [String] text to normalise
# @return [String] a new, normalised string
def whitespace_clean(text)
  collapsed = text.gsub(/\s+/, " ")
  collapsed.strip
end

Class: Clip::Tokenizer

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz") ⇒ Tokenizer

Instance Method Details

#basic_clean(text) ⇒ Object

#bpe(token) ⇒ Object

#bytes_to_unicode ⇒ Object

#decode(tokens) ⇒ Object

#encode(text) ⇒ Object

#get_pairs(word) ⇒ Object

#pad_array(array) ⇒ Object

#whitespace_clean(text) ⇒ Object

#initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz") ⇒ `Tokenizer`

#basic_clean(text) ⇒ `Object`

#bpe(token) ⇒ `Object`

#bytes_to_unicode ⇒ `Object`

#decode(tokens) ⇒ `Object`

#encode(text) ⇒ `Object`

#get_pairs(word) ⇒ `Object`

#pad_array(array) ⇒ `Object`

#whitespace_clean(text) ⇒ `Object`