Class: Clip::Tokenizer

Inherits: Object
Defined in: lib/clip/tokenizer.rb
Constant Summary

INPUT_VECTOR_SIZE = 77
Instance Method Summary

- #basic_clean(text) ⇒ Object
- #bpe(token) ⇒ Object
- #bytes_to_unicode ⇒ Object
- #decode(tokens) ⇒ Object
- #encode(text) ⇒ Object
- #get_pairs(word) ⇒ Object
- #initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz") ⇒ Tokenizer (constructor): A new instance of Tokenizer.
- #pad_array(array) ⇒ Object
- #whitespace_clean(text) ⇒ Object
Constructor Details
#initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz") ⇒ Tokenizer
Returns a new instance of Tokenizer.
# File 'lib/clip/tokenizer.rb', line 8

def initialize(bpe_path = __dir__ + "/../bpe_simple_vocab_16e6.txt.gz")
  # Reversible byte <-> printable-unicode mapping (see #bytes_to_unicode)
  @byte_encoder = bytes_to_unicode
  @byte_decoder = @byte_encoder.invert

  # Load the BPE merge rules, skipping the header line
  merges = Zlib::GzipReader.open(bpe_path).read.split("\n")[1..(49152 - 256 - 2)]
  merges = merges.map { |merge| merge.split(" ") }

  # Build the vocabulary: single characters, their end-of-word variants,
  # merged pairs, and the two special tokens
  vocab = @byte_encoder.values
  vocab += vocab.map { |v| "#{v}</w>" }
  merges.each { |merge| vocab << merge.join }
  vocab += [ "<|startoftext|>", "<|endoftext|>" ]

  @encoder = Hash[vocab.zip(0...vocab.size)]
  @decoder = @encoder.invert
  @bpe_ranks = Hash[merges.zip(0...merges.size)]
  @cache = { "<|startoftext|>" => "<|startoftext|>", "<|endoftext|>" => "<|endoftext|>" }

  # Splits text into special tokens, common contractions, letter runs,
  # single digits, and punctuation runs
  @pattern = Regexp.new("<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|\\p{L}+|\\p{N}|[^\\s\\p{L}\\p{N}]+", Regexp::IGNORECASE)
end
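A minimal usage sketch, assuming the gem is loaded via require "clip" (the exact require path may differ in your setup):

  require "clip"

  tokenizer = Clip::Tokenizer.new
  tokens = tokenizer.encode("a photo of a cat")
  tokens.size # => 77, always padded/truncated to INPUT_VECTOR_SIZE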
Instance Method Details
#basic_clean(text) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 56

def basic_clean(text)
  # Currently a passthrough; a hook for text normalization before tokenizing
  text
end
#bpe(token) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 64

def bpe(token)
  return @cache[token] if @cache.key?(token)

  # Split into characters, marking the last one as end-of-word
  word = token.chars[0..-2] + [ "#{token[-1]}</w>" ]
  pairs = get_pairs(word)

  until pairs.empty?
    # Merge the pair with the lowest (best) rank first
    bigram = pairs.min_by { |pair| @bpe_ranks.fetch(pair, Float::INFINITY) }
    break unless @bpe_ranks.key?(bigram)

    first, second = bigram
    new_word = []
    i = 0
    while i < word.size
      # Find the next occurrence of `first` at or after position i
      j = word[i..-1]&.index(first)
      j = j.nil? ? nil : j + i
      if j.nil?
        new_word.concat(word[i..-1])
        break
      else
        new_word.concat(word[i...j])
        if word[j] == first && word[j + 1] == second
          new_word << "#{first}#{second}"
          i = j + 2
        else
          new_word << word[j]
          i = j + 1
        end
      end
    end
    word = new_word
    break if word.size == 1
    pairs = get_pairs(word)
  end

  result = word.join(" ")
  @cache[token] = result
  result
end
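A sketch of what #bpe produces. The exact subwords depend on the learned merges in the vocabulary file, so the output shown here is illustrative:

  tokenizer = Clip::Tokenizer.new
  tokenizer.bpe("hello")
  # => e.g. "hello</w>" — common words usually merge into a single
  #    end-of-word unit; rarer strings come back as several
  #    space-separated subwords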
#bytes_to_unicode ⇒ Object
# File 'lib/clip/tokenizer.rb', line 24

def bytes_to_unicode
  # Define base ranges for printable ASCII and extended Unicode
  bs = (33..126).to_a + (161..172).to_a + (174..255).to_a # Printable characters
  cs = bs.dup # Start with the same set of characters for mapping

  # Map remaining bytes (0–255) to unique Unicode codepoints starting from 256
  n = 0
  (0...256).each do |b|
    unless bs.include?(b)
      bs << b         # Add the byte to the mapping
      cs << (256 + n) # Assign a unique Unicode codepoint
      n += 1          # Increment the counter for unmapped bytes
    end
  end

  # Convert codepoints to UTF-8 strings
  cs = cs.map { |cp| cp.chr(Encoding::UTF_8) }

  # Create a hash mapping bytes (0–255) to Unicode strings
  Hash[bs.zip(cs)]
end
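Printable bytes map to themselves, while unprintable bytes are shifted to codepoints from 256 upward. Byte 32 (space) is the 33rd unprintable byte counted from zero, so it lands on U+0120:

  tokenizer = Clip::Tokenizer.new
  encoder = tokenizer.bytes_to_unicode
  encoder[65] # => "A" (printable bytes map to themselves)
  encoder[32] # => "Ġ" (256 + 32 = U+0120)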
#decode(tokens) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 128

def decode(tokens)
  # Map ids back to vocabulary strings and restore word boundaries
  text = tokens.map { |token| @decoder[token] }.join
  text = text.gsub("</w>", " ")

  # Undo the byte-to-unicode mapping to recover the original UTF-8 bytes
  decoded_bytes = text.each_char.map do |c|
    @byte_decoder[c]
  end
  decoded_bytes.compact.pack("C*").force_encoding("utf-8")
end
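A round-trip sketch. Note that #pad_array fills with token id 0 and #decode does not strip padding or the <|startoftext|>/<|endoftext|> markers, so you may want to filter those ids before decoding:

  tokenizer = Clip::Tokenizer.new
  ids = tokenizer.encode("a photo of a cat")
  content = ids.reject(&:zero?) # drop zero padding
  tokenizer.decode(content)
  # => something like "<|startoftext|>a photo of a cat <|endoftext|>"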
#encode(text) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 107

def encode(text)
  bpe_tokens = []
  cleaned_text = whitespace_clean(basic_clean(text)).downcase
  cleaned_text = "<|startoftext|>#{cleaned_text}<|endoftext|>"

  cleaned_text.scan(@pattern) do |token|
    # Map the token's UTF-8 bytes to their printable unicode stand-ins
    utf8_bytes = token.encode("utf-8").bytes
    mapped_chars = utf8_bytes.map do |b|
      @byte_encoder[b]
    end
    encoded = mapped_chars.join

    # Apply BPE merges, then look up each resulting subword's id
    bpe_subtokens = bpe(encoded).split(" ")
    bpe_subtokens.each do |subtok|
      bpe_tokens << @encoder[subtok]
    end
  end

  pad_array(bpe_tokens)
end
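The result is always a fixed-length id vector. With the standard CLIP vocabulary the two special tokens sit at the top of the id space (49406 and 49407), though the exact ids follow from the vocabulary file:

  tokenizer = Clip::Tokenizer.new
  ids = tokenizer.encode("a photo of a cat")
  ids.first    # => 49406 (<|startoftext|>)
  ids.size     # => 77
  ids.count(0) # zero padding fills the remainder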
#get_pairs(word) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 46

def get_pairs(word)
  # Collect the set of adjacent symbol pairs in the word
  pairs = Set.new
  prev_char = word[0]
  word[1..-1].each do |char|
    pairs.add([ prev_char, char ])
    prev_char = char
  end
  pairs
end
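For example, the three-symbol array for "low" (with its end-of-word marker) yields two adjacent pairs:

  tokenizer = Clip::Tokenizer.new
  tokenizer.get_pairs(["l", "o", "w</w>"]).to_a
  # => [["l", "o"], ["o", "w</w>"]]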
#pad_array(array) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 139

def pad_array(array)
  # Right-pad with zeros up to INPUT_VECTOR_SIZE, then truncate to that length
  array.fill(0, array.length...INPUT_VECTOR_SIZE).first(INPUT_VECTOR_SIZE)
end
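Both directions in one sketch — short inputs are zero-padded, long ones are truncated:

  tokenizer = Clip::Tokenizer.new
  tokenizer.pad_array([1, 2, 3]).size     # => 77, with trailing zeros
  tokenizer.pad_array((1..100).to_a).size # => 77, extra ids dropped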
#whitespace_clean(text) ⇒ Object
# File 'lib/clip/tokenizer.rb', line 60

def whitespace_clean(text)
  # Collapse runs of whitespace to single spaces and trim the ends
  text.gsub(/\s+/, " ").strip
end
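For example:

  tokenizer = Clip::Tokenizer.new
  tokenizer.whitespace_clean("  a\tphoto\n of a cat ")
  # => "a photo of a cat"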