Module: ActiveSupport::Multibyte::Unicode

Extended by:
Unicode
Included in:
Unicode
Defined in:
lib/active_support/multibyte/unicode.rb

Constant Summary collapse

NORMALIZATION_FORMS =

A list of all available normalization forms. See www.unicode.org/reports/tr15/tr15-29.html for more information about normalization.

[:c, :kc, :d, :kd]
NORMALIZATION_FORM_ALIASES =

:nodoc:

{ # :nodoc:
  c: :nfc,
  d: :nfd,
  kc: :nfkc,
  kd: :nfkd
}
UNICODE_VERSION =

The Unicode version that is supported by the implementation

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#default_normalization_formObject

The default normalization used for operations that require normalization. It can be set to any of the normalizations in NORMALIZATION_FORMS.

ActiveSupport::Multibyte::Unicode.default_normalization_form = :c


28
29
30
# File 'lib/active_support/multibyte/unicode.rb', line 28

# Reader for the default normalization form used when a caller does not
# pass one explicitly. Returns whatever is currently stored in
# @default_normalization_form (nil when nothing has been assigned).
def default_normalization_form
  instance_variable_get(:@default_normalization_form)
end

Instance Method Details

#compose(codepoints) ⇒ Object

Compose decomposed characters to the composed form.



67
68
69
# File 'lib/active_support/multibyte/unicode.rb', line 67

# Recomposes a sequence of codepoints into its NFC (composed) form.
#
# codepoints - Array of Integer codepoints.
#
# Returns an Array of Integer codepoints of the NFC-normalized text.
def compose(codepoints)
  decomposed_text = codepoints.pack("U*")
  composed_text = decomposed_text.unicode_normalize(:nfc)
  composed_text.codepoints
end

#decompose(type, codepoints) ⇒ Object

Decompose composed characters to the decomposed form.



58
59
60
61
62
63
64
# File 'lib/active_support/multibyte/unicode.rb', line 58

# Decomposes a sequence of codepoints.
#
# type       - :compatibility selects NFKD; any other value selects NFD.
# codepoints - Array of Integer codepoints.
#
# Returns an Array of Integer codepoints of the decomposed text.
def decompose(type, codepoints)
  target_form = type == :compatibility ? :nfkd : :nfd
  codepoints.pack("U*").unicode_normalize(target_form).codepoints
end

#normalize(string, form = nil) ⇒ Object

Returns the KC normalization of the string by default. NFKC is considered the best normalization form for passing strings to databases and validations.

  • string - The string to perform normalization on.

  • form - The form you want to normalize in. Should be one of the following: :c, :kc, :d, or :kd. Default is ActiveSupport::Multibyte::Unicode.default_normalization_form.



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/active_support/multibyte/unicode.rb', line 118

# Deprecated wrapper around String#unicode_normalize.
#
# string - The string to normalize.
# form   - A key of NORMALIZATION_FORM_ALIASES; when nil, the value of
#          @default_normalization_form is used instead.
#
# Emits a deprecation warning on every call. Returns the normalized string.
# Raises ArgumentError when the form has no entry in
# NORMALIZATION_FORM_ALIASES.
def normalize(string, form = nil)
  requested_form = form || @default_normalization_form

  # See https://www.unicode.org/reports/tr15, Table 1
  unicode_form = NORMALIZATION_FORM_ALIASES[requested_form]

  if unicode_form
    notice = <<-MSG.squish
      ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
      removed from Rails 6.1. Use String#unicode_normalize(:#{unicode_form}) instead.
    MSG
    ActiveSupport::Deprecation.warn(notice)

    string.unicode_normalize(unicode_form)
  else
    notice = <<-MSG.squish
      ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
      removed from Rails 6.1. Use String#unicode_normalize instead.
    MSG
    ActiveSupport::Deprecation.warn(notice)

    # Pass caller so the backtrace points at the code that supplied the bad form.
    raise ArgumentError, "#{requested_form} is not a valid normalization variant", caller
  end
end

#pack_graphemes(unpacked) ⇒ Object

Reverse operation of unpack_graphemes.

Unicode.pack_graphemes(Unicode.unpack_graphemes('क्षि')) # => 'क्षि'


48
49
50
51
52
53
54
55
# File 'lib/active_support/multibyte/unicode.rb', line 48

# Deprecated: joins nested codepoint lists back into a single string.
#
# unpacked - Array of Arrays of Integer codepoints (nesting is flattened).
#
# Emits a deprecation warning, then returns the packed string.
def pack_graphemes(unpacked)
  notice = <<-MSG.squish
    ActiveSupport::Multibyte::Unicode#pack_graphemes is deprecated and will be
    removed from Rails 6.1. Use array.flatten.pack("U*") instead.
  MSG
  ActiveSupport::Deprecation.warn(notice)

  all_codepoints = unpacked.flatten
  all_codepoints.pack("U*")
end

#tidy_bytes(string, force = false) ⇒ Object

Replaces all ISO-8859-1 or CP1252 characters with their UTF-8 equivalents, resulting in a valid UTF-8 string.

Passing true will forcibly tidy all bytes, assuming that the string’s encoding is entirely CP1252 or ISO-8859-1.



78
79
80
81
82
# File 'lib/active_support/multibyte/unicode.rb', line 78

# Produces a string in which problematic bytes have been passed through
# recode_windows1252_chars.
#
# string - The string to tidy; an empty string is returned as-is.
# force  - When truthy, the whole string is recoded; otherwise only the
#          invalid byte sequences reported by String#scrub are recoded.
#
# Returns the tidied string.
def tidy_bytes(string, force = false)
  if string.empty?
    string
  elsif force
    recode_windows1252_chars(string)
  else
    string.scrub { |invalid_bytes| recode_windows1252_chars(invalid_bytes) }
  end
end

#unpack_graphemes(string) ⇒ Object

Unpacks the string at grapheme boundaries. Returns a list of codepoint lists, one per grapheme cluster.

Unicode.unpack_graphemes('क्षि') # => [[2325, 2381], [2359], [2367]]
Unicode.unpack_graphemes('Café') # => [[67], [97], [102], [233]]


36
37
38
39
40
41
42
43
# File 'lib/active_support/multibyte/unicode.rb', line 36

# Deprecated: splits a string into grapheme clusters and returns the
# codepoints of each cluster.
#
# string - The string to split.
#
# Emits a deprecation warning, then returns an Array of Arrays of Integer
# codepoints, one inner Array per match of /\X/.
def unpack_graphemes(string)
  # The heredoc is double-quoted, so the backslash must be escaped; a bare
  # \X would collapse to "X" and the warning would recommend scan(/X/).
  ActiveSupport::Deprecation.warn(<<-MSG.squish)
    ActiveSupport::Multibyte::Unicode#unpack_graphemes is deprecated and will be
    removed from Rails 6.1. Use string.scan(/\\X/).map(&:codepoints) instead.
  MSG

  string.scan(/\X/).map(&:codepoints)
end