Class: CharacterSet

Inherits:
Object
  • Object
show all
Extended by:
CommonSets
Includes:
SetMethodAdapters, SharedMethods, Enumerable
Defined in:
lib/character_set/pure.rb,
lib/character_set.rb,
lib/character_set/parser.rb,
lib/character_set/writer.rb,
lib/character_set/version.rb,
lib/character_set/character.rb,
lib/character_set/common_sets.rb,
lib/character_set/ruby_fallback.rb,
lib/character_set/shared_methods.rb,
lib/character_set/core_ext/regexp_ext.rb,
lib/character_set/core_ext/string_ext.rb,
lib/character_set/set_method_adapters.rb,
lib/character_set/expression_converter.rb,
lib/character_set/ruby_fallback/set_methods.rb,
lib/character_set/ruby_fallback/plane_methods.rb,
lib/character_set/ruby_fallback/character_set_methods.rb,
ext/character_set/character_set.c

Overview

Various methods shared by the pure-Ruby and the extended implementation.

Many of these methods are hotspots, so they are defined directly on the including classes for better performance.

Defined Under Namespace

Modules: CommonSets, CoreExt, ExpressionConverter, Parser, RubyFallback, SetMethodAdapters, SharedMethods, Writer

Classes: Character, Pure

Constant Summary collapse

VERSION =
'1.0.0'

Class Method Summary collapse

Instance Method Summary collapse

Methods included from CommonSets

ascii, bmp, crypt, emoji, method_missing, newline, respond_to_missing?, unicode, url_fragment, url_host, url_path, url_query, whitespace

Methods included from SharedMethods

included

Class Method Details

.from_ranges(ranges) ⇒ Object

******************************



407
408
409
410
411
412
413
414
415
416
# File 'ext/character_set/character_set.c', line 407

// Implements CharacterSet.from_ranges(ranges).
//
// Creates a new instance of the receiving class (no constructor arguments)
// and passes every element of `ranges` to merge_rb_range() for that instance.
//
// ranges - a Ruby Array; its elements are handed to merge_rb_range() as-is.
//
// Returns the new instance.
// Raises TypeError (through Check_Type) when `ranges` is not an Array.
static VALUE
class_method_from_ranges(VALUE self, VALUE ranges) {
  VALUE new_set;
  long range_count, i;
  // RARRAY_LEN / RARRAY_AREF must only be applied to a T_ARRAY, so reject
  // every other argument type before touching it.
  Check_Type(ranges, T_ARRAY);
  new_set = rb_class_new_instance(0, 0, self);
  range_count = RARRAY_LEN(ranges);
  for (i = 0; i < range_count; i++) {
    merge_rb_range(new_set, RARRAY_AREF(ranges, i));
  }
  return new_set;
}

.of(str) ⇒ Object



635
636
637
638
639
640
641
642
# File 'ext/character_set/character_set.c', line 635

// Implements CharacterSet.of(str).
//
// Allocates a zeroed array of UNICODE_BYTES cp_byte elements, lets each_cp()
// run add_str_cp_to_arr over `str` with that array, and wraps the array in a
// new instance of the receiving class via NEW_CHARACTER_SET.
//
// str - validated by raise_arg_err_unless_string() before anything else.
//
// Raises NoMemoryError (through rb_memerror) when the allocation fails.
// NOTE(review): if each_cp() raises, cp_arr does not appear to be released
// anywhere visible here — confirm whether that path can leak.
static VALUE
class_method_of(VALUE self, VALUE str) {
  cp_byte *cp_arr;
  raise_arg_err_unless_string(str);
  cp_arr = calloc(UNICODE_BYTES, sizeof(cp_byte));
  // calloc may fail; never hand a NULL buffer to the codepoint callback.
  if (cp_arr == NULL) {
    rb_memerror();
  }
  each_cp(str, add_str_cp_to_arr, cp_arr);
  return NEW_CHARACTER_SET(self, cp_arr);
}

Instance Method Details

#&(other) ⇒ Object



175
176
177
178
# File 'ext/character_set/character_set.c', line 175

// Implements #& and #intersection.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them for `self`, `other` and the codepoint
// being visited — TODO confirm against the macro definition.
// The predicate handed to the macro holds when the bit for `cp` tests as set
// in both `a` and `b`.
static VALUE
method_intersection(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
}

#+(other) ⇒ Object



185
186
187
188
# File 'ext/character_set/character_set.c', line 185

// Implements #+, #| and #union.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them for `self`, `other` and the codepoint
// being visited — TODO confirm against the macro definition.
// The predicate handed to the macro holds when the bit for `cp` tests as set
// in `a`, in `b`, or in both.
static VALUE
method_union(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
}

#-(other) ⇒ Object



190
191
192
193
# File 'ext/character_set/character_set.c', line 190

// Implements #- and #difference.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them — TODO confirm.
// The predicate compares the two bit tests with `>`, i.e. it holds only when
// the test on `a` yields a larger value than the test on `b`.
// NOTE(review): this reads as "set in a, clear in b" provided TSTBIT yields 0
// for a clear bit and the same non-zero value for a given cp in both arrays —
// confirm against the TSTBIT definition.
static VALUE
method_difference(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
}

#<(other) ⇒ Object



383
384
385
386
387
388
# File 'ext/character_set/character_set.c', line 383

// Implements #< and #proper_subset?.
//
// Asks a_subset_of_b(self, other, ...) and answers Qtrue only when that call
// reports success and also sets its "proper" out-flag to a non-zero value.
static VALUE
method_proper_subset_p(VALUE self, VALUE other) {
  int strictly_smaller;
  if (!a_subset_of_b(self, other, &strictly_smaller)) {
    return Qfalse;
  }
  // The out-flag is only consulted once the subset test itself succeeded.
  return strictly_smaller ? Qtrue : Qfalse;
}

#<<(cp_num) ⇒ Object



219
220
221
222
# File 'ext/character_set/character_set.c', line 219

// Implements #<< and #add.
//
// Delegates to toggle_codepoint(self, cp_num, 1, 0) and maps a non-zero
// result to `self`, a zero result to nil.
// NOTE(review): judging by the sibling callers, the third argument selects
// add (1) vs. delete (0) and the fourth distinguishes the `?` variants —
// confirm against toggle_codepoint().
static VALUE
method_add(VALUE self, VALUE cp_num) {
  if (toggle_codepoint(self, cp_num, 1, 0)) {
    return self;
  }
  return Qnil;
}

#<=(other) ⇒ Object



377
378
379
380
381
# File 'ext/character_set/character_set.c', line 377

// Implements #<= and #subset?.
//
// Returns Qtrue when a_subset_of_b(self, other, ...) reports success, Qfalse
// otherwise. The helper's "proper" out-flag is required by its signature but
// is not used here.
static VALUE
method_subset_p(VALUE self, VALUE other) {
  int ignored_proper_flag;
  if (a_subset_of_b(self, other, &ignored_proper_flag)) {
    return Qtrue;
  }
  return Qfalse;
}

#==(other) ⇒ Object



262
263
264
265
266
267
268
269
270
# File 'ext/character_set/character_set.c', line 262

// Implements #== and #eql?.
//
// Returns Qfalse when `other` fails is_character_set(), Qtrue when `other` is
// the very same object, and otherwise compares the two sets bit by bit.
// `cps`, `other_cps` and `cp` are not declared in this function; presumably
// the COMPARE_SETS macro (defined elsewhere in this file) introduces them and
// runs the given statement for each codepoint — TODO confirm.
static VALUE
method_eql_p(VALUE self, VALUE other) {
  if (!is_character_set(other)) return Qfalse;
  if (self == other) return Qtrue; // same object_id

  // Bail out at the first codepoint whose bit test differs between the sets.
  COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);

  return Qtrue;
}

#===(num) ⇒ Object



195
196
197
198
199
200
# File 'ext/character_set/character_set.c', line 195

// Implements #===, #include? and #member?.
//
// num - the codepoint number to look up.
//
// Returns Qtrue when the bit for `num` is set in this set's codepoint array,
// Qfalse otherwise. Values that are not Fixnums, are negative, or are not
// below UNICODE_CP_COUNT can never be members and yield Qfalse without
// touching the array (previously they were used as an index unchecked).
static VALUE
method_include_p(VALUE self, VALUE num) {
  cp_byte *cps;
  unsigned long cp;
  FETCH_CODEPOINTS(self, cps);
  // FIX2ULONG is only meaningful for Fixnums; reject everything else first.
  if (!FIXNUM_P(num) || FIX2LONG(num) < 0) {
    return Qfalse;
  }
  cp = FIX2ULONG(num);
  // Guard the bit lookup: indexes at or beyond UNICODE_CP_COUNT are outside
  // the range of codepoints this file iterates over.
  if (cp >= UNICODE_CP_COUNT) {
    return Qfalse;
  }
  return (TSTBIT(cps, cp) ? Qtrue : Qfalse);
}

#>(other) ⇒ Object



396
397
398
399
400
401
# File 'ext/character_set/character_set.c', line 396

// Implements #> and #proper_superset?.
//
// Same test as the proper-subset check with the operands swapped: asks
// a_subset_of_b(other, self, ...) and answers Qtrue only when that call
// reports success and also sets its "proper" out-flag to a non-zero value.
static VALUE
method_proper_superset_p(VALUE self, VALUE other) {
  int strictly_larger;
  if (!a_subset_of_b(other, self, &strictly_larger)) {
    return Qfalse;
  }
  // The out-flag is only consulted once the subset test itself succeeded.
  return strictly_larger ? Qtrue : Qfalse;
}

#>=(other) ⇒ Object



390
391
392
393
394
# File 'ext/character_set/character_set.c', line 390

// Implements #>= and #superset?.
//
// Returns Qtrue when a_subset_of_b(other, self, ...) reports success, Qfalse
// otherwise. The helper's "proper" out-flag is required by its signature but
// is not used here.
static VALUE
method_superset_p(VALUE self, VALUE other) {
  int ignored_proper_flag;
  if (a_subset_of_b(other, self, &ignored_proper_flag)) {
    return Qtrue;
  }
  return Qfalse;
}

#^(other) ⇒ Object



180
181
182
183
# File 'ext/character_set/character_set.c', line 180

// Implements #^.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them — TODO confirm.
// The predicate is the bitwise XOR of the two bit tests.
// NOTE(review): this reads as "set in exactly one of a and b" provided TSTBIT
// yields 0 for a clear bit and the same non-zero value for a given cp in both
// arrays — confirm against the TSTBIT definition.
static VALUE
method_exclusion(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) ^ TSTBIT(b, cp));
}

#add(cp_num) ⇒ Object



219
220
221
222
# File 'ext/character_set/character_set.c', line 219

// Implements #<< and #add.
//
// Delegates to toggle_codepoint(self, cp_num, 1, 0) and maps a non-zero
// result to `self`, a zero result to nil.
// NOTE(review): judging by the sibling callers, the third argument selects
// add (1) vs. delete (0) and the fourth distinguishes the `?` variants —
// confirm against toggle_codepoint().
static VALUE
method_add(VALUE self, VALUE cp_num) {
  if (toggle_codepoint(self, cp_num, 1, 0)) {
    return self;
  }
  return Qnil;
}

#add?(cp_num) ⇒ Boolean

Returns:

  • (Boolean)


224
225
226
227
# File 'ext/character_set/character_set.c', line 224

// Implements #add?.
//
// Delegates to toggle_codepoint(self, cp_num, 1, 1) and maps a non-zero
// result to `self`, a zero result to nil.
// NOTE(review): the trailing 1 is the only difference from #add; presumably
// it makes the helper report "no change" — confirm against toggle_codepoint().
static VALUE
method_add_p(VALUE self, VALUE cp_num) {
  if (toggle_codepoint(self, cp_num, 1, 1)) {
    return self;
  }
  return Qnil;
}

#astral_part ⇒ Object



476
477
478
479
# File 'ext/character_set/character_set.c', line 476

// Implements #astral_part.
//
// Returns new_set_from_section() for the span that starts at
// UNICODE_PLANE_SIZE and ends at UNICODE_CP_COUNT - 1.
static VALUE
method_astral_part(VALUE self) {
  cp_index first_cp = UNICODE_PLANE_SIZE;
  cp_index last_cp = UNICODE_CP_COUNT - 1;
  return new_set_from_section(self, first_cp, last_cp);
}

#bmp_part ⇒ Object



471
472
473
474
# File 'ext/character_set/character_set.c', line 471

// Implements #bmp_part.
//
// Returns new_set_from_section() for the span that starts at 0 and ends at
// UNICODE_PLANE_SIZE - 1.
static VALUE
method_bmp_part(VALUE self) {
  cp_index first_cp = 0;
  cp_index last_cp = UNICODE_PLANE_SIZE - 1;
  return new_set_from_section(self, first_cp, last_cp);
}

#case_insensitive ⇒ Object



545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
# File 'ext/character_set/character_set.c', line 545

// Implements #case_insensitive.
//
// Builds a new set of the receiver's class that contains every codepoint of
// the receiver plus, for each entry of unicode_casefold_table, the entry's
// counterpart: `to` is added when `from` is a member, otherwise `from` is
// added when `to` is a member.
//
// `cps` and `cp` are not declared in this function; presumably the
// FOR_EACH_ACTIVE_CODEPOINT macro introduces them for the receiver's bit
// array and the codepoint being visited — TODO confirm.
//
// Raises NoMemoryError (through rb_memerror) when the allocation fails.
//
// Design note kept from the original source: an alternative to the lookup
// table would be the UTF-8 encoding's case_map callback
// (args: flags, pp, end, to, to_end, enc) with
// ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE, i.e. (1<<13) | (1<<14), but
// those flags are not public on ruby < 2.4.
static VALUE
method_case_insensitive(VALUE self) {
  cp_index i;
  cp_byte *new_cps;

  new_cps = calloc(UNICODE_BYTES, sizeof(cp_byte));
  // calloc may fail; never set bits through a NULL buffer.
  if (new_cps == NULL) {
    rb_memerror();
  }

  // Start from a copy of the receiver's members.
  FOR_EACH_ACTIVE_CODEPOINT(SETBIT(new_cps, cp));

  for (i = 0; i < CASEFOLD_COUNT; i++) {
    casefold_mapping m = unicode_casefold_table[i];

    if      (TSTBIT(cps, m.from)) { SETBIT(new_cps, m.to); }
    else if (TSTBIT(cps, m.to)) { SETBIT(new_cps, m.from); }
  }

  return NEW_CHARACTER_SET(RBASIC(self)->klass, new_cps);
}

#clear ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
# File 'ext/character_set/character_set.c', line 152

// Implements #clear.
//
// Clears the bit of every codepoint from 0 up to (but excluding)
// UNICODE_CP_COUNT in the receiver's codepoint array and returns the receiver.
// Raises (through rb_check_frozen) when the receiver is frozen.
static VALUE
method_clear(VALUE self) {
  cp_byte *bits;
  cp_index idx = 0;
  rb_check_frozen(self);
  FETCH_CODEPOINTS(self, bits);
  while (idx < UNICODE_CP_COUNT) {
    CLRBIT(bits, idx);
    idx++;
  }
  return self;
}

#count ⇒ Object



72
73
74
75
# File 'ext/character_set/character_set.c', line 72

// Implements #count, #length and #size.
//
// Returns whatever enumerator_length(self, 0, 0) reports; that helper is also
// the size callback handed to RETURN_SIZED_ENUMERATOR elsewhere in this file.
static VALUE
method_length(VALUE self) {
  VALUE member_count = enumerator_length(self, 0, 0);
  return member_count;
}

#cover?(str) ⇒ Boolean

Returns:

  • (Boolean)


664
665
666
667
668
669
670
# File 'ext/character_set/character_set.c', line 664

// Implements #cover?.
//
// Validates `str` with raise_arg_err_unless_string(), then returns the result
// of running each_cp() over `str` with the str_cp_in_arr callback and the
// receiver's codepoint array.
static VALUE
method_cover_p(VALUE self, VALUE str) {
  cp_byte *member_bits;
  VALUE verdict;
  raise_arg_err_unless_string(str);
  FETCH_CODEPOINTS(self, member_bits);
  verdict = each_cp(str, str_cp_in_arr, member_bits);
  return verdict;
}

#delete(cp_num) ⇒ Object



229
230
231
232
# File 'ext/character_set/character_set.c', line 229

// Implements #delete.
//
// Delegates to toggle_codepoint(self, cp_num, 0, 0) and maps a non-zero
// result to `self`, a zero result to nil.
// NOTE(review): judging by the sibling callers, the third argument selects
// add (1) vs. delete (0) and the fourth distinguishes the `?` variants —
// confirm against toggle_codepoint().
static VALUE
method_delete(VALUE self, VALUE cp_num) {
  if (toggle_codepoint(self, cp_num, 0, 0)) {
    return self;
  }
  return Qnil;
}

#delete?(cp_num) ⇒ Boolean

Returns:

  • (Boolean)


234
235
236
237
# File 'ext/character_set/character_set.c', line 234

// Implements #delete?.
//
// Delegates to toggle_codepoint(self, cp_num, 0, 1) and maps a non-zero
// result to `self`, a zero result to nil.
// NOTE(review): the trailing 1 is the only difference from #delete; presumably
// it makes the helper report "no change" — confirm against toggle_codepoint().
static VALUE
method_delete_p(VALUE self, VALUE cp_num) {
  if (toggle_codepoint(self, cp_num, 0, 1)) {
    return self;
  }
  return Qnil;
}

#delete_if ⇒ Object



140
141
142
143
144
# File 'ext/character_set/character_set.c', line 140

// Implements #delete_if.
//
// RETURN_SIZED_ENUMERATOR runs first, with enumerator_length as the size
// callback; past that point the work is done by
// delete_if_block_result(self, 1).
// NOTE(review): #keep_if passes 0 where this passes 1; presumably the flag
// selects which block result causes deletion — confirm against the helper.
static VALUE
method_delete_if(VALUE self) {
  VALUE outcome;
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  outcome = delete_if_block_result(self, 1);
  return outcome;
}

#delete_in(str) ⇒ Object



719
720
721
722
# File 'ext/character_set/character_set.c', line 719

// Implements #delete_in.
//
// Thin wrapper: returns apply_to_str(self, str, 1, 0).
// NOTE(review): across the four sibling wrappers the third argument is 1 for
// the delete_* methods and 0 for keep_*, the fourth is 1 for the `!` variants
// — confirm the exact meaning against apply_to_str().
static VALUE
method_delete_in(VALUE self, VALUE str) {
  VALUE outcome = apply_to_str(self, str, 1, 0);
  return outcome;
}

#delete_in!(str) ⇒ Object



724
725
726
727
# File 'ext/character_set/character_set.c', line 724

// Implements #delete_in!.
//
// Thin wrapper: returns apply_to_str(self, str, 1, 1).
// NOTE(review): across the four sibling wrappers the third argument is 1 for
// the delete_* methods and 0 for keep_*, the fourth is 1 for the `!` variants
// — confirm the exact meaning against apply_to_str().
static VALUE
method_delete_in_bang(VALUE self, VALUE str) {
  VALUE outcome = apply_to_str(self, str, 1, 1);
  return outcome;
}

#difference(other) ⇒ Object



190
191
192
193
# File 'ext/character_set/character_set.c', line 190

// Implements #- and #difference.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them — TODO confirm.
// The predicate compares the two bit tests with `>`, i.e. it holds only when
// the test on `a` yields a larger value than the test on `b`.
// NOTE(review): this reads as "set in a, clear in b" provided TSTBIT yields 0
// for a clear bit and the same non-zero value for a given cp in both arrays —
// confirm against the TSTBIT definition.
static VALUE
method_difference(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) > TSTBIT(b, cp));
}

#disjoint?(other) ⇒ Boolean

Returns:

  • (Boolean)


252
253
254
255
# File 'ext/character_set/character_set.c', line 252

// Implements #disjoint?.
//
// Negates method_intersect_p(): Qtrue when that call returns Qfalse, Qfalse
// otherwise.
static VALUE
method_disjoint_p(VALUE self, VALUE other) {
  // method_intersect_p() returns a Ruby VALUE, so compare it against Qfalse
  // explicitly instead of relying on the numeric representation of Qfalse
  // when using it as a C truth value.
  return method_intersect_p(self, other) == Qfalse ? Qtrue : Qfalse;
}

#each ⇒ Object

Set compatibility methods



77
78
79
80
81
82
# File 'ext/character_set/character_set.c', line 77

// Implements #each (first of the Set compatibility methods).
//
// RETURN_SIZED_ENUMERATOR runs first, with enumerator_length as the size
// callback. Otherwise every active codepoint is yielded to the block as a
// Fixnum and the receiver is returned.
// `cp` is not declared in this function; presumably the
// FOR_EACH_ACTIVE_CODEPOINT macro (defined elsewhere in this file) introduces
// it and runs the given statement once per member — TODO confirm.
static VALUE
method_each(VALUE self) {
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  FOR_EACH_ACTIVE_CODEPOINT(rb_yield(LONG2FIX(cp)));
  return self;
}

#empty? ⇒ Boolean

Returns:

  • (Boolean)


104
105
106
107
108
# File 'ext/character_set/character_set.c', line 104

// Implements #empty?.
//
// Returns Qfalse from inside the FOR_EACH_ACTIVE_CODEPOINT statement, i.e. as
// soon as the macro runs it once, and Qtrue when the macro finishes without
// running it. Presumably the macro runs the statement once per member
// codepoint — TODO confirm against its definition.
static VALUE
method_empty_p(VALUE self) {
  FOR_EACH_ACTIVE_CODEPOINT(return Qfalse);
  return Qtrue;
}

#eql?(other) ⇒ Boolean

Returns:

  • (Boolean)


262
263
264
265
266
267
268
269
270
# File 'ext/character_set/character_set.c', line 262

// Implements #== and #eql?.
//
// Returns Qfalse when `other` fails is_character_set(), Qtrue when `other` is
// the very same object, and otherwise compares the two sets bit by bit.
// `cps`, `other_cps` and `cp` are not declared in this function; presumably
// the COMPARE_SETS macro (defined elsewhere in this file) introduces them and
// runs the given statement for each codepoint — TODO confirm.
static VALUE
method_eql_p(VALUE self, VALUE other) {
  if (!is_character_set(other)) return Qfalse;
  if (self == other) return Qtrue; // same object_id

  // Bail out at the first codepoint whose bit test differs between the sets.
  COMPARE_SETS(if (TSTBIT(cps, cp) != TSTBIT(other_cps, cp)) return Qfalse);

  return Qtrue;
}

#ext_inversion(*args) ⇒ Object



518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
# File 'ext/character_set/character_set.c', line 518

// Implements #ext_inversion(*args).
//
// Accepts 0 to 2 arguments (enforced by rb_check_arity):
//   argv[0] - surrogates are included only when this is exactly Qtrue.
//   argv[1] - when it is a Fixnum, it is used as an inclusive upper bound
//             (`cp <= upto`) for the codepoints put into the result.
//
// Both exits go through RETURN_NEW_SET_BASED_ON (defined elsewhere in this
// file); the predicate selects codepoints whose bit is NOT set in `a` and
// which are either non-surrogates or explicitly allowed via argv[0].
// `a` and `cp` are not declared here; presumably the macro introduces them —
// TODO confirm.
static VALUE
method_ext_inversion(int argc, VALUE *argv, VALUE self) {
  int include_surrogates;
  cp_index upto;
  VALUE other;
  // NOTE(review): `other` is never read in this function body; presumably
  // RETURN_NEW_SET_BASED_ON refers to a variable of that name — confirm.
  other = 0;
  rb_check_arity(argc, 0, 2);
  include_surrogates = ((argc > 0) && (argv[0] == Qtrue));
  if ((argc > 1) && FIXNUM_P(argv[1])) {
    upto = FIX2ULONG(argv[1]);
    RETURN_NEW_SET_BASED_ON(
      cp <= upto && !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
    );
  }
  // No usable upper bound was given: same predicate without the `upto` limit.
  RETURN_NEW_SET_BASED_ON(
    !TSTBIT(a, cp) && (include_surrogates || NON_SURROGATE(cp))
  );
}

#hash ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'ext/character_set/character_set.c', line 110

// Implements #hash.
//
// Walks the codepoints in groups of 32, counts the set bits of each group and
// folds the count of the previous group into the running value with
// `hash = hash * 23 + count`, starting from 17. The result is returned as a
// Fixnum.
// NOTE(review): the count of the final group is computed but never folded in,
// so members in the last 32 codepoints do not influence the hash. This
// matches the long-standing behaviour and is kept as-is.
static VALUE
method_hash(VALUE self) {
  cp_index group_start, cp, hash, bits_in_group;
  cp_byte *cps;
  FETCH_CODEPOINTS(self, cps);

  hash = 17;
  for (group_start = 0; group_start < UNICODE_CP_COUNT; group_start += 32) {
    // Fold the previous group before starting a new one; the very first
    // group has no predecessor.
    if (group_start != 0) { hash = hash * 23 + bits_in_group; }
    bits_in_group = 0;
    for (cp = group_start; cp < group_start + 32 && cp < UNICODE_CP_COUNT; cp++) {
      if (TSTBIT(cps, cp)) bits_in_group++;
    }
  }

  return LONG2FIX(hash);
}

#include?(num) ⇒ Boolean

Returns:

  • (Boolean)


195
196
197
198
199
200
# File 'ext/character_set/character_set.c', line 195

// Implements #===, #include? and #member?.
//
// num - the codepoint number to look up.
//
// Returns Qtrue when the bit for `num` is set in this set's codepoint array,
// Qfalse otherwise. Values that are not Fixnums, are negative, or are not
// below UNICODE_CP_COUNT can never be members and yield Qfalse without
// touching the array (previously they were used as an index unchecked).
static VALUE
method_include_p(VALUE self, VALUE num) {
  cp_byte *cps;
  unsigned long cp;
  FETCH_CODEPOINTS(self, cps);
  // FIX2ULONG is only meaningful for Fixnums; reject everything else first.
  if (!FIXNUM_P(num) || FIX2LONG(num) < 0) {
    return Qfalse;
  }
  cp = FIX2ULONG(num);
  // Guard the bit lookup: indexes at or beyond UNICODE_CP_COUNT are outside
  // the range of codepoints this file iterates over.
  if (cp >= UNICODE_CP_COUNT) {
    return Qfalse;
  }
  return (TSTBIT(cps, cp) ? Qtrue : Qfalse);
}

#initialize_clone(other) ⇒ Object



335
336
337
338
339
# File 'ext/character_set/character_set.c', line 335

// Implements #initialize_clone and #initialize_dup.
//
// Merges the members of `other` into the receiver through
// merge_character_set() and returns `other`.
static VALUE
method_initialize_copy(VALUE self, VALUE other) {
  // Only the side effect on `self` matters; the helper's result is unused.
  (void)merge_character_set(self, other);
  return other;
}

#initialize_dup(other) ⇒ Object



335
336
337
338
339
# File 'ext/character_set/character_set.c', line 335

// Implements #initialize_clone and #initialize_dup.
//
// Merges the members of `other` into the receiver through
// merge_character_set() and returns `other`.
static VALUE
method_initialize_copy(VALUE self, VALUE other) {
  // Only the side effect on `self` matters; the helper's result is unused.
  (void)merge_character_set(self, other);
  return other;
}

#intersect?(other) ⇒ Boolean

Returns:

  • (Boolean)


246
247
248
249
250
# File 'ext/character_set/character_set.c', line 246

// Implements #intersect?.
//
// Returns Qtrue as soon as a codepoint tests as set in both sets, Qfalse when
// no such codepoint is found.
// `cps`, `other_cps` and `cp` are not declared in this function; presumably
// the COMPARE_SETS macro (defined elsewhere in this file) introduces them and
// runs the given statement for each codepoint — TODO confirm.
static VALUE
method_intersect_p(VALUE self, VALUE other) {
  COMPARE_SETS(if (TSTBIT(cps, cp) && TSTBIT(other_cps, cp)) return Qtrue);
  return Qfalse;
}

#intersection(other) ⇒ Object



175
176
177
178
# File 'ext/character_set/character_set.c', line 175

// Implements #& and #intersection.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them for `self`, `other` and the codepoint
// being visited — TODO confirm against the macro definition.
// The predicate handed to the macro holds when the bit for `cp` tests as set
// in both `a` and `b`.
static VALUE
method_intersection(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) && TSTBIT(b, cp));
}

#keep_if ⇒ Object



146
147
148
149
150
# File 'ext/character_set/character_set.c', line 146

// Implements #keep_if.
//
// RETURN_SIZED_ENUMERATOR runs first, with enumerator_length as the size
// callback; past that point the work is done by
// delete_if_block_result(self, 0).
// NOTE(review): #delete_if passes 1 where this passes 0; presumably the flag
// selects which block result causes deletion — confirm against the helper.
static VALUE
method_keep_if(VALUE self) {
  VALUE outcome;
  RETURN_SIZED_ENUMERATOR(self, 0, 0, enumerator_length);
  outcome = delete_if_block_result(self, 0);
  return outcome;
}

#keep_in(str) ⇒ Object



729
730
731
732
# File 'ext/character_set/character_set.c', line 729

// Implements #keep_in.
//
// Thin wrapper: returns apply_to_str(self, str, 0, 0).
// NOTE(review): across the four sibling wrappers the third argument is 1 for
// the delete_* methods and 0 for keep_*, the fourth is 1 for the `!` variants
// — confirm the exact meaning against apply_to_str().
static VALUE
method_keep_in(VALUE self, VALUE str) {
  VALUE outcome = apply_to_str(self, str, 0, 0);
  return outcome;
}

#keep_in!(str) ⇒ Object



734
735
736
737
# File 'ext/character_set/character_set.c', line 734

// Implements #keep_in!.
//
// Thin wrapper: returns apply_to_str(self, str, 0, 1).
// NOTE(review): across the four sibling wrappers the third argument is 1 for
// the delete_* methods and 0 for keep_*, the fourth is 1 for the `!` variants
// — confirm the exact meaning against apply_to_str().
static VALUE
method_keep_in_bang(VALUE self, VALUE str) {
  VALUE outcome = apply_to_str(self, str, 0, 1);
  return outcome;
}

#length ⇒ Object



72
73
74
75
# File 'ext/character_set/character_set.c', line 72

// Implements #count, #length and #size.
//
// Returns whatever enumerator_length(self, 0, 0) reports; that helper is also
// the size callback handed to RETURN_SIZED_ENUMERATOR elsewhere in this file.
static VALUE
method_length(VALUE self) {
  VALUE member_count = enumerator_length(self, 0, 0);
  return member_count;
}

#member?(num) ⇒ Boolean

Returns:

  • (Boolean)


195
196
197
198
199
200
# File 'ext/character_set/character_set.c', line 195

// Implements #===, #include? and #member?.
//
// num - the codepoint number to look up.
//
// Returns Qtrue when the bit for `num` is set in this set's codepoint array,
// Qfalse otherwise. Values that are not Fixnums, are negative, or are not
// below UNICODE_CP_COUNT can never be members and yield Qfalse without
// touching the array (previously they were used as an index unchecked).
static VALUE
method_include_p(VALUE self, VALUE num) {
  cp_byte *cps;
  unsigned long cp;
  FETCH_CODEPOINTS(self, cps);
  // FIX2ULONG is only meaningful for Fixnums; reject everything else first.
  if (!FIXNUM_P(num) || FIX2LONG(num) < 0) {
    return Qfalse;
  }
  cp = FIX2ULONG(num);
  // Guard the bit lookup: indexes at or beyond UNICODE_CP_COUNT are outside
  // the range of codepoints this file iterates over.
  if (cp >= UNICODE_CP_COUNT) {
    return Qfalse;
  }
  return (TSTBIT(cps, cp) ? Qtrue : Qfalse);
}

#member_in_plane?(plane_num) ⇒ Boolean

Returns:

  • (Boolean)


505
506
507
508
509
510
511
512
513
514
# File 'ext/character_set/character_set.c', line 505

// Implements #member_in_plane?.
//
// plane_num - must be a Fixnum (Check_Type raises otherwise) whose value is
//             at least 0 and below UNICODE_PLANE_COUNT.
//
// Returns the result of set_has_member_in_plane() for that plane.
// Raises ArgumentError when the plane number is out of range.
static VALUE
method_member_in_plane_p(VALUE self, VALUE plane_num) {
  int plane;
  Check_Type(plane_num, T_FIXNUM);
  plane = FIX2INT(plane_num);
  if (!(plane >= 0 && plane < UNICODE_PLANE_COUNT)) {
    rb_raise(rb_eArgError, "plane must be between 0 and 16");
  }
  return set_has_member_in_plane(self, plane);
}

#merge(other) ⇒ Object



323
324
325
326
327
328
329
330
331
332
333
# File 'ext/character_set/character_set.c', line 323

// Implements #merge.
//
// Dispatches on the kind of `other`:
//   - passes is_character_set()  -> merge_character_set()
//   - is a T_ARRAY               -> merge_rb_array()
//   - anything else              -> merge_rb_range()
// and returns what the chosen helper returns.
// Raises (through rb_check_frozen) when the receiver is frozen.
static VALUE
method_merge(VALUE self, VALUE other) {
  rb_check_frozen(self);
  if (is_character_set(other)) {
    return merge_character_set(self, other);
  }
  if (TYPE(other) != T_ARRAY) {
    return merge_rb_range(self, other);
  }
  return merge_rb_array(self, other);
}

#planes ⇒ Object



494
495
496
497
498
499
500
501
502
503
# File 'ext/character_set/character_set.c', line 494

// Implements #planes.
//
// Returns a new Ruby Array holding, in ascending order, the index of every
// plane below UNICODE_PLANE_COUNT for which set_has_member_in_plane() gives a
// non-zero result.
static VALUE
method_planes(VALUE self) {
  VALUE occupied;
  unsigned int plane;
  occupied = rb_ary_new();
  plane = 0;
  while (plane < UNICODE_PLANE_COUNT) {
    if (set_has_member_in_plane(self, plane)) {
      rb_ary_push(occupied, INT2FIX(plane));
    }
    plane++;
  }
  return occupied;
}

#proper_subset?(other) ⇒ Boolean

Returns:

  • (Boolean)


383
384
385
386
387
388
# File 'ext/character_set/character_set.c', line 383

static VALUE
method_proper_subset_p(VALUE self, VALUE other) {
  int is, is_proper;
  is = a_subset_of_b(self, other, &is_proper);
  return (is && is_proper) ? Qtrue : Qfalse;
}

#proper_superset?(other) ⇒ Boolean

Returns:

  • (Boolean)


396
397
398
399
400
401
# File 'ext/character_set/character_set.c', line 396

static VALUE
method_proper_superset_p(VALUE self, VALUE other) {
  int is, is_proper;
  is = a_subset_of_b(other, self, &is_proper);
  return (is && is_proper) ? Qtrue : Qfalse;
}

#ranges ⇒ Object



418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
# File 'ext/character_set/character_set.c', line 418

// Implements #ranges.
//
// Returns a new Ruby Array of inclusive Ranges (rb_range_new(..., 0)), one per
// run of codepoints that the loop below treats as contiguous.
//
// All bookkeeping variables hold Ruby VALUEs produced by LONG2FIX, and 0 is
// used as the "nothing seen yet" marker for previous_codepoint and
// current_start.
// `cp` is not declared in this function; presumably the
// FOR_EACH_ACTIVE_CODEPOINT macro (defined elsewhere in this file) introduces
// it and runs the given statements once per member — TODO confirm.
static VALUE
method_ranges(VALUE self) {
  VALUE ranges, codepoint, previous_codepoint, current_start, current_end;

  ranges = rb_ary_new();
  previous_codepoint = 0;
  current_start = 0;
  current_end = 0;

  FOR_EACH_ACTIVE_CODEPOINT(
    codepoint = LONG2FIX(cp);

    if (!previous_codepoint) {
      // first member seen: it opens the first range
      current_start = codepoint;
    }
    // The comparison works on the tagged VALUEs directly; presumably it
    // relies on consecutive Fixnums differing by 2 in their VALUE
    // representation — confirm against the Ruby headers in use.
    else if (previous_codepoint + 2 != codepoint) {
      // gap found, finalize previous range
      rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
      current_start = codepoint;
    }
    current_end = codepoint;
    previous_codepoint = codepoint;
  );

  // add final range
  if (current_start) {
    rb_ary_push(ranges, rb_range_new(current_start, current_end, 0));
  }

  return ranges;
}

#sample(*args) ⇒ Object



450
451
452
453
454
455
456
457
# File 'ext/character_set/character_set.c', line 450

// Implements #sample(*args).
//
// Converts the set with method_to_a(true) and forwards to that Array's
// `sample` method, passing the optional single argument along when given.
// At most one argument is accepted (rb_check_arity raises otherwise).
static VALUE
method_sample(int argc, VALUE *argv, VALUE self) {
  VALUE as_strings[1], chars;
  rb_check_arity(argc, 0, 1);
  as_strings[0] = Qtrue;
  chars = method_to_a(1, as_strings, self);
  // After the arity check argc is either 0 or 1.
  if (argc == 0) {
    return rb_funcall(chars, rb_intern("sample"), 0);
  }
  return rb_funcall(chars, rb_intern("sample"), 1, argv[0]);
}

#size ⇒ Object



72
73
74
75
# File 'ext/character_set/character_set.c', line 72

// Implements #count, #length and #size.
//
// Returns whatever enumerator_length(self, 0, 0) reports; that helper is also
// the size callback handed to RETURN_SIZED_ENUMERATOR elsewhere in this file.
static VALUE
method_length(VALUE self) {
  VALUE member_count = enumerator_length(self, 0, 0);
  return member_count;
}

#subset?(other) ⇒ Boolean

Returns:

  • (Boolean)


377
378
379
380
381
# File 'ext/character_set/character_set.c', line 377

// Implements #<= and #subset?.
//
// Returns Qtrue when a_subset_of_b(self, other, ...) reports success, Qfalse
// otherwise. The helper's "proper" out-flag is required by its signature but
// is not used here.
static VALUE
method_subset_p(VALUE self, VALUE other) {
  int ignored_proper_flag;
  if (a_subset_of_b(self, other, &ignored_proper_flag)) {
    return Qtrue;
  }
  return Qfalse;
}

#subtract(other) ⇒ Object



341
342
343
344
345
346
# File 'ext/character_set/character_set.c', line 341

// Implements #subtract.
//
// Clears in the receiver every codepoint whose bit tests as set in `other`
// and returns the receiver.
// Raises (through rb_check_frozen) when the receiver is frozen.
// `cps`, `other_cps` and `cp` are not declared in this function; presumably
// the COMPARE_SETS macro (defined elsewhere in this file) introduces them and
// runs the given statement for each codepoint — TODO confirm.
// NOTE(review): `other` is not checked with is_character_set() here; confirm
// whether COMPARE_SETS or the Ruby-side callers guarantee its type.
static VALUE
method_subtract(VALUE self, VALUE other) {
  rb_check_frozen(self);
  COMPARE_SETS(if (TSTBIT(other_cps, cp)) CLRBIT(cps, cp));
  return self;
}

#superset?(other) ⇒ Boolean

Returns:

  • (Boolean)


390
391
392
393
394
# File 'ext/character_set/character_set.c', line 390

// Implements #>= and #superset?.
//
// Returns Qtrue when a_subset_of_b(other, self, ...) reports success, Qfalse
// otherwise. The helper's "proper" out-flag is required by its signature but
// is not used here.
static VALUE
method_superset_p(VALUE self, VALUE other) {
  int ignored_proper_flag;
  if (a_subset_of_b(other, self, &ignored_proper_flag)) {
    return Qtrue;
  }
  return Qfalse;
}

#to_a(*args) ⇒ Object

returns an Array of Strings of length 1 if passed true.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'ext/character_set/character_set.c', line 86

// Implements #to_a(*args).
//
// Without an argument, or with nil / false, returns an Array of the member
// codepoints as Fixnums. With any other argument, returns an Array holding
// one UTF-8 String (built by rb_enc_uint_chr) per member codepoint.
// At most one argument is accepted (rb_check_arity raises otherwise).
// `cp` is not declared in this function; presumably the
// FOR_EACH_ACTIVE_CODEPOINT macro introduces it — TODO confirm.
static VALUE
method_to_a(int argc, VALUE *argv, VALUE self) {
  VALUE arr;
  rb_encoding *enc;
  int as_strings;
  rb_check_arity(argc, 0, 1);

  // Strings are requested only by an argument that is neither nil nor false.
  as_strings = argc && !NIL_P(argv[0]) && argv[0] != Qfalse;

  arr = rb_ary_new();
  if (as_strings) {
    enc = rb_utf8_encoding();
    FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, rb_enc_uint_chr((int)cp, enc)));
  }
  else {
    FOR_EACH_ACTIVE_CODEPOINT(rb_ary_push(arr, LONG2FIX(cp)));
  }

  return arr;
}

#union(other) ⇒ Object



185
186
187
188
# File 'ext/character_set/character_set.c', line 185

// Implements #+, #| and #union.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them for `self`, `other` and the codepoint
// being visited — TODO confirm against the macro definition.
// The predicate handed to the macro holds when the bit for `cp` tests as set
// in `a`, in `b`, or in both.
static VALUE
method_union(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
}

#used_by?(str) ⇒ Boolean

Returns:

  • (Boolean)


649
650
651
652
653
654
655
656
657
# File 'ext/character_set/character_set.c', line 649

// Implements #used_by?.
//
// Validates `str` with raise_arg_err_unless_string(), runs each_cp() over it
// with the str_cp_not_in_arr callback and the receiver's codepoint array, and
// inverts the outcome: Qtrue when each_cp() returned Qfalse, Qfalse otherwise.
// NOTE(review): presumably each_cp() returns Qfalse as soon as the callback
// rejects a codepoint, i.e. when `str` contains a member — confirm.
static VALUE
method_used_by_p(VALUE self, VALUE str) {
  cp_byte *member_bits;
  VALUE scan_result;
  raise_arg_err_unless_string(str);
  FETCH_CODEPOINTS(self, member_bits);
  scan_result = each_cp(str, str_cp_not_in_arr, member_bits);
  if (scan_result == Qfalse) {
    return Qtrue;
  }
  return Qfalse;
}

#|(other) ⇒ Object



185
186
187
188
# File 'ext/character_set/character_set.c', line 185

// Implements #+, #| and #union.
//
// The whole body is the RETURN_NEW_SET_BASED_ON macro, which is defined
// elsewhere in this file. `a`, `b` and `cp` are not declared in this function;
// presumably the macro introduces them for `self`, `other` and the codepoint
// being visited — TODO confirm against the macro definition.
// The predicate handed to the macro holds when the bit for `cp` tests as set
// in `a`, in `b`, or in both.
static VALUE
method_union(VALUE self, VALUE other) {
  RETURN_NEW_SET_BASED_ON(TSTBIT(a, cp) || TSTBIT(b, cp));
}