Method: String#split

Defined in:
string.c

#split(pattern = $;, [limit]) ⇒ Array

Divides str into substrings based on a delimiter, returning an array of these substrings.

If pattern is a String, then its contents are used as the delimiter when splitting str. If pattern is a single space, str is split on whitespace, with leading whitespace and runs of contiguous whitespace characters ignored.

If pattern is a Regexp, str is divided where the pattern matches. Whenever the pattern matches a zero-length string, str is split into individual characters. If pattern contains groups, the respective matches will be returned in the array as well.

If pattern is omitted, the value of $; is used. If $; is nil (which is the default), str is split on whitespace as if ' ' (a single space) were specified.

If the limit parameter is omitted or zero, trailing null fields are suppressed. If limit is a positive number, at most that number of fields will be returned (if limit is 1, the entire string is returned as the only entry in an array). If negative, there is no limit to the number of fields returned, and trailing null fields are not suppressed.

When the input str is empty, an empty Array is returned, because the string is considered to have no fields to split.

" now's  the time".split        #=> ["now's", "the", "time"]
" now's  the time".split(' ')   #=> ["now's", "the", "time"]
" now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
"1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
"hello".split(//)               #=> ["h", "e", "l", "l", "o"]
"hello".split(//, 3)            #=> ["h", "e", "llo"]
"hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]

"mellow yellow".split("ello")   #=> ["m", "w y", "w"]
"1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
"1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
"1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]

"".split(',', -1)               #=> []

Returns:



6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
# File 'string.c', line 6355

/*
 * Implementation of String#split(pattern = $;, [limit]).
 *
 * argv[0] (optional) is the pattern, argv[1] (optional) is the limit.
 * Returns a new Array holding the pieces of +str+.
 *
 * Three strategies are used, chosen from the pattern:
 *   awk    - no pattern (and rb_fs is nil), or a pattern that is a single
 *            space: split on runs of whitespace, ignoring leading whitespace.
 *   string - a non-empty String pattern: split on literal occurrences.
 *   regexp - a Regexp pattern, or an empty String (compiled to a Regexp so
 *            the string is split into characters).
 *
 * Each strategy pushes the complete fields it finds and leaves +beg+ at the
 * byte offset where the unsplit remainder starts; the remainder is appended
 * (or not) by the common code at the end of the function.
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* A non-positive limit means "no limit": limit is reset to nil, but
         * lim keeps its value so the trailing-field handling at the end can
         * still tell 0 (strip trailing empties) from negative (keep them). */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* Limit 1: the whole string is the only field (none if empty). */
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        /* i is compared against lim to stop splitting; it starts at 1 so the
         * remainder appended at the end is counted as a field. */
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
        if (!NIL_P(rb_fs)) {
            /* No explicit pattern: fall back to rb_fs and classify it like
             * an explicit pattern. */
            spat = rb_fs;
            goto fs_set;
        }
        split_type = awk;
    }
    else {
      fs_set:
        spat = get_pat_quoted(spat, 0);
        if (BUILTIN_TYPE(spat) == T_STRING) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            mustnot_broken(spat);
            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Special case - split into chars */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* A lone ASCII space selects whitespace (awk) splitting. */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
                    split_type = awk;
                }
            }
            else {
                /* Non-ASCII-compatible encoding: the pattern is a lone space
                 * if its first character is ' ' and spans the whole string. */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
        else {
            split_type = regexp;
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        int skip = 1;   /* non-zero while skipping whitespace between fields */
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* Byte-at-a-time scan for strings is_ascii_string() accepts. */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        /* First byte of a new field. */
                        end = ptr - bptr;
                        skip = 0;
                        /* Limit reached: leave beg at this field's start so
                         * the rest of the string becomes the last field. */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    /* Whitespace ends the current field. */
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* Same state machine, advancing one encoded character at a time. */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);       /* search cursor */
        char *str_start = ptr;
        char *substr_start = ptr;           /* start of the field being built */
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* The byte match starts inside a multibyte character (e.g. a
                 * trail byte that equals an ASCII byte), so it is not a real
                 * delimiter.  Resume searching after that character, but keep
                 * substr_start where it is so the skipped text stays part of
                 * the current field. */
                ptr = t;
                continue;
            }
            rb_ary_push(result, rb_str_subseq(str, substr_start - str_start,
                                              (ptr + end) - substr_start));
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        /* The remainder begins at the current field start, not at the search
         * cursor, which may have moved past rejected matches. */
        beg = substr_start - str_start;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        int last_null = 0;  /* set after a zero-length match at start was deferred */
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* Zero-length match exactly at the search position. */
                if (!ptr) {
                    rb_ary_push(result, str_new_empty(str));
                    break;
                }
                else if (last_null == 1) {
                    /* Second consecutive empty match: emit the single
                     * character at beg as a field. */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_fast_mbclen(ptr+beg,
                                                                         ptr+len,
                                                                         enc)));
                    beg = start;
                }
                else {
                    /* First empty match: step the search position over one
                     * character (or one byte at end of string) and retry. */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* Append the text of every capture group that participated. */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = str_new_empty(str);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /* Append the remainder starting at beg.  An empty remainder is appended
     * only when a positive limit was given or the limit was negative. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = str_new_empty(str);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* Limit omitted or zero: drop trailing empty fields. */
    if (NIL_P(limit) && lim == 0) {
        long len;
        while ((len = RARRAY_LEN(result)) > 0 &&
               (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
            rb_ary_pop(result);
    }

    return result;
}

Comments