Method: String#split

Defined in:
string.c

#split(pattern = nil, [limit]) ⇒ Array
#split(pattern = nil, [limit]) {|sub| ... } ⇒ String

Divides str into substrings based on a delimiter, returning an array of these substrings.

If pattern is a String, then its contents are used as the delimiter when splitting str. If pattern is a single space, str is split on whitespace, with leading and trailing whitespace and runs of contiguous whitespace characters ignored.

If pattern is a Regexp, str is divided where the pattern matches. Whenever the pattern matches a zero-length string, str is split into individual characters. If pattern contains groups, the respective matches will be returned in the array as well.

If pattern is nil, the value of $; is used. If $; is nil (which is the default), str is split on whitespace as if ‘ ’ were specified.

If the limit parameter is omitted, trailing null fields are suppressed. If limit is a positive number, at most that number of split substrings will be returned (captured groups will be returned as well, but are not counted towards the limit). If limit is 1, the entire string is returned as the only entry in an array. If negative, there is no limit to the number of fields returned, and trailing null fields are not suppressed.

When the input str is empty, an empty Array is returned, as the string is considered to have no fields to split.

" now's  the time ".split       #=> ["now's", "the", "time"]
" now's  the time ".split(' ')  #=> ["now's", "the", "time"]
" now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
"1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
"hello".split(//)               #=> ["h", "e", "l", "l", "o"]
"hello".split(//, 3)            #=> ["h", "e", "llo"]
"hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]

"mellow yellow".split("ello")   #=> ["m", "w y", "w"]
"1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
"1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
"1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]

"1:2:3".split(/(:)()()/, 2)     #=> ["1", ":", "", "", "2:3"]

"".split(',', -1)               #=> []

If a block is given, the block is invoked with each split substring, and str itself is returned instead of an array.

Overloads:

  • #split(pattern = nil, [limit]) ⇒ Array

    Returns:

  • #split(pattern = nil, [limit]) {|sub| ... } ⇒ String

    Yields:

    Returns:



8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
# File 'string.c', line 8150

/*
 * Implementation of String#split.
 *
 * argc/argv carry up to two optional arguments (scanned with "02"):
 * the split pattern and the limit.  str is the receiver.
 *
 * Without a block the substrings are collected into a new array, which is
 * returned.  With a block each substring is handed to split_string() with
 * a false `result` (presumably yielded there -- split_string is defined
 * elsewhere) and str itself is returned.
 *
 * Raises TypeError when no pattern is given and $; holds something that
 * rb_fs_check() rejects.
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    split_type_t split_type;
    /* beg: byte offset where the pending field starts; end: scratch offset
     * whose meaning depends on the branch; i: number of fields counted
     * against the limit; empty_count: state threaded through split_string()
     * via SPLIT_STR. */
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    /* Until the array is allocated below, `result` is only a flag that is
     * tested as a C boolean: Qfalse when a block is given, Qnil otherwise
     * (this relies on Qnil being non-zero in C). */
    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
	lim = NUM2INT(limit);
	/* A zero or negative limit does not cap the field count, so `limit`
	 * is cleared; the sign is still kept in `lim` for the checks below. */
	if (lim <= 0) limit = Qnil;
	else if (lim == 1) {
	    /* limit == 1: the whole string is the only field.  An empty
	     * receiver yields an empty array (or str when a block is given). */
	    if (RSTRING_LEN(str) == 0)
                return result ? rb_ary_new2(0) : str;
            tmp = str_duplicate(rb_cString, str);
	    if (!result) {
		rb_yield(tmp);
                return str;
	    }
	    return rb_ary_new3(1, tmp);
	}
	/* Field counter; only consulted when `limit` is non-nil. */
	i = 1;
    }
    /* Limit omitted or given as 0: switch empty_count from -1 to 0.
     * NOTE(review): presumably this is what makes split_string() suppress
     * trailing empty fields -- confirm against split_string(). */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = SPLIT_TYPE_REGEXP;
    /* Choose the pattern: an explicit argument, else the value of rb_fs
     * ($;), else awk-style whitespace splitting when that is nil too. */
    if (!NIL_P(spat)) {
	spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
	split_type = SPLIT_TYPE_AWK;
    }
    else if (!(spat = rb_fs_check(spat))) {
	rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    else {
        rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
    }
    if (split_type != SPLIT_TYPE_AWK) {
        /* Refine the strategy based on what the pattern actually is. */
        switch (BUILTIN_TYPE(spat)) {
          case T_REGEXP:
            rb_reg_options(spat); /* check if uninitialized */
            tmp = RREGEXP_SRC(spat);
            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
            /* A regexp whose source is classified as the awk pattern does
             * not get awk semantics: it is split as a plain string on its
             * source text instead. */
            if (split_type == SPLIT_TYPE_AWK) {
                spat = tmp;
                split_type = SPLIT_TYPE_STRING;
            }
            break;

          case T_STRING:
	    mustnot_broken(spat);
            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
            break;

          default:
            UNREACHABLE_RETURN(Qnil);
	}
    }

/* Emit the substring str[beg, len] through split_string() and store the
 * empty_count value it returns. */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    /* No block: replace the flag value with the real result array. */
    if (result) result = rb_ary_new();
    beg = 0;
    char *ptr = RSTRING_PTR(str);
    char *eptr = RSTRING_END(str);
    if (split_type == SPLIT_TYPE_AWK) {
	/* Whitespace splitting.  skip != 0 means we are between fields
	 * (consuming whitespace); `end` tracks the offset just past the
	 * last non-space character of the current field. */
	char *bptr = ptr;
	int skip = 1;
	unsigned int c;

	end = beg;
	if (is_ascii_string(str)) {
	    /* Single-byte path: one byte is one character. */
	    while (ptr < eptr) {
		c = (unsigned char)*ptr++;
		if (skip) {
		    if (ascii_isspace(c)) {
			beg = ptr - bptr;
		    }
		    else {
			end = ptr - bptr;
			skip = 0;
			/* A new field starts here; if the limit is already
			 * reached, stop so the rest of the string is emitted
			 * as the final field after the branches. */
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (ascii_isspace(c)) {
		    /* Whitespace terminates the current field. */
		    SPLIT_STR(beg, end-beg);
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
	else {
	    /* Same state machine, but decoding one code point (n bytes) at a
	     * time in the string's encoding. */
	    while (ptr < eptr) {
		int n;

		c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
		ptr += n;
		if (skip) {
		    if (rb_isspace(c)) {
			beg = ptr - bptr;
		    }
		    else {
			end = ptr - bptr;
			skip = 0;
			if (!NIL_P(limit) && lim <= i) break;
		    }
		}
		else if (rb_isspace(c)) {
		    SPLIT_STR(beg, end-beg);
		    skip = 1;
		    beg = ptr - bptr;
		    if (!NIL_P(limit)) ++i;
		}
		else {
		    end = ptr - bptr;
		}
	    }
	}
    }
    else if (split_type == SPLIT_TYPE_STRING) {
	/* Literal string delimiter.  Here `end` is the offset, relative to
	 * ptr, of the next delimiter occurrence; a negative value from
	 * rb_memsearch() ends the loop. */
	char *str_start = ptr;
	char *substr_start = ptr;
	char *sptr = RSTRING_PTR(spat);
	long slen = RSTRING_LEN(spat);

	mustnot_broken(str);
	enc = rb_enc_check(str, spat);
	while (ptr < eptr &&
	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
	    /* Check we are at the start of a char */
	    char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
	    if (t != ptr + end) {
		/* The byte match is not on a character boundary: resume the
		 * search from the character head returned above, keeping
		 * substr_start unchanged. */
		ptr = t;
		continue;
	    }
	    SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
	    ptr += end + slen;
	    substr_start = ptr;
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
	/* Whatever follows the last delimiter is handled after the branches. */
	beg = ptr - str_start;
    }
    else if (split_type == SPLIT_TYPE_CHARS) {
        /* One field per character; n is the byte length of the character
         * at ptr.  The loop also stops if that length is not positive. */
        char *str_start = ptr;
        int n;

        mustnot_broken(str);
        enc = rb_enc_get(str);
        while (ptr < eptr &&
               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
            SPLIT_STR(ptr - str_start, n);
            ptr += n;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else {
	/* General regexp splitting.  `start` is where the next search
	 * begins; `beg` is where the pending field begins.  last_null is set
	 * after an empty match at `start` so that the following iteration
	 * can emit a single character. */
	long len = RSTRING_LEN(str);
	long start = beg;
	long idx;
	int last_null = 0;
	struct re_registers *regs;
        VALUE match = 0;

        /* The increment expression runs on `continue` as well: it unbusies
         * the current match and stores it back as the backref. */
        for (; rb_reg_search(spat, str, start, 0) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            /* Only done when a block is given.
             * NOTE(review): presumably protects the match data from being
             * reused while the block runs -- confirm. */
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
            end = BEG(0);
	    if (start == end && BEG(0) == END(0)) {
		/* Zero-length match exactly at the search start. */
		if (!ptr) {
		    SPLIT_STR(0, 0);
		    break;
		}
		else if (last_null == 1) {
		    /* Second empty match in a row: emit the one character
		     * that was stepped over on the previous iteration. */
                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
		    beg = start;
		}
		else {
		    /* First empty match: advance the search start by one
		     * character (or one past the end of the string) and
		     * search again before emitting anything. */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
		    last_null = 1;
		    continue;
		}
	    }
	    else {
		/* Ordinary match: emit the text before it and resume after it. */
		SPLIT_STR(beg, end-beg);
		beg = start = END(0);
	    }
	    last_null = 0;

	    /* Append every capture group that took part in the match; these
	     * do not advance the limit counter i. */
	    for (idx=1; idx < regs->num_regs; idx++) {
		if (BEG(idx) == -1) continue;
		SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
	    }
	    if (!NIL_P(limit) && lim <= ++i) break;
	}
        /* NOTE(review): unbusy is called here (and in the loop increment)
         * even when rb_match_busy() was not called above -- confirm that
         * this is harmless. */
        if (match) rb_match_unbusy(match);
    }
    /* Emit the tail str[beg..] for a non-empty receiver when a positive
     * limit was given, when text remains after the last delimiter, or when
     * the limit was negative. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
	SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    /* The array without a block; the receiver itself with one. */
    return result ? result : str;
}