Method: String#split

Defined in:
string.c

#split(pattern = nil, [limit]) ⇒ Array

Divides str into substrings based on a delimiter, returning an array of these substrings.

If pattern is a String, then its contents are used as the delimiter when splitting str. If pattern is a single space, str is split on whitespace, with leading whitespace and runs of contiguous whitespace characters ignored.

If pattern is a Regexp, str is divided where the pattern matches. Whenever the pattern matches a zero-length string, str is split into individual characters. If pattern contains groups, the respective matches will be returned in the array as well.

If pattern is nil, the value of $; is used. If $; is nil (which is the default), str is split on whitespace as if ' ' were specified.

If the limit parameter is omitted, trailing null fields are suppressed. If limit is a positive number, at most that number of fields will be returned (if limit is 1, the entire string is returned as the only entry in an array). If negative, there is no limit to the number of fields returned, and trailing null fields are not suppressed.

When the input str is empty, an empty Array is returned, as the string is considered to have no fields to split.

" now's  the time".split        #=> ["now's", "the", "time"]
" now's  the time".split(' ')   #=> ["now's", "the", "time"]
" now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
"1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
"hello".split(//)               #=> ["h", "e", "l", "l", "o"]
"hello".split(//, 3)            #=> ["h", "e", "llo"]
"hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]

"mellow yellow".split("ello")   #=> ["m", "w y", "w"]
"1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
"1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
"1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]

"".split(',', -1)               #=> []


6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
# File 'string.c', line 6762

/*
 * Implementation of String#split(pattern = nil, [limit]).
 *
 * argc/argv carry zero to two optional arguments (pattern, limit), as
 * parsed by rb_scan_args with the "02" spec.  str is the receiver.
 *
 * Returns a newly created array holding the pieces of str.
 *
 * Splitting runs in one of three modes:
 *   awk    - no pattern (and rb_fs is nil), or a pattern that is exactly
 *            one space: split on runs of whitespace, skipping leading
 *            whitespace.
 *   string - any other non-empty String pattern: split on literal
 *            occurrences located with rb_memsearch.
 *   regexp - a non-String pattern, or an empty String (which is compiled
 *            into a regexp so that the string is split into characters).
 *
 * Limit handling:
 *   lim == 1 : returns [str] immediately (or [] when str is empty).
 *   lim  > 1 : i starts at 1 and splitting stops once lim <= i, so at
 *              most lim fields are produced (captured groups in regexp
 *              mode are pushed in addition).
 *   lim <= 0 : limit is reset to nil so the counter is ignored.  With
 *              lim == 0 (also the value when no limit was given) trailing
 *              empty strings are popped from the result; with lim < 0
 *              they are kept.
 */
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    enum {awk, string, regexp} split_type;
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* A limit of one means "no splitting at all". */
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        /* The field currently being collected already counts as one. */
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat) && NIL_P(spat = rb_fs)) {
        /* No pattern given and no global field separator set. */
        split_type = awk;
    }
    else {
        spat = get_pat_quoted(spat, 0);
        if (BUILTIN_TYPE(spat) == T_STRING) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            mustnot_broken(spat);
            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Special case - split into chars */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* A single-byte " " pattern selects awk-style splitting. */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
                    split_type = awk;
                }
            }
            else {
                /*
                 * Pattern encoding is not ASCII compatible: decode the
                 * first character and require it to be a space that spans
                 * the entire pattern.
                 */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
        else {
            split_type = regexp;
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        int skip = 1;           /* non-zero while consuming whitespace between fields */
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* Byte-at-a-time scan, taken when is_ascii_string(str) holds. */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* Limit reached: the rest of str becomes the last field. */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* Same state machine, advancing one encoded character at a time. */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);
        char *str_start = ptr;
        /*
         * Start of the field currently being collected.  This is tracked
         * separately from ptr (the search cursor) because ptr may be moved
         * forward past a rejected hit without a field having been emitted;
         * using ptr as the field start in that case would drop the bytes
         * in front of the rejected hit.
         */
        char *substr_start = ptr;
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* Hit is not on a character boundary: resume searching at t. */
                ptr = t;
                continue;
            }
            rb_ary_push(result, rb_str_subseq(str, substr_start - str_start,
                                              (ptr + end) - substr_start));
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        /* The remaining tail starts at the last unfinished field. */
        beg = substr_start - str_start;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        int last_null = 0;      /* set after an empty match made us step forward one character */
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* Zero-length match at the current search position. */
                if (!ptr) {
                    rb_ary_push(result, str_new_empty(str));
                    break;
                }
                else if (last_null == 1) {
                    /* Emit the single character at beg as its own field. */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_fast_mbclen(ptr+beg,
                                                                         ptr+len,
                                                                         enc)));
                    beg = start;
                }
                else {
                    /* Step the search position forward and retry the match. */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* Append every capture group that took part in the match. */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = str_new_empty(str);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /*
     * Push whatever follows the last delimiter.  An empty tail is pushed
     * only when a positive limit is in effect or lim is negative.
     */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = str_new_empty(str);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* No limit (or a limit of zero): drop trailing empty strings. */
    if (NIL_P(limit) && lim == 0) {
        long len;
        while ((len = RARRAY_LEN(result)) > 0 &&
               (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
            rb_ary_pop(result);
    }

    return result;
}