Class: Regexp

Inherits:
Object show all
Defined in:
re.c,
re.c

Overview

A Regexp holds a regular expression, used to match a pattern against strings. Regexps are created using the /.../ and %r{...} literals, and by the Regexp::new constructor.

:include: doc/regexp.rdoc

Constant Summary collapse

IGNORECASE =

see Regexp.options and Regexp.new

INT2FIX(ONIG_OPTION_IGNORECASE)
EXTENDED =

see Regexp.options and Regexp.new

INT2FIX(ONIG_OPTION_EXTEND)
MULTILINE =

see Regexp.options and Regexp.new

INT2FIX(ONIG_OPTION_MULTILINE)
FIXEDENCODING =

see Regexp.options and Regexp.new

INT2FIX(ARG_ENCODING_FIXED)
NOENCODING =

see Regexp.options and Regexp.new

INT2FIX(ARG_ENCODING_NONE)

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#new(string, [options]) ⇒ Regexp #new(regexp) ⇒ Regexp #compile(string, [options]) ⇒ Regexp #compile(regexp) ⇒ Regexp

Constructs a new regular expression from pattern, which can be either a String or a Regexp. If a Regexp is given, that regexp’s options are propagated and new options may not be specified (a change as of Ruby 1.8).

If options is an Integer, it should be one or more of the constants Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE, or-ed together. Otherwise, if options is not nil or false, the regexp will be case insensitive.

r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
r2 = Regexp.new('cat', true)     #=> /cat/i
r3 = Regexp.new(r2)              #=> /cat/i
r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix

Overloads:

  • #new(string, [options]) ⇒ Regexp
  • #new(regexp) ⇒ Regexp
  • #compile(string, [options]) ⇒ Regexp
  • #compile(regexp) ⇒ Regexp


3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
# File 're.c', line 3425

/*
 * Implements Regexp#initialize (reached through Regexp.new / Regexp.compile).
 *
 * Accepts one to three arguments:
 *   argv[0]  the pattern. A Regexp contributes its own source and options;
 *            any further arguments are then ignored with a "flags ignored"
 *            warning. Anything else is converted with StringValue().
 *   argv[1]  optional options. A Fixnum is used as the flag bits; any other
 *            truthy value selects ONIG_OPTION_IGNORECASE.
 *   argv[2]  optional kcode string (skipped when nil). A leading 'n' or 'N'
 *            selects the ASCII-8BIT encoding and adds ARG_ENCODING_NONE to
 *            the flags; every other value is reported as ignored.
 *
 * Returns self.
 */
static VALUE
rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
{
    rb_encoding *enc = 0;
    int flags = 0;
    VALUE str;

    rb_check_arity(argc, 1, 3);

    if (!RB_TYPE_P(argv[0], T_REGEXP)) {
        if (argc >= 2) {
            VALUE opts = argv[1];

            if (FIXNUM_P(opts)) {
                flags = FIX2INT(opts);
            }
            else if (RTEST(opts)) {
                flags = ONIG_OPTION_IGNORECASE;
            }
        }
        if (argc == 3 && !NIL_P(argv[2])) {
            /* The kcode argument is converted before the pattern itself. */
            char *kcode = StringValuePtr(argv[2]);
            const char lead = kcode[0];

            if (lead != 'n' && lead != 'N') {
                rb_warn("encoding option is ignored - %s", kcode);
            }
            else {
                enc = rb_ascii8bit_encoding();
                flags |= ARG_ENCODING_NONE;
            }
        }
        str = StringValue(argv[0]);
    }
    else {
        VALUE src_re = argv[0];

        if (argc > 1) {
            rb_warn("flags ignored");
        }
        rb_reg_check(src_re);
        flags = rb_reg_options(src_re);
        str = RREGEXP_SRC(src_re);
    }

    /* Only take the encoding-aware path when an explicit encoding was
     * requested and it differs from the one the source string carries. */
    if (!enc || rb_enc_get(str) == enc) {
        rb_reg_init_str(self, str, flags);
    }
    else {
        rb_reg_init_str_enc(self, str, enc, flags);
    }
    return self;
}

Class Method Details

.compile ⇒ Object

Alias for Regexp.new

.escape(str) ⇒ String .quote(str) ⇒ String

Escapes any characters that would have special meaning in a regular expression. Returns a new escaped string with the same or compatible encoding. For any string, Regexp.new(Regexp.escape(str))=~str will be true.

Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.

Overloads:



3578
3579
3580
3581
3582
# File 're.c', line 3578

/*
 * Implements Regexp.escape / Regexp.quote.
 *
 * str is first passed through reg_operand(str, TRUE) and the result is
 * handed to rb_reg_quote(), whose return value is returned unchanged.
 * The first parameter, c, is not used.
 */
static VALUE
rb_reg_s_quote(VALUE c, VALUE str)
{
    VALUE operand = reg_operand(str, TRUE);

    return rb_reg_quote(operand);
}

.last_match ⇒ MatchData .last_match(n) ⇒ String

The first form returns the MatchData object generated by the last successful pattern match. Equivalent to reading the special global variable $~ (see Special global variables in Regexp for details).

The second form returns the nth field in this MatchData object. n can be a string or symbol to reference a named capture.

Note that the last_match is local to the thread and method scope of the method that did the pattern match.

/c(.)t/ =~ 'cat'        #=> 0
Regexp.last_match       #=> #<MatchData "cat" 1:"a">
Regexp.last_match(0)    #=> "cat"
Regexp.last_match(1)    #=> "a"
Regexp.last_match(2)    #=> nil

/(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
Regexp.last_match(:lhs) #=> "var"
Regexp.last_match(:rhs) #=> "val"

Overloads:



3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
# File 're.c', line 3983

/*
 * Implements Regexp.last_match.
 *
 * With no argument, returns whatever match_getter() yields. With one
 * argument, looks up the current backref: returns nil when there is none,
 * otherwise resolves argv[0] to a group number via match_backref_number()
 * and returns that group from rb_reg_nth_match().
 * The third parameter (the receiver slot) is not used.
 */
static VALUE
rb_reg_s_last_match(int argc, VALUE *argv, VALUE _)
{
    VALUE match;
    int nth;

    if (rb_check_arity(argc, 0, 1) != 1) {
        return match_getter();
    }

    match = rb_backref_get();
    if (NIL_P(match)) {
        return Qnil;
    }
    nth = match_backref_number(match, argv[0]);
    return rb_reg_nth_match(nth, match);
}

.escape(str) ⇒ String .quote(str) ⇒ String

Escapes any characters that would have special meaning in a regular expression. Returns a new escaped string with the same or compatible encoding. For any string, Regexp.new(Regexp.escape(str))=~str will be true.

Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.

Overloads:



3578
3579
3580
3581
3582
# File 're.c', line 3578

/*
 * Implements Regexp.escape / Regexp.quote.
 *
 * str is first passed through reg_operand(str, TRUE) and the result is
 * handed to rb_reg_quote(), whose return value is returned unchanged.
 * The first parameter, c, is not used.
 */
static VALUE
rb_reg_s_quote(VALUE c, VALUE str)
{
    VALUE operand = reg_operand(str, TRUE);

    return rb_reg_quote(operand);
}

.try_convert(obj) ⇒ Regexp?

Tries to convert obj into a Regexp, using its to_regexp method. Returns the converted regexp, or nil if obj cannot be converted for any reason.

Regexp.try_convert(/re/)         #=> /re/
Regexp.try_convert("re")         #=> nil

o = Object.new
Regexp.try_convert(o)            #=> nil
def o.to_regexp() /foo/ end
Regexp.try_convert(o)            #=> /foo/

Returns:

  • (Regexp, nil)


3619
3620
3621
3622
3623
# File 're.c', line 3619

/*
 * Implements Regexp.try_convert.
 *
 * Delegates to rb_check_regexp_type(re) and returns its result as is.
 * The first parameter, dummy, is not used.
 */
static VALUE
rb_reg_s_try_convert(VALUE dummy, VALUE re)
{
    VALUE converted = rb_check_regexp_type(re);

    return converted;
}

.union(pat1, pat2, ...) ⇒ Regexp .union(pats_ary) ⇒ Regexp

Returns a Regexp object that is the union of the given patterns, i.e., will match any of its parts. The patterns can be Regexp objects, in which case their options will be preserved, or Strings. If no patterns are given, returns /(?!)/. The behavior is unspecified if any given pattern contains a capture.

Regexp.union                         #=> /(?!)/
Regexp.union("penzance")             #=> /penzance/
Regexp.union("a+b*c")                #=> /a\+b\*c/
Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/

Note: ::union will try to convert each of its arguments into a Regexp via #to_regexp.

Overloads:



3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
# File 're.c', line 3761

/*
 * Implements Regexp.union.
 *
 * args is the array of arguments. When it holds exactly one element and
 * that element converts to an array (rb_check_array_type() returns non-nil),
 * the union is built from that inner array; otherwise it is built from
 * args itself.
 */
static VALUE
rb_reg_s_union_m(VALUE self, VALUE args)
{
    if (RARRAY_LEN(args) == 1) {
        VALUE list = rb_check_array_type(rb_ary_entry(args, 0));

        if (!NIL_P(list)) {
            return rb_reg_s_union(self, list);
        }
    }
    return rb_reg_s_union(self, args);
}

Instance Method Details

#==(other_rxp) ⇒ Boolean #eql?(other_rxp) ⇒ Boolean

Equality—Two regexps are equal if their patterns are identical, they have the same character set code, and their casefold? values are the same.

/abc/  == /abc/x   #=> false
/abc/  == /abc/i   #=> false
/abc/  == /abc/u   #=> false
/abc/u == /abc/n   #=> false

Overloads:

  • #==(other_rxp) ⇒ Boolean

    Returns:

    • (Boolean)
  • #eql?(other_rxp) ⇒ Boolean

    Returns:

    • (Boolean)


3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
# File 're.c', line 3030

/*
 * Implements Regexp#== and Regexp#eql?.
 *
 * Returns Qtrue when re1 and re2 are the same object, or when re2 is a
 * Regexp and both agree on: the KCODE_FIXED flag, the compiled options,
 * the source length, the encoding, and the source bytes. Returns Qfalse
 * in every other case.
 */
static VALUE
rb_reg_equal(VALUE re1, VALUE re2)
{
    if (re1 == re2) {
        return Qtrue;
    }
    if (!RB_TYPE_P(re2, T_REGEXP)) {
        return Qfalse;
    }

    rb_reg_check(re1);
    rb_reg_check(re2);

    /* Cheap attribute comparisons first; the byte comparison runs last. */
    if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED) ||
        RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options ||
        RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2) ||
        ENCODING_GET(re1) != ENCODING_GET(re2)) {
        return Qfalse;
    }

    return memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0
        ? Qtrue
        : Qfalse;
}

#===(str) ⇒ Boolean

Case Equality—Used in case statements.

a = "HELLO"
case a
when /\A[a-z]*\z/; print "Lower case\n"
when /\A[A-Z]*\z/; print "Upper case\n"
else;              print "Mixed case\n"
end
#=> "Upper case"

Following a regular expression literal with the #=== operator allows you to compare against a String.

/^[a-z]*$/ === "HELLO" #=> false
/^[A-Z]*$/ === "HELLO" #=> true

Returns:

  • (Boolean)


3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
# File 're.c', line 3215

/*
 * Implements Regexp#=== (case equality).
 *
 * str is normalised with reg_operand(str, FALSE). If that yields nil the
 * backref is cleared and Qfalse is returned. Otherwise the regexp is
 * searched from position 0 and the result is Qtrue unless rb_reg_search()
 * reports a negative position.
 */
VALUE
rb_reg_eqq(VALUE re, VALUE str)
{
    VALUE target = reg_operand(str, FALSE);

    if (NIL_P(target)) {
        rb_backref_set(Qnil);
        return Qfalse;
    }
    return rb_reg_search(re, target, 0, 0) < 0 ? Qfalse : Qtrue;
}

#=~(str) ⇒ Integer?

Match—Matches rxp against str.

/at/ =~ "input data"   #=> 7
/ax/ =~ "input data"   #=> nil

If =~ is used with a regexp literal with named captures, the captured strings (or nil) are assigned to local variables named by the capture names.

/(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
p lhs    #=> "x"
p rhs    #=> "y"

If it is not matched, nil is assigned for the variables.

/(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
p lhs    #=> nil
p rhs    #=> nil

This assignment is implemented in the Ruby parser. The parser detects ‘regexp-literal =~ expression’ for the assignment. The regexp must be a literal without interpolation and placed at left hand side.

The assignment does not occur if the regexp is not a literal.

re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
re =~ "  x = y  "
p lhs    # undefined local variable
p rhs    # undefined local variable

A regexp interpolation, #{}, also disables the assignment.

rhs_pat = /(?<rhs>\w+)/
/(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
p lhs    # undefined local variable

The assignment does not occur if the regexp is placed at the right hand side.

"  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
p lhs, rhs # undefined local variable

Returns:



3185
3186
3187
3188
3189
3190
3191
3192
# File 're.c', line 3185

/*
 * Implements Regexp#=~.
 *
 * Runs reg_match_pos() from position 0 (str is passed by address, so the
 * helper may replace it). A negative result means no match and yields nil.
 * Otherwise the position is passed through rb_str_sublen() before being
 * returned as a Fixnum -- presumably converting a byte offset into a
 * character index; confirm against rb_str_sublen().
 */
VALUE
rb_reg_match(VALUE re, VALUE str)
{
    long found = reg_match_pos(re, &str, 0);

    if (found >= 0) {
        long index = rb_str_sublen(str, found);

        return LONG2FIX(index);
    }
    return Qnil;
}

#casefold? ⇒ Boolean

Returns the value of the case-insensitive flag.

/a/.casefold?           #=> false
/a/i.casefold?          #=> true
/(?i:a)/.casefold?      #=> false

Returns:

  • (Boolean)


720
721
722
723
724
725
726
# File 're.c', line 720

/*
 * Implements Regexp#casefold?.
 *
 * After rb_reg_check(re), returns Qtrue when ONIG_OPTION_IGNORECASE is set
 * in the compiled regexp's options and Qfalse otherwise.
 */
static VALUE
rb_reg_casefold_p(VALUE re)
{
    rb_reg_check(re);
    return (RREGEXP_PTR(re)->options & ONIG_OPTION_IGNORECASE) ? Qtrue : Qfalse;
}

#encoding ⇒ Encoding

Returns the Encoding object that represents the encoding of obj.

Returns:



1013
1014
1015
1016
1017
1018
1019
1020
1021
# File 'encoding.c', line 1013

/*
 * Returns the Encoding object associated with obj.
 *
 * The encoding index is fetched with rb_enc_get_index(); a negative index
 * raises TypeError ("unknown encoding"). The index is masked with
 * ENC_INDEX_MASK before being turned into an Encoding object.
 */
VALUE
rb_obj_encoding(VALUE obj)
{
    const int enc_index = rb_enc_get_index(obj);

    if (!(enc_index >= 0)) {
        rb_raise(rb_eTypeError, "unknown encoding");
    }

    return rb_enc_from_encoding_index(enc_index & ENC_INDEX_MASK);
}

#==(other_rxp) ⇒ Boolean #eql?(other_rxp) ⇒ Boolean

Equality—Two regexps are equal if their patterns are identical, they have the same character set code, and their casefold? values are the same.

/abc/  == /abc/x   #=> false
/abc/  == /abc/i   #=> false
/abc/  == /abc/u   #=> false
/abc/u == /abc/n   #=> false

Overloads:

  • #==(other_rxp) ⇒ Boolean

    Returns:

    • (Boolean)
  • #eql?(other_rxp) ⇒ Boolean

    Returns:

    • (Boolean)


3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
# File 're.c', line 3030

/*
 * Implements Regexp#== and Regexp#eql?.
 *
 * Returns Qtrue when re1 and re2 are the same object, or when re2 is a
 * Regexp and both agree on: the KCODE_FIXED flag, the compiled options,
 * the source length, the encoding, and the source bytes. Returns Qfalse
 * in every other case.
 */
static VALUE
rb_reg_equal(VALUE re1, VALUE re2)
{
    if (re1 == re2) {
        return Qtrue;
    }
    if (!RB_TYPE_P(re2, T_REGEXP)) {
        return Qfalse;
    }

    rb_reg_check(re1);
    rb_reg_check(re2);

    /* Cheap attribute comparisons first; the byte comparison runs last. */
    if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED) ||
        RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options ||
        RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2) ||
        ENCODING_GET(re1) != ENCODING_GET(re2)) {
        return Qfalse;
    }

    return memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0
        ? Qtrue
        : Qfalse;
}

#fixed_encoding? ⇒ Boolean

Returns false if rxp is applicable to a string with any ASCII compatible encoding. Returns true otherwise.

r = /a/
r.fixed_encoding?                               #=> false
r =~ "\u{6666} a"                               #=> 2
r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
r =~ "abc".force_encoding("euc-jp")             #=> 0

r = /a/u
r.fixed_encoding?                               #=> true
r.encoding                                      #=> #<Encoding:UTF-8>
r =~ "\u{6666} a"                               #=> 2
r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> Encoding::CompatibilityError
r =~ "abc".force_encoding("euc-jp")             #=> 0

r = /\u{6666}/
r.fixed_encoding?                               #=> true
r.encoding                                      #=> #<Encoding:UTF-8>
r =~ "\u{6666} a"                               #=> 0
r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> Encoding::CompatibilityError
r =~ "abc".force_encoding("euc-jp")             #=> nil

Returns:

  • (Boolean)


1386
1387
1388
1389
1390
1391
1392
1393
# File 're.c', line 1386

/*
 * Implements Regexp#fixed_encoding?.
 *
 * Returns Qtrue when the KCODE_FIXED flag is set on re, Qfalse otherwise.
 */
static VALUE
rb_reg_fixed_encoding_p(VALUE re)
{
    return FL_TEST(re, KCODE_FIXED) ? Qtrue : Qfalse;
}

#hash ⇒ Integer

Produce a hash based on the text and options of this regular expression.

See also Object#hash.

Returns:



2996
2997
2998
2999
3000
3001
# File 're.c', line 2996

/*
 * Implements Regexp#hash.
 *
 * Computes the hash with reg_hash(re) and converts it to a Ruby Integer
 * with ST2FIX().
 */
static VALUE
rb_reg_hash(VALUE re)
{
    /* Kept in a local so the conversion sees a plain value rather than a
     * function-call expression. */
    st_index_t digest = reg_hash(re);

    return ST2FIX(digest);
}

#initialize_copy(re) ⇒ Object

:nodoc:



3773
3774
3775
3776
3777
3778
3779
# File 're.c', line 3773

/*
 * Implements Regexp#initialize_copy.
 *
 * When OBJ_INIT_COPY(copy, re) evaluates false, copy is returned untouched.
 * Otherwise re is checked and copy is re-initialised from re's source
 * string and options; the result of rb_reg_init_str() is returned.
 */
static VALUE
rb_reg_init_copy(VALUE copy, VALUE re)
{
    if (OBJ_INIT_COPY(copy, re)) {
        rb_reg_check(re);
        return rb_reg_init_str(copy, RREGEXP_SRC(re), rb_reg_options(re));
    }
    return copy;
}

#inspect ⇒ String

Produces a nicely formatted string version of rxp. Perhaps surprisingly, #inspect actually produces a more natural version of the string than #to_s.

/ab+c/ix.inspect        #=> "/ab+c/ix"

Returns:



512
513
514
515
516
517
518
519
# File 're.c', line 512

/*
 * Implements Regexp#inspect.
 *
 * When the regexp has a compiled pattern, a source string, and a source
 * pointer, the description comes from rb_reg_desc(). If any of those is
 * missing, the result of rb_any_to_s() is returned instead.
 */
static VALUE
rb_reg_inspect(VALUE re)
{
    if (RREGEXP_PTR(re) && RREGEXP_SRC(re) && RREGEXP_SRC_PTR(re)) {
        return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
    }
    return rb_any_to_s(re);
}

#match(str) ⇒ MatchData? #match(str, pos) ⇒ MatchData?

Returns a MatchData object describing the match, or nil if there was no match. This is equivalent to retrieving the value of the special variable $~ following a normal match. If the second parameter is present, it specifies the position in the string to begin the search.

/(.)(.)(.)/.match("abc")[2]   #=> "b"
/(.)(.)/.match("abc", 1)[2]   #=> "c"

If a block is given, the block is invoked with the MatchData if the match succeeds, so that you can write

/M(.*)/.match("Matz") do |m|
  puts m[0]
  puts m[1]
end

instead of

if m = /M(.*)/.match("Matz")
  puts m[0]
  puts m[1]
end

The return value is a value from block execution in this case.

Overloads:



3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
# File 're.c', line 3296

/*
 * Implements Regexp#match.
 *
 * Takes a required string and an optional start position (default 0).
 * On failure (reg_match_pos() < 0) the backref is cleared and nil is
 * returned. On success the current backref is fetched and marked busy via
 * rb_match_busy(); it is then yielded to the block when one is given (the
 * block's value becomes the result) or returned directly.
 */
static VALUE
rb_reg_match_m(int argc, VALUE *argv, VALUE re)
{
    VALUE str, initpos, md;
    long start = 0;

    if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
        start = NUM2LONG(initpos);
    }

    if (reg_match_pos(re, &str, start) < 0) {
        rb_backref_set(Qnil);
        return Qnil;
    }

    md = rb_backref_get();
    rb_match_busy(md);
    if (NIL_P(md) || !rb_block_given_p()) {
        return md;
    }
    return rb_yield(md);
}

#match?(str) ⇒ Boolean #match?(str, pos) ⇒ Boolean

Returns true or false to indicate whether the regexp is matched or not, without updating $~ and other related variables. If the second parameter is present, it specifies the position in the string to begin the search.

/R.../.match?("Ruby")    #=> true
/R.../.match?("Ruby", 1) #=> false
/P.../.match?("Ruby")    #=> false
$&                       #=> nil

Overloads:

  • #match?(str) ⇒ Boolean

    Returns:

    • (Boolean)
  • #match?(str, pos) ⇒ Boolean

    Returns:

    • (Boolean)


3338
3339
3340
3341
3342
3343
# File 're.c', line 3338

/*
 * Implements Regexp#match?.
 *
 * Accepts one or two arguments: the string, and an optional start position
 * that defaults to 0. The answer comes from rb_reg_match_p().
 */
static VALUE
rb_reg_match_m_p(int argc, VALUE *argv, VALUE re)
{
    long pos = 0;

    if (rb_check_arity(argc, 1, 2) > 1) {
        pos = NUM2LONG(argv[1]);
    }
    return rb_reg_match_p(re, argv[0], pos);
}

#named_captures ⇒ Hash

Returns a hash representing information about named captures of rxp.

Each key of the hash is the name of a named capture. Each value is an array listing the indexes of the corresponding named captures.

/(?<foo>.)(?<bar>.)/.named_captures
#=> {"foo"=>[1], "bar"=>[2]}

/(?<foo>.)(?<foo>.)/.named_captures
#=> {"foo"=>[1, 2]}

If there are no named captures, an empty hash is returned.

/(.)(.)/.named_captures
#=> {}

Returns:



832
833
834
835
836
837
838
839
# File 're.c', line 832

/*
 * Implements Regexp#named_captures.
 *
 * Checks re, creates a hash sized by the number of names in the compiled
 * pattern, and fills it by running reg_named_captures_iter over every name
 * with the hash passed as the callback argument.
 */
static VALUE
rb_reg_named_captures(VALUE re)
{
    regex_t *reg;
    VALUE hash;

    rb_reg_check(re);
    reg = RREGEXP_PTR(re);

    hash = rb_hash_new_with_size(onig_number_of_names(reg));
    onig_foreach_name(reg, reg_named_captures_iter, (void *)hash);
    return hash;
}

#names ⇒ Array

Returns a list of names of captures as an array of strings.

/(?<foo>.)(?<bar>.)(?<baz>.)/.names
#=> ["foo", "bar", "baz"]

/(?<foo>.)(?<foo>.)/.names
#=> ["foo"]

/(.)(.)/.names
#=> []

Returns:



784
785
786
787
788
789
790
791
792
# File 're.c', line 784

/*
 * Implements Regexp#names.
 *
 * Checks re, allocates an array with capacity for the number of names in
 * the compiled pattern, and fills it by running reg_names_iter over every
 * name with the array passed as the callback argument.
 */
static VALUE
rb_reg_names(VALUE re)
{
    VALUE names;

    rb_reg_check(re);

    names = rb_ary_new_capa(onig_number_of_names(RREGEXP_PTR(re)));
    onig_foreach_name(RREGEXP_PTR(re), reg_names_iter, (void *)names);
    return names;
}

#options ⇒ Integer

Returns the set of bits corresponding to the options used when creating this Regexp (see Regexp::new for details). Note that additional bits may be set in the returned options: these are used internally by the regular expression code. These extra bits are ignored if the options are passed to Regexp::new.

Regexp::IGNORECASE                  #=> 1
Regexp::EXTENDED                    #=> 2
Regexp::MULTILINE                   #=> 4

/cat/.options                       #=> 0
/cat/ix.options                     #=> 3
Regexp.new('cat', true).options     #=> 1
/\xa1\xa2/e.options                 #=> 16

r = /cat/ix
Regexp.new(r.source, r.options)     #=> /cat/ix

Returns:



752
753
754
755
756
757
# File 're.c', line 752

/*
 * Implements Regexp#options.
 *
 * Reads the option bits with rb_reg_options(re) and returns them as a
 * Ruby Integer.
 */
static VALUE
rb_reg_options_m(VALUE re)
{
    const int bits = rb_reg_options(re);

    return INT2NUM(bits);
}

#source ⇒ String

Returns the original string of the pattern.

/ab+c/ix.source #=> "ab+c"

Note that escape sequences are retained as is.

/\x20\+/.source  #=> "\\x20\\+"

Returns:



490
491
492
493
494
495
496
497
498
# File 're.c', line 490

/*
 * Implements Regexp#source.
 *
 * Checks re and returns a duplicate (rb_str_dup) of its source string, so
 * the caller does not receive the regexp's own source object.
 */
static VALUE
rb_reg_source(VALUE re)
{
    rb_reg_check(re);
    return rb_str_dup(RREGEXP_SRC(re));
}

#to_s ⇒ String

Returns a string containing the regular expression and its options (using the (?opts:source) notation). This string can be fed back in to Regexp::new to produce a regular expression with the same semantics as the original. (However, Regexp#== may not return true when comparing the two, as the source of the regular expression itself may differ, as the example shows.) Regexp#inspect produces a generally more readable version of rxp.

r1 = /ab+c/ix           #=> /ab+c/ix
s1 = r1.to_s            #=> "(?ix-m:ab+c)"
r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
r1 == r2                #=> false
r1.source               #=> "ab+c"
r2.source               #=> "(?ix-m:ab+c)"

Returns:



543
544
545
546
547
# File 're.c', line 543

/*
 * Implements Regexp#to_s.
 *
 * Delegates to rb_reg_str_with_term() with '/' as the terminator argument
 * and returns its result.
 */
static VALUE
rb_reg_to_s(VALUE re)
{
    VALUE text = rb_reg_str_with_term(re, '/');

    return text;
}

#~(rxp) ⇒ Integer?

Match—Matches rxp against the contents of $_. Equivalent to rxp =~ $_.

$_ = "input data"
~ /at/   #=> 7

Returns:



3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
# File 're.c', line 3244

/*
 * Implements Regexp#~ (match against the last read line, $_).
 *
 * If the last line is not a String, the backref is cleared and nil is
 * returned. Otherwise the regexp is searched from position 0; a negative
 * result yields nil, and a successful position is passed through
 * rb_str_sublen() before being returned as a Fixnum -- presumably turning
 * a byte offset into a character index; confirm against rb_str_sublen().
 */
VALUE
rb_reg_match2(VALUE re)
{
    VALUE line = rb_lastline_get();
    long pos;

    if (RB_TYPE_P(line, T_STRING)) {
        pos = rb_reg_search(re, line, 0, 0);
        if (pos < 0) {
            return Qnil;
        }
        pos = rb_str_sublen(line, pos);
        return LONG2FIX(pos);
    }

    rb_backref_set(Qnil);
    return Qnil;
}