Ruby 2.7.7p221 (2022-11-24 revision 168ec2b1e5ad0e4688e963d9de019557c78feed9)
re.c
Go to the documentation of this file.
1/**********************************************************************
2
3 re.c -
4
5 $Author$
6 created at: Mon Aug 9 18:24:49 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9
10**********************************************************************/
11
12#include "ruby/encoding.h"
13#include "ruby/re.h"
14#include "ruby/util.h"
15#include "internal.h"
16#include "regint.h"
17#include "encindex.h"
18#include <ctype.h>
19
21
23#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
24
25#define BEG(no) (regs->beg[(no)])
26#define END(no) (regs->end[(no)])
27
#if 'a' == 97   /* it's ascii */
/*
 * Byte-wise case-folding table: entry i is i itself, except that the
 * ASCII upper-case letters 0x41..0x5A map to their lower-case
 * counterparts 0x61..0x7A.
 */
static const char casetable[] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    /* space and punctuation */
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    /* digits and punctuation */
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    /* '@', then 'A'..'Z' folded to 'a'..'z' */
    '\x40', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    /* '`', then 'a'..'z' unchanged */
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    /* high half: identity */
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif

/*
 * Compares the first +len+ bytes of +x+ and +y+, folding ASCII letters
 * through casetable first.  Returns 0 when the ranges are equal under
 * that folding; otherwise returns the difference of the folded table
 * entries at the first mismatching position.
 */
int
rb_memcicmp(const void *x, const void *y, long len)
{
    const unsigned char *lhs = x;
    const unsigned char *rhs = y;

    for (; len != 0; --len, ++lhs, ++rhs) {
        int diff = casetable[(unsigned)*lhs] - casetable[(unsigned)*rhs];

        if (diff != 0)
            return diff;
    }
    return 0;
}
91
92#ifdef HAVE_MEMMEM
/*
 * Short-pattern search backed by the C library's memmem().
 * Returns the byte offset of the first occurrence of xs[0..m) inside
 * ys[0..n), or -1 when there is none.
 */
static inline long
rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *hit = memmem(ys, n, xs, m);

    if (hit == NULL)
        return -1;
    return hit - ys;
}
103#else
104static inline long
105rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
106{
107 const unsigned char *x = xs, *xe = xs + m;
108 const unsigned char *y = ys, *ye = ys + n;
109#define VALUE_MAX ((VALUE)~(VALUE)0)
110 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
111
112 if (m > SIZEOF_VALUE)
113 rb_bug("!!too long pattern string!!");
114
115 if (!(y = memchr(y, *x, n - m + 1)))
116 return -1;
117
118 /* Prepare hash value */
119 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
120 hx <<= CHAR_BIT;
121 hy <<= CHAR_BIT;
122 hx |= *x;
123 hy |= *y;
124 }
125 /* Searching */
126 while (hx != hy) {
127 if (y == ye)
128 return -1;
129 hy <<= CHAR_BIT;
130 hy |= *y;
131 hy &= mask;
132 y++;
133 }
134 return y - ys - m;
135}
136#endif
137
/*
 * Quick Search (Sunday) substring search over raw bytes.
 *
 * Returns the byte offset of the first occurrence of xs[0..m) inside
 * ys[0..n), or -1 when there is none.
 *
 * The shift for each step is looked up from the byte just past the
 * current window (y[m]).  On the last possible window that byte is
 * ys[n], which lies outside the haystack, so the loop stops there
 * instead of reading it; the result is unchanged because any shift
 * (always >= 1) would have ended the search anyway.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys;
    long shift[256];
    int i;

    /* Preprocessing: bytes absent from the pattern skip the whole window. */
    for (i = 0; i < 256; ++i)
        shift[i] = m + 1;
    for (; x < xe; ++x)
        shift[*x] = xe - x;

    /* Searching */
    while (y + m <= ys + n) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (y + m == ys + n)
            break;              /* no byte follows the final window */
        y += shift[y[m]];
    }
    return -1;
}
157
/*
 * Maps the byte sequence starting at +x+ to a slot in the 512-entry
 * shift table used by rb_memsearch_qs_utf8().
 *
 * Lead bytes below 0xC0 or at/above 0xF5 map to 256 + byte.  Lead bytes
 * in 0xC0..0xF4 are combined with the following 1, 2 or 3 bytes
 * (multiply by 8353, add next byte) and the result is reduced to its
 * low 8 bits.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = *x;
    int trailing, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    if (h < 0xE0)
        trailing = 1;
    else if (h < 0xF0)
        trailing = 2;
    else
        trailing = 3;

    for (k = 1; k <= trailing; ++k)
        h = h * mix + x[k];

    return (unsigned char)h;
}
189
/*
 * Quick Search variant whose shift table is keyed by
 * rb_memsearch_qs_utf8_hash() rather than by a single byte.
 *
 * Returns the byte offset of the first occurrence of xs[0..m) inside
 * ys[0..n), or -1 when there is none.
 *
 * The shift is derived from the bytes just past the current window.  On
 * the last possible window that position is ys + n, outside the
 * haystack, so the loop stops there instead of hashing it; the result
 * is unchanged because any shift (always >= 1) would have ended the
 * search anyway.
 * NOTE(review): for earlier windows the hash may still look at up to
 * three bytes beyond ys + n when the haystack ends in a truncated
 * multibyte sequence; that relies on the caller's buffer padding.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const unsigned char *y = ys;
    long shift[512];
    int i;

    /* Preprocessing */
    for (i = 0; i < 512; ++i) {
        shift[i] = m + 1;
    }
    for (; x < xe; ++x) {
        shift[rb_memsearch_qs_utf8_hash(x)] = xe - x;
    }

    /* Searching */
    while (y + m <= ys + n) {
        if (*xs == *y && memcmp(xs, y, m) == 0)
            return y - ys;
        if (y + m == ys + n)
            break;              /* nothing follows the final window */
        y += shift[rb_memsearch_qs_utf8_hash(y + m)];
    }
    return -1;
}
211
/*
 * Substring search for encodings with 2-byte code units: candidate
 * positions are tried only at even byte offsets.
 * Returns the byte offset of the first match of xs[0..m) in ys[0..n),
 * or -1 when there is none.
 */
static inline long
rb_memsearch_wchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum {unit = 2};
    const unsigned char first = *xs;
    const long last = n - m;    /* greatest offset where a match still fits */
    long off;

    for (off = 0; off <= last; off += unit) {
        const unsigned char *cand = ys + off;

        if (first == *cand && memcmp(xs + 1, cand + 1, m - 1) == 0)
            return off;
    }
    return -1;
}
224
/*
 * Substring search for encodings with 4-byte code units: candidate
 * positions are tried only at byte offsets that are multiples of four.
 * Returns the byte offset of the first match of xs[0..m) in ys[0..n),
 * or -1 when there is none.
 */
static inline long
rb_memsearch_qchar(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    enum {unit = 4};
    const unsigned char first = *xs;
    const long last = n - m;    /* greatest offset where a match still fits */
    long off;

    for (off = 0; off <= last; off += unit) {
        const unsigned char *cand = ys + off;

        if (first == *cand && memcmp(xs + 1, cand + 1, m - 1) == 0)
            return off;
    }
    return -1;
}
237
238long
239rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
240{
241 const unsigned char *x = x0, *y = y0;
242
243 if (m > n) return -1;
244 else if (m == n) {
245 return memcmp(x0, y0, m) == 0 ? 0 : -1;
246 }
247 else if (m < 1) {
248 return 0;
249 }
250 else if (m == 1) {
251 const unsigned char *ys = memchr(y, *x, n);
252
253 if (ys)
254 return ys - y;
255 else
256 return -1;
257 }
258 else if (LIKELY(rb_enc_mbminlen(enc) == 1)) {
259 if (m <= SIZEOF_VALUE) {
260 return rb_memsearch_ss(x0, m, y0, n);
261 }
262 else if (enc == rb_utf8_encoding()){
263 return rb_memsearch_qs_utf8(x0, m, y0, n);
264 }
265 }
266 else if (LIKELY(rb_enc_mbminlen(enc) == 2)) {
267 return rb_memsearch_wchar(x0, m, y0, n);
268 }
269 else if (LIKELY(rb_enc_mbminlen(enc) == 4)) {
270 return rb_memsearch_qchar(x0, m, y0, n);
271 }
272 return rb_memsearch_qs(x0, m, y0, n);
273}
274
275#define REG_LITERAL FL_USER5
276#define REG_ENCODING_NONE FL_USER6
277
278#define KCODE_FIXED FL_USER4
279
280#define ARG_REG_OPTION_MASK \
281 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
282#define ARG_ENCODING_FIXED 16
283#define ARG_ENCODING_NONE 32
284
285static int
286char_to_option(int c)
287{
288 int val;
289
290 switch (c) {
291 case 'i':
293 break;
294 case 'x':
295 val = ONIG_OPTION_EXTEND;
296 break;
297 case 'm':
299 break;
300 default:
301 val = 0;
302 break;
303 }
304 return val;
305}
306
307static char *
308option_to_str(char str[4], int options)
309{
310 char *p = str;
311 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
312 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
313 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
314 *p = 0;
315 return str;
316}
317
318extern int
319rb_char_to_option_kcode(int c, int *option, int *kcode)
320{
321 *option = 0;
322
323 switch (c) {
324 case 'n':
325 *kcode = rb_ascii8bit_encindex();
326 return (*option = ARG_ENCODING_NONE);
327 case 'e':
328 *kcode = ENCINDEX_EUC_JP;
329 break;
330 case 's':
331 *kcode = ENCINDEX_Windows_31J;
332 break;
333 case 'u':
334 *kcode = rb_utf8_encindex();
335 break;
336 default:
337 *kcode = -1;
338 return (*option = char_to_option(c));
339 }
340 *option = ARG_ENCODING_FIXED;
341 return 1;
342}
343
344static void
345rb_reg_check(VALUE re)
346{
347 if (!RREGEXP_PTR(re) || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
348 rb_raise(rb_eTypeError, "uninitialized Regexp");
349 }
350}
351
352static void
353rb_reg_expr_str(VALUE str, const char *s, long len,
354 rb_encoding *enc, rb_encoding *resenc, int term)
355{
356 const char *p, *pend;
357 int cr = ENC_CODERANGE_UNKNOWN;
358 int need_escape = 0;
359 int c, clen;
360
361 p = s; pend = p + len;
362 rb_str_coderange_scan_restartable(p, pend, enc, &cr);
364 while (p < pend) {
365 c = rb_enc_ascget(p, pend, &clen, enc);
366 if (c == -1) {
367 if (enc == resenc) {
368 p += mbclen(p, pend, enc);
369 }
370 else {
371 need_escape = 1;
372 break;
373 }
374 }
375 else if (c != term && rb_enc_isprint(c, enc)) {
376 p += clen;
377 }
378 else {
379 need_escape = 1;
380 break;
381 }
382 }
383 }
384 else {
385 need_escape = 1;
386 }
387
388 if (!need_escape) {
390 }
391 else {
392 int unicode_p = rb_enc_unicode_p(enc);
393 p = s;
394 while (p<pend) {
395 c = rb_enc_ascget(p, pend, &clen, enc);
396 if (c == '\\' && p+clen < pend) {
397 int n = clen + mbclen(p+clen, pend, enc);
398 rb_str_buf_cat(str, p, n);
399 p += n;
400 continue;
401 }
402 else if (c == -1) {
403 clen = rb_enc_precise_mbclen(p, pend, enc);
404 if (!MBCLEN_CHARFOUND_P(clen)) {
405 c = (unsigned char)*p;
406 clen = 1;
407 goto hex;
408 }
409 if (resenc) {
410 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
411 rb_str_buf_cat_escaped_char(str, c, unicode_p);
412 }
413 else {
414 clen = MBCLEN_CHARFOUND_LEN(clen);
415 rb_str_buf_cat(str, p, clen);
416 }
417 }
418 else if (c == term) {
419 char c = '\\';
420 rb_str_buf_cat(str, &c, 1);
421 rb_str_buf_cat(str, p, clen);
422 }
423 else if (rb_enc_isprint(c, enc)) {
424 rb_str_buf_cat(str, p, clen);
425 }
426 else if (!rb_enc_isspace(c, enc)) {
427 char b[8];
428
429 hex:
430 snprintf(b, sizeof(b), "\\x%02X", c);
431 rb_str_buf_cat(str, b, 4);
432 }
433 else {
434 rb_str_buf_cat(str, p, clen);
435 }
436 p += clen;
437 }
438 }
439}
440
441static VALUE
442rb_reg_desc(const char *s, long len, VALUE re)
443{
444 rb_encoding *enc = rb_enc_get(re);
447 if (resenc == NULL) resenc = rb_default_external_encoding();
448
449 if (re && rb_enc_asciicompat(enc)) {
450 rb_enc_copy(str, re);
451 }
452 else {
454 }
455 rb_reg_expr_str(str, s, len, enc, resenc, '/');
456 rb_str_buf_cat2(str, "/");
457 if (re) {
458 char opts[4];
459 rb_reg_check(re);
460 if (*option_to_str(opts, RREGEXP_PTR(re)->options))
461 rb_str_buf_cat2(str, opts);
462 if (RBASIC(re)->flags & REG_ENCODING_NONE)
463 rb_str_buf_cat2(str, "n");
464 }
465 return str;
466}
467
468
469/*
470 * call-seq:
471 * rxp.source -> str
472 *
473 * Returns the original string of the pattern.
474 *
475 * /ab+c/ix.source #=> "ab+c"
476 *
477 * Note that escape sequences are retained as is.
478 *
479 * /\x20\+/.source #=> "\\x20\\+"
480 *
481 */
482
483static VALUE
484rb_reg_source(VALUE re)
485{
486 VALUE str;
487
488 rb_reg_check(re);
490 return str;
491}
492
493/*
494 * call-seq:
495 * rxp.inspect -> string
496 *
497 * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
498 * <code>#inspect</code> actually produces the more natural version of
499 * the string than <code>#to_s</code>.
500 *
501 * /ab+c/ix.inspect #=> "/ab+c/ix"
502 *
503 */
504
505static VALUE
506rb_reg_inspect(VALUE re)
507{
508 if (!RREGEXP_PTR(re) || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
509 return rb_any_to_s(re);
510 }
511 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
512}
513
514static VALUE rb_reg_str_with_term(VALUE re, int term);
515
516/*
517 * call-seq:
518 * rxp.to_s -> str
519 *
520 * Returns a string containing the regular expression and its options (using the
521 * <code>(?opts:source)</code> notation. This string can be fed back in to
522 * Regexp::new to a regular expression with the same semantics as the
523 * original. (However, <code>Regexp#==</code> may not return true
524 * when comparing the two, as the source of the regular expression
525 * itself may differ, as the example shows). Regexp#inspect produces
526 * a generally more readable version of <i>rxp</i>.
527 *
528 * r1 = /ab+c/ix #=> /ab+c/ix
529 * s1 = r1.to_s #=> "(?ix-m:ab+c)"
530 * r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
531 * r1 == r2 #=> false
532 * r1.source #=> "ab+c"
533 * r2.source #=> "(?ix-m:ab+c)"
534 */
535
536static VALUE
537rb_reg_to_s(VALUE re)
538{
539 return rb_reg_str_with_term(re, '/');
540}
541
542static VALUE
543rb_reg_str_with_term(VALUE re, int term)
544{
545 int options, opt;
547 long len;
548 const UChar* ptr;
549 VALUE str = rb_str_buf_new2("(?");
550 char optbuf[5];
551 rb_encoding *enc = rb_enc_get(re);
552
553 rb_reg_check(re);
554
555 rb_enc_copy(str, re);
556 options = RREGEXP_PTR(re)->options;
557 ptr = (UChar*)RREGEXP_SRC_PTR(re);
558 len = RREGEXP_SRC_LEN(re);
559 again:
560 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
561 int err = 1;
562 ptr += 2;
563 if ((len -= 2) > 0) {
564 do {
565 opt = char_to_option((int )*ptr);
566 if (opt != 0) {
567 options |= opt;
568 }
569 else {
570 break;
571 }
572 ++ptr;
573 } while (--len > 0);
574 }
575 if (len > 1 && *ptr == '-') {
576 ++ptr;
577 --len;
578 do {
579 opt = char_to_option((int )*ptr);
580 if (opt != 0) {
581 options &= ~opt;
582 }
583 else {
584 break;
585 }
586 ++ptr;
587 } while (--len > 0);
588 }
589 if (*ptr == ')') {
590 --len;
591 ++ptr;
592 goto again;
593 }
594 if (*ptr == ':' && ptr[len-1] == ')') {
595 Regexp *rp;
598
599 ++ptr;
600 len -= 2;
601 err = onig_new(&rp, ptr, ptr + len, options,
602 enc, OnigDefaultSyntax, NULL);
603 onig_free(rp);
605 }
606 if (err) {
607 options = RREGEXP_PTR(re)->options;
608 ptr = (UChar*)RREGEXP_SRC_PTR(re);
609 len = RREGEXP_SRC_LEN(re);
610 }
611 }
612
613 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
614
615 if ((options & embeddable) != embeddable) {
616 optbuf[0] = '-';
617 option_to_str(optbuf + 1, ~options);
618 rb_str_buf_cat2(str, optbuf);
619 }
620
621 rb_str_buf_cat2(str, ":");
622 if (rb_enc_asciicompat(enc)) {
623 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL, term);
624 rb_str_buf_cat2(str, ")");
625 }
626 else {
627 const char *s, *e;
628 char *paren;
629 ptrdiff_t n;
630 rb_str_buf_cat2(str, ")");
633
634 /* backup encoded ")" to paren */
635 s = RSTRING_PTR(str);
636 e = RSTRING_END(str);
637 s = rb_enc_left_char_head(s, e-1, e, enc);
638 n = e - s;
639 paren = ALLOCA_N(char, n);
640 memcpy(paren, s, n);
642
643 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL, term);
644 rb_str_buf_cat(str, paren, n);
645 }
646 rb_enc_copy(str, re);
647
648 return str;
649}
650
651NORETURN(static void rb_reg_raise(const char *s, long len, const char *err, VALUE re));
652
653static void
654rb_reg_raise(const char *s, long len, const char *err, VALUE re)
655{
656 VALUE desc = rb_reg_desc(s, len, re);
657
658 rb_raise(rb_eRegexpError, "%s: %"PRIsVALUE, err, desc);
659}
660
661static VALUE
662rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
663{
664 char opts[6];
665 VALUE desc = rb_str_buf_new2(err);
667 if (resenc == NULL) resenc = rb_default_external_encoding();
668
669 rb_enc_associate(desc, enc);
670 rb_str_buf_cat2(desc, ": /");
671 rb_reg_expr_str(desc, s, len, enc, resenc, '/');
672 opts[0] = '/';
673 option_to_str(opts + 1, options);
674 rb_str_buf_cat2(desc, opts);
675 return rb_exc_new3(rb_eRegexpError, desc);
676}
677
678NORETURN(static void rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err));
679
680static void
681rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
682{
683 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
684}
685
686static VALUE
687rb_reg_error_desc(VALUE str, int options, const char *err)
688{
689 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
690 rb_enc_get(str), options, err);
691}
692
693NORETURN(static void rb_reg_raise_str(VALUE str, int options, const char *err));
694
695static void
696rb_reg_raise_str(VALUE str, int options, const char *err)
697{
698 rb_exc_raise(rb_reg_error_desc(str, options, err));
699}
700
701
702/*
703 * call-seq:
704 * rxp.casefold? -> true or false
705 *
706 * Returns the value of the case-insensitive flag.
707 *
708 * /a/.casefold? #=> false
709 * /a/i.casefold? #=> true
710 * /(?i:a)/.casefold? #=> false
711 */
712
713static VALUE
714rb_reg_casefold_p(VALUE re)
715{
716 rb_reg_check(re);
717 if (RREGEXP_PTR(re)->options & ONIG_OPTION_IGNORECASE) return Qtrue;
718 return Qfalse;
719}
720
721
722/*
723 * call-seq:
724 * rxp.options -> integer
725 *
726 * Returns the set of bits corresponding to the options used when
727 * creating this Regexp (see Regexp::new for details. Note that
728 * additional bits may be set in the returned options: these are used
729 * internally by the regular expression code. These extra bits are
730 * ignored if the options are passed to Regexp::new.
731 *
732 * Regexp::IGNORECASE #=> 1
733 * Regexp::EXTENDED #=> 2
734 * Regexp::MULTILINE #=> 4
735 *
736 * /cat/.options #=> 0
737 * /cat/ix.options #=> 3
738 * Regexp.new('cat', true).options #=> 1
739 * /\xa1\xa2/e.options #=> 16
740 *
741 * r = /cat/ix
742 * Regexp.new(r.source, r.options) #=> /cat/ix
743 */
744
745static VALUE
746rb_reg_options_m(VALUE re)
747{
748 int options = rb_reg_options(re);
749 return INT2NUM(options);
750}
751
752static int
753reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
754 int back_num, int *back_refs, OnigRegex regex, void *arg)
755{
756 VALUE ary = (VALUE)arg;
757 rb_ary_push(ary, rb_enc_str_new((const char *)name, name_end-name, regex->enc));
758 return 0;
759}
760
761/*
762 * call-seq:
763 * rxp.names -> [name1, name2, ...]
764 *
765 * Returns a list of names of captures as an array of strings.
766 *
767 * /(?<foo>.)(?<bar>.)(?<baz>.)/.names
768 * #=> ["foo", "bar", "baz"]
769 *
770 * /(?<foo>.)(?<foo>.)/.names
771 * #=> ["foo"]
772 *
773 * /(.)(.)/.names
774 * #=> []
775 */
776
777static VALUE
778rb_reg_names(VALUE re)
779{
780 VALUE ary;
781 rb_reg_check(re);
783 onig_foreach_name(RREGEXP_PTR(re), reg_names_iter, (void*)ary);
784 return ary;
785}
786
787static int
788reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
789 int back_num, int *back_refs, OnigRegex regex, void *arg)
790{
791 VALUE hash = (VALUE)arg;
792 VALUE ary = rb_ary_new2(back_num);
793 int i;
794
795 for (i = 0; i < back_num; i++)
796 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
797
798 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
799
800 return 0;
801}
802
803/*
804 * call-seq:
805 * rxp.named_captures -> hash
806 *
807 * Returns a hash representing information about named captures of <i>rxp</i>.
808 *
809 * A key of the hash is a name of the named captures.
810 * A value of the hash is an array which is list of indexes of corresponding
811 * named captures.
812 *
813 * /(?<foo>.)(?<bar>.)/.named_captures
814 * #=> {"foo"=>[1], "bar"=>[2]}
815 *
816 * /(?<foo>.)(?<foo>.)/.named_captures
817 * #=> {"foo"=>[1, 2]}
818 *
819 * If there are no named captures, an empty hash is returned.
820 *
821 * /(.)(.)/.named_captures
822 * #=> {}
823 */
824
825static VALUE
826rb_reg_named_captures(VALUE re)
827{
828 regex_t *reg = (rb_reg_check(re), RREGEXP_PTR(re));
830 onig_foreach_name(reg, reg_named_captures_iter, (void*)hash);
831 return hash;
832}
833
834static int
835onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
836 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
837 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
838{
839 int r;
840
841 *reg = (regex_t* )malloc(sizeof(regex_t));
842 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
843
844 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
845 if (r) goto err;
846
847 r = onig_compile_ruby(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
848 if (r) {
849 err:
850 onig_free(*reg);
851 *reg = NULL;
852 }
853 return r;
854}
855
856static Regexp*
857make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
858 const char *sourcefile, int sourceline)
859{
860 Regexp *rp;
861 int r;
862 OnigErrorInfo einfo;
863
864 /* Handle escaped characters first. */
865
866 /* Build a copy of the string (in dest) with the
867 escaped characters translated, and generate the regex
868 from that.
869 */
870
871 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
872 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
873 if (r) {
874 onig_error_code_to_str((UChar*)err, r, &einfo);
875 return 0;
876 }
877 return rp;
878}
879
880
881/*
882 * Document-class: MatchData
883 *
884 * MatchData encapsulates the result of matching a Regexp against
885 * string. It is returned by Regexp#match and String#match, and also
886 * stored in a global variable returned by Regexp.last_match.
887 *
888 * Usage:
889 *
890 * url = 'https://docs.ruby-lang.org/en/2.5.0/MatchData.html'
891 * m = url.match(/(\d\.?)+/) # => #<MatchData "2.5.0" 1:"0">
892 * m.string # => "https://docs.ruby-lang.org/en/2.5.0/MatchData.html"
893 * m.regexp # => /(\d\.?)+/
894 * # entire matched substring:
895 * m[0] # => "2.5.0"
896 *
897 * # Working with unnamed captures
898 * m = url.match(%r{([^/]+)/([^/]+)\.html$})
899 * m.captures # => ["2.5.0", "MatchData"]
900 * m[1] # => "2.5.0"
901 * m.values_at(1, 2) # => ["2.5.0", "MatchData"]
902 *
903 * # Working with named captures
904 * m = url.match(%r{(?<version>[^/]+)/(?<module>[^/]+)\.html$})
905 * m.captures # => ["2.5.0", "MatchData"]
906 * m.named_captures # => {"version"=>"2.5.0", "module"=>"MatchData"}
907 * m[:version] # => "2.5.0"
908 * m.values_at(:version, :module)
909 * # => ["2.5.0", "MatchData"]
910 * # Numeric indexes work, too
911 * m[1] # => "2.5.0"
912 * m.values_at(1, 2) # => ["2.5.0", "MatchData"]
913 *
914 * == Global variables equivalence
915 *
916 * Parts of last MatchData (returned by Regexp.last_match) are also
917 * aliased as global variables:
918 *
919 * * <code>$~</code> is Regexp.last_match;
920 * * <code>$&</code> is Regexp.last_match<code>[0]</code>;
921 * * <code>$1</code>, <code>$2</code>, and so on are
922 * Regexp.last_match<code>[i]</code> (captures by number);
923 * * <code>$`</code> is Regexp.last_match<code>.pre_match</code>;
924 * * <code>$'</code> is Regexp.last_match<code>.post_match</code>;
925 * * <code>$+</code> is Regexp.last_match<code>[-1]</code> (the last capture).
926 *
927 * See also "Special global variables" section in Regexp documentation.
928 */
929
931
932static VALUE
933match_alloc(VALUE klass)
934{
935 NEWOBJ_OF(match, struct RMatch, klass, T_MATCH);
936
937 match->str = 0;
938 match->rmatch = 0;
939 match->regexp = 0;
940 match->rmatch = ZALLOC(struct rmatch);
941
942 return (VALUE)match;
943}
944
945int
946rb_reg_region_copy(struct re_registers *to, const struct re_registers *from)
947{
948 onig_region_copy(to, (OnigRegion *)from);
949 if (to->allocated) return 0;
950 rb_gc();
951 onig_region_copy(to, (OnigRegion *)from);
952 if (to->allocated) return 0;
953 return ONIGERR_MEMORY;
954}
955
/*
 * One byte offset into the matched string together with the character
 * offset it corresponds to (filled in by update_char_offset()).
 */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 * Returns -1, 0 or 1.  The result is computed by comparison rather than
 * subtraction, so it cannot overflow and does not depend on the
 * relative sizes of long and int.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    long lhs = ((const pair_t *)pair1)->byte_pos;
    long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
971
972static void
973update_char_offset(VALUE match)
974{
975 struct rmatch *rm = RMATCH(match)->rmatch;
976 struct re_registers *regs;
977 int i, num_regs, num_pos;
978 long c;
979 char *s, *p, *q;
980 rb_encoding *enc;
981 pair_t *pairs;
982
984 return;
985
986 regs = &rm->regs;
987 num_regs = rm->regs.num_regs;
988
992 }
993
994 enc = rb_enc_get(RMATCH(match)->str);
995 if (rb_enc_mbmaxlen(enc) == 1) {
996 for (i = 0; i < num_regs; i++) {
997 rm->char_offset[i].beg = BEG(i);
998 rm->char_offset[i].end = END(i);
999 }
1000 return;
1001 }
1002
1003 pairs = ALLOCA_N(pair_t, num_regs*2);
1004 num_pos = 0;
1005 for (i = 0; i < num_regs; i++) {
1006 if (BEG(i) < 0)
1007 continue;
1008 pairs[num_pos++].byte_pos = BEG(i);
1009 pairs[num_pos++].byte_pos = END(i);
1010 }
1011 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1012
1013 s = p = RSTRING_PTR(RMATCH(match)->str);
1014 c = 0;
1015 for (i = 0; i < num_pos; i++) {
1016 q = s + pairs[i].byte_pos;
1017 c += rb_enc_strlen(p, q, enc);
1018 pairs[i].char_pos = c;
1019 p = q;
1020 }
1021
1022 for (i = 0; i < num_regs; i++) {
1023 pair_t key, *found;
1024 if (BEG(i) < 0) {
1025 rm->char_offset[i].beg = -1;
1026 rm->char_offset[i].end = -1;
1027 continue;
1028 }
1029
1030 key.byte_pos = BEG(i);
1031 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1032 rm->char_offset[i].beg = found->char_pos;
1033
1034 key.byte_pos = END(i);
1035 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
1036 rm->char_offset[i].end = found->char_pos;
1037 }
1038}
1039
1040static void
1041match_check(VALUE match)
1042{
1043 if (!RMATCH(match)->regexp) {
1044 rb_raise(rb_eTypeError, "uninitialized MatchData");
1045 }
1046}
1047
1048/* :nodoc: */
1049static VALUE
1050match_init_copy(VALUE obj, VALUE orig)
1051{
1052 struct rmatch *rm;
1053
1054 if (!OBJ_INIT_COPY(obj, orig)) return obj;
1055
1056 RMATCH(obj)->str = RMATCH(orig)->str;
1057 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
1058
1059 rm = RMATCH(obj)->rmatch;
1060 if (rb_reg_region_copy(&rm->regs, RMATCH_REGS(orig)))
1061 rb_memerror();
1062
1064 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
1067 }
1069 struct rmatch_offset, rm->regs.num_regs);
1070 RB_GC_GUARD(orig);
1071 }
1072
1073 return obj;
1074}
1075
1076
1077/*
1078 * call-seq:
1079 * mtch.regexp -> regexp
1080 *
1081 * Returns the regexp.
1082 *
1083 * m = /a.*b/.match("abc")
1084 * m.regexp #=> /a.*b/
1085 */
1086
1087static VALUE
1088match_regexp(VALUE match)
1089{
1090 VALUE regexp;
1091 match_check(match);
1092 regexp = RMATCH(match)->regexp;
1093 if (NIL_P(regexp)) {
1094 VALUE str = rb_reg_nth_match(0, match);
1095 regexp = rb_reg_regcomp(rb_reg_quote(str));
1096 RMATCH(match)->regexp = regexp;
1097 }
1098 return regexp;
1099}
1100
1101/*
1102 * call-seq:
1103 * mtch.names -> [name1, name2, ...]
1104 *
1105 * Returns a list of names of captures as an array of strings.
1106 * It is same as mtch.regexp.names.
1107 *
1108 * /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
1109 * #=> ["foo", "bar", "baz"]
1110 *
1111 * m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
1112 * m.names #=> ["x", "y"]
1113 */
1114
1115static VALUE
1116match_names(VALUE match)
1117{
1118 match_check(match);
1119 if (NIL_P(RMATCH(match)->regexp))
1120 return rb_ary_new_capa(0);
1121 return rb_reg_names(RMATCH(match)->regexp);
1122}
1123
1124/*
1125 * call-seq:
1126 * mtch.length -> integer
1127 * mtch.size -> integer
1128 *
1129 * Returns the number of elements in the match array.
1130 *
1131 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1132 * m.length #=> 5
1133 * m.size #=> 5
1134 */
1135
1136static VALUE
1137match_size(VALUE match)
1138{
1139 match_check(match);
1140 return INT2FIX(RMATCH_REGS(match)->num_regs);
1141}
1142
1143static int name_to_backref_number(struct re_registers *, VALUE, const char*, const char*);
1144NORETURN(static void name_to_backref_error(VALUE name));
1145
1146static void
1147name_to_backref_error(VALUE name)
1148{
1149 rb_raise(rb_eIndexError, "undefined group name reference: % "PRIsVALUE,
1150 name);
1151}
1152
1153static int
1154match_backref_number(VALUE match, VALUE backref)
1155{
1156 const char *name;
1157 int num;
1158
1159 struct re_registers *regs = RMATCH_REGS(match);
1160 VALUE regexp = RMATCH(match)->regexp;
1161
1162 match_check(match);
1163 if (SYMBOL_P(backref)) {
1164 backref = rb_sym2str(backref);
1165 }
1166 else if (!RB_TYPE_P(backref, T_STRING)) {
1167 return NUM2INT(backref);
1168 }
1169 name = StringValueCStr(backref);
1170
1171 num = name_to_backref_number(regs, regexp, name, name + RSTRING_LEN(backref));
1172
1173 if (num < 1) {
1174 name_to_backref_error(backref);
1175 }
1176
1177 return num;
1178}
1179
1180int
1182{
1183 return match_backref_number(match, backref);
1184}
1185
1186/*
1187 * call-seq:
1188 * mtch.offset(n) -> array
1189 *
1190 * Returns a two-element array containing the beginning and ending offsets of
1191 * the <em>n</em>th match.
1192 * <em>n</em> can be a string or symbol to reference a named capture.
1193 *
1194 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1195 * m.offset(0) #=> [1, 7]
1196 * m.offset(4) #=> [6, 7]
1197 *
1198 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1199 * p m.offset(:foo) #=> [0, 1]
1200 * p m.offset(:bar) #=> [2, 3]
1201 *
1202 */
1203
1204static VALUE
1205match_offset(VALUE match, VALUE n)
1206{
1207 int i = match_backref_number(match, n);
1208 struct re_registers *regs = RMATCH_REGS(match);
1209
1210 match_check(match);
1211 if (i < 0 || regs->num_regs <= i)
1212 rb_raise(rb_eIndexError, "index %d out of matches", i);
1213
1214 if (BEG(i) < 0)
1215 return rb_assoc_new(Qnil, Qnil);
1216
1217 update_char_offset(match);
1218 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
1219 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
1220}
1221
1222
1223/*
1224 * call-seq:
1225 * mtch.begin(n) -> integer
1226 *
1227 * Returns the offset of the start of the <em>n</em>th element of the match
1228 * array in the string.
1229 * <em>n</em> can be a string or symbol to reference a named capture.
1230 *
1231 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1232 * m.begin(0) #=> 1
1233 * m.begin(2) #=> 2
1234 *
1235 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1236 * p m.begin(:foo) #=> 0
1237 * p m.begin(:bar) #=> 2
1238 */
1239
1240static VALUE
1241match_begin(VALUE match, VALUE n)
1242{
1243 int i = match_backref_number(match, n);
1244 struct re_registers *regs = RMATCH_REGS(match);
1245
1246 match_check(match);
1247 if (i < 0 || regs->num_regs <= i)
1248 rb_raise(rb_eIndexError, "index %d out of matches", i);
1249
1250 if (BEG(i) < 0)
1251 return Qnil;
1252
1253 update_char_offset(match);
1254 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
1255}
1256
1257
1258/*
1259 * call-seq:
1260 * mtch.end(n) -> integer
1261 *
1262 * Returns the offset of the character immediately following the end of the
1263 * <em>n</em>th element of the match array in the string.
1264 * <em>n</em> can be a string or symbol to reference a named capture.
1265 *
1266 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1267 * m.end(0) #=> 7
1268 * m.end(2) #=> 3
1269 *
1270 * m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
1271 * p m.end(:foo) #=> 1
1272 * p m.end(:bar) #=> 3
1273 */
1274
1275static VALUE
1276match_end(VALUE match, VALUE n)
1277{
1278 int i = match_backref_number(match, n);
1279 struct re_registers *regs = RMATCH_REGS(match);
1280
1281 match_check(match);
1282 if (i < 0 || regs->num_regs <= i)
1283 rb_raise(rb_eIndexError, "index %d out of matches", i);
1284
1285 if (BEG(i) < 0)
1286 return Qnil;
1287
1288 update_char_offset(match);
1289 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
1290}
1291
/* Flag bit tested and toggled by the functions below; a match carrying it is
 * not reused as the backref by the code in this file. */
1292#define MATCH_BUSY FL_USER2
1293
/* Sets MATCH_BUSY on `match`.
 * NOTE(review): listing line 1295 (function name and parameter list) is
 * missing from this copy. */
1294void
1296{
1297 FL_SET(match, MATCH_BUSY);
1298}
1299
/* Clears MATCH_BUSY on `match`.
 * NOTE(review): listing line 1301 (function name and parameter list) is
 * missing from this copy. */
1300void
1302{
1303 FL_UNSET(match, MATCH_BUSY);
1304}
1305
/* Returns the number of registers held by `match`, or -1 when `match` is nil
 * or has no register block.
 * NOTE(review): listing line 1307 (function name and parameter list) is
 * missing from this copy. */
1306int
1308{
1309 struct re_registers *regs;
1310 if (NIL_P(match)) return -1;
1311 regs = RMATCH_REGS(match);
1312 if (!regs) return -1;
1313 return regs->num_regs;
1314}
1315
/* Returns non-zero when register `nth` of `match` has a start offset other
 * than -1.  Returns FALSE for a nil match, a missing register block, or an
 * index that is out of range.
 * NOTE(review): listing line 1317 (function name and parameter list) is
 * missing from this copy. */
1316int
1318{
1319 struct re_registers *regs;
1320 if (NIL_P(match)) return FALSE;
1321 regs = RMATCH_REGS(match);
1322 if (!regs) return FALSE;
1323 if (nth >= regs->num_regs) {
1324 return FALSE;
1325 }
    /* Negative indices count back from num_regs; landing on 0 or below is rejected. */
1326 if (nth < 0) {
1327 nth += regs->num_regs;
1328 if (nth <= 0) return FALSE;
1329 }
1330 return (BEG(nth) != -1);
1331}
1332
1333static void
1334match_set_string(VALUE m, VALUE string, long pos, long len)
1335{
1336 struct RMatch *match = (struct RMatch *)m;
1337 struct rmatch *rmatch = match->rmatch;
1338
1339 match->str = string;
1340 match->regexp = Qnil;
1341 int err = onig_region_resize(&rmatch->regs, 1);
1342 if (err) rb_memerror();
1343 rmatch->regs.beg[0] = pos;
1344 rmatch->regs.end[0] = pos + len;
1345}
1346
1347void
1348rb_backref_set_string(VALUE string, long pos, long len)
1349{
1350 VALUE match = rb_backref_get();
1351 if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
1352 match = match_alloc(rb_cMatch);
1353 }
1354 match_set_string(match, string, pos, len);
1355 rb_backref_set(match);
1356}
1357
1358/*
1359 * call-seq:
1360 * rxp.fixed_encoding? -> true or false
1361 *
1362 * Returns false if rxp is applicable to
1363 * a string with any ASCII compatible encoding.
1364 * Returns true otherwise.
1365 *
1366 * r = /a/
1367 * r.fixed_encoding? #=> false
1368 * r =~ "\u{6666} a" #=> 2
1369 * r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2
1370 * r =~ "abc".force_encoding("euc-jp") #=> 0
1371 *
1372 * r = /a/u
1373 * r.fixed_encoding? #=> true
1374 * r.encoding #=> #<Encoding:UTF-8>
1375 * r =~ "\u{6666} a" #=> 2
1376 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
1377 * r =~ "abc".force_encoding("euc-jp") #=> 0
1378 *
1379 * r = /\u{6666}/
1380 * r.fixed_encoding? #=> true
1381 * r.encoding #=> #<Encoding:UTF-8>
1382 * r =~ "\u{6666} a" #=> 0
1383 * r =~ "\xa1\xa2".force_encoding("euc-jp") #=> Encoding::CompatibilityError
1384 * r =~ "abc".force_encoding("euc-jp") #=> nil
1385 */
1386
1387static VALUE
1388rb_reg_fixed_encoding_p(VALUE re)
1389{
1390 if (FL_TEST(re, KCODE_FIXED))
1391 return Qtrue;
1392 else
1393 return Qfalse;
1394}
1395
1396static VALUE
1397rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
1398 rb_encoding **fixed_enc, onig_errmsg_buffer err);
1399
1400NORETURN(static void reg_enc_error(VALUE re, VALUE str));
1401
/* Reports an encoding mismatch between regexp `re` and string `str`; it is
 * declared NORETURN just above.
 * NOTE(review): listing lines 1405, 1407 and 1408 are missing from this copy,
 * so only the message format string of the call is visible. */
1402static void
1403reg_enc_error(VALUE re, VALUE str)
1404{
1406 "incompatible encoding regexp match (%s regexp with %s string)",
1409}
1410
/* Returns the code range recorded on `str`.
 * NOTE(review): listing line 1416, the body of the ENC_CODERANGE_UNKNOWN
 * branch, is missing from this copy. */
1411static inline int
1412str_coderange(VALUE str)
1413{
1414 int cr = ENC_CODERANGE(str);
1415 if (cr == ENC_CODERANGE_UNKNOWN) {
1417 }
1418 return cr;
1419}
1420
/*
 * Picks the encoding to use when matching `re` against `str` and returns it.
 * Calls reg_enc_error() when the two cannot be combined; when `warn` is
 * non-zero it may emit the "historical binary regexp" warning.
 * NOTE(review): listing lines 1428 and 1430 (the call made for a broken
 * code range and its last argument) are missing from this copy.
 */
1421static rb_encoding*
1422rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
1423{
1424 rb_encoding *enc = 0;
1425 int cr = str_coderange(str);
1426
1427 if (cr == ENC_CODERANGE_BROKEN) {
1429 "invalid byte sequence in %s",
1431 }
1432
1433 rb_reg_check(re);
1434 enc = rb_enc_get(str);
    /* Same encoding on both sides: nothing to adjust. */
1435 if (RREGEXP_PTR(re)->enc == enc) {
1436 }
    /* 7-bit string with a US-ASCII regexp: use the regexp's encoding. */
1437 else if (cr == ENC_CODERANGE_7BIT &&
1438 RREGEXP_PTR(re)->enc == rb_usascii_encoding()) {
1439 enc = RREGEXP_PTR(re)->enc;
1440 }
1441 else if (!rb_enc_asciicompat(enc)) {
1442 reg_enc_error(re, str);
1443 }
    /* Fixed-encoding regexp: only a 7-bit string with an ASCII-compatible
     * regexp encoding is accepted, and the regexp's encoding is used. */
1444 else if (rb_reg_fixed_encoding_p(re)) {
1445 if ((!rb_enc_asciicompat(RREGEXP_PTR(re)->enc) ||
1446 cr != ENC_CODERANGE_7BIT)) {
1447 reg_enc_error(re, str);
1448 }
1449 enc = RREGEXP_PTR(re)->enc;
1450 }
1451 else if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
1452 enc != rb_ascii8bit_encoding() &&
1453 cr != ENC_CODERANGE_7BIT) {
1454 rb_warn("historical binary regexp match /.../n against %s string",
1455 rb_enc_name(enc));
1456 }
1457 return enc;
1458}
1459
/*
 * Returns a compiled regex suitable for matching `re` against `str`.  When
 * the encoding chosen by rb_reg_prepare_enc() equals the one the regexp was
 * compiled with, the existing regex is returned; otherwise the source is
 * preprocessed and compiled again with onig_new() for that encoding.
 * Raises when preprocessing or compilation fails.
 * NOTE(review): listing line 1461 (function name and parameter list) is
 * missing from this copy; the body uses `re`, `str` and `err`.
 */
1460regex_t *
1462{
1463 regex_t *reg = RREGEXP_PTR(re);
1464 int r;
1465 OnigErrorInfo einfo;
1466 const char *pattern;
1467 VALUE unescaped;
1468 rb_encoding *fixed_enc = 0;
1469 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
1470
1471 if (reg->enc == enc) return reg;
1472
1473 rb_reg_check(re);
1474 reg = RREGEXP_PTR(re);
1475 pattern = RREGEXP_SRC_PTR(re);
1476
1477 unescaped = rb_reg_preprocess(
1478 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
1479 &fixed_enc, err);
1480
1481 if (unescaped == Qnil) {
1482 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
1483 }
1484
    /* Compile with the options of the existing regex; `reg` is overwritten
     * with the newly created regex. */
1485 r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
1486 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
1487 reg->options, enc,
1488 OnigDefaultSyntax, &einfo);
1489 if (r) {
1490 onig_error_code_to_str((UChar*)err, r, &einfo);
1491 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
1492 }
1493
1494 RB_GC_GUARD(unescaped);
1495 return reg;
1496}
1497
/* Thin wrapper that forwards to rb_reg_prepare_re0().
 * NOTE(review): listing lines 1499 and 1501 (the signature and, presumably,
 * the declaration of `err`) are missing from this copy. */
1498regex_t *
1500{
1502 return rb_reg_prepare_re0(re, str, err);
1503}
1504
1505long
1506rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
1507{
1508 long range;
1509 rb_encoding *enc;
1510 UChar *p, *string;
1511
1512 enc = rb_reg_prepare_enc(re, str, 0);
1513
1514 if (reverse) {
1515 range = -pos;
1516 }
1517 else {
1518 range = RSTRING_LEN(str) - pos;
1519 }
1520
1521 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
1522 string = (UChar*)RSTRING_PTR(str);
1523
1524 if (range > 0) {
1525 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
1526 }
1527 else {
1528 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
1529 }
1530 return p - string;
1531 }
1532
1533 return pos;
1534}
1535
/*
 * Runs onig_search() for `re` over `str` starting at byte position `pos`.
 * The search range ends at the end of `str`, or at its start when `reverse`
 * is non-zero.  On success the match object is installed as the backref and
 * the search result is returned; positions outside the string yield -1.
 * NOTE(review): listing lines 1546, 1549, 1574, 1584 and 1592 are missing
 * from this copy (among them the declaration of the outer `err`), so the
 * fragment below is incomplete.
 */
1536/* returns byte offset */
1537long
1538rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
1539{
1540 long result;
1541 VALUE match;
1542 struct re_registers regi, *regs = &regi;
1543 char *range = RSTRING_PTR(str);
1544 regex_t *reg;
1545 int tmpreg;
1547
1548 if (pos > RSTRING_LEN(str) || pos < 0) {
1550 return -1;
1551 }
1552
    /* tmpreg is set when a regex other than the one stored in `re` came
     * back; the stored regex's use count is bumped only when it is used. */
1553 reg = rb_reg_prepare_re0(re, str, err);
1554 tmpreg = reg != RREGEXP_PTR(re);
1555 if (!tmpreg) RREGEXP(re)->usecnt++;
1556
    /* Reuse the registers of the current backref unless it is busy;
     * otherwise fall back to the zeroed local register block. */
1557 match = rb_backref_get();
1558 if (!NIL_P(match)) {
1559 if (FL_TEST(match, MATCH_BUSY)) {
1560 match = Qnil;
1561 }
1562 else {
1563 regs = RMATCH_REGS(match);
1564 }
1565 }
1566 if (NIL_P(match)) {
1567 MEMZERO(regs, struct re_registers, 1);
1568 }
1569 if (!reverse) {
1570 range += RSTRING_LEN(str);
1571 }
1572 result = onig_search(reg,
1573 (UChar*)(RSTRING_PTR(str)),
1575 ((UChar*)(RSTRING_PTR(str)) + pos),
1576 ((UChar*)range),
1577 regs, ONIG_OPTION_NONE);
1578 if (!tmpreg) RREGEXP(re)->usecnt--;
    /* A temporary regex is freed while the stored one is in use; otherwise
     * it is stored into `re`. */
1579 if (tmpreg) {
1580 if (RREGEXP(re)->usecnt) {
1581 onig_free(reg);
1582 }
1583 else {
1585 RREGEXP_PTR(re) = reg;
1586 }
1587 }
1588 if (result < 0) {
1589 if (regs == &regi)
1590 onig_region_free(regs, 0);
1591 if (result == ONIG_MISMATCH) {
1593 return result;
1594 }
1595 else {
1596 onig_error_code_to_str((UChar*)err, (int)result);
1597 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
1598 }
1599 }
1600
    /* No reusable match object: allocate one and copy the local registers
     * into it.  The inner `err` shadows the outer error buffer. */
1601 if (NIL_P(match)) {
1602 int err;
1603 match = match_alloc(rb_cMatch);
1604 err = rb_reg_region_copy(RMATCH_REGS(match), regs);
1605 onig_region_free(regs, 0);
1606 if (err) rb_memerror();
1607 }
1608
1609 if (set_backref_str) {
1610 RMATCH(match)->str = rb_str_new4(str);
1611 }
1612
1613 RMATCH(match)->regexp = re;
1614 rb_backref_set(match);
1615
1616 return result;
1617}
1618
1619long
1620rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
1621{
1622 return rb_reg_search0(re, str, pos, reverse, 1);
1623}
1624
/*
 * Runs onig_match() for `re` at the beginning of `str`.  Returns true and
 * installs the match as the backref on success, false on ONIG_MISMATCH, and
 * raises for any other negative result.
 * NOTE(review): listing lines 1626, 1633, 1653, 1662 and 1670 are missing
 * from this copy (among them the signature and the declaration of the outer
 * `err`), so the fragment below is incomplete.
 */
1625bool
1627{
1628 long result;
1629 VALUE match;
1630 struct re_registers regi, *regs = &regi;
1631 regex_t *reg;
1632 int tmpreg;
1634
1635 reg = rb_reg_prepare_re0(re, str, err);
1636 tmpreg = reg != RREGEXP_PTR(re);
1637 if (!tmpreg) RREGEXP(re)->usecnt++;
1638
    /* Reuse the registers of the current backref unless it is busy;
     * otherwise fall back to the zeroed local register block. */
1639 match = rb_backref_get();
1640 if (!NIL_P(match)) {
1641 if (FL_TEST(match, MATCH_BUSY)) {
1642 match = Qnil;
1643 }
1644 else {
1645 regs = RMATCH_REGS(match);
1646 }
1647 }
1648 if (NIL_P(match)) {
1649 MEMZERO(regs, struct re_registers, 1);
1650 }
1651 result = onig_match(reg,
1652 (UChar*)(RSTRING_PTR(str)),
1654 (UChar*)(RSTRING_PTR(str)),
1655 regs, ONIG_OPTION_NONE);
1656 if (!tmpreg) RREGEXP(re)->usecnt--;
    /* A temporary regex is freed while the stored one is in use; otherwise
     * it is stored into `re`. */
1657 if (tmpreg) {
1658 if (RREGEXP(re)->usecnt) {
1659 onig_free(reg);
1660 }
1661 else {
1663 RREGEXP_PTR(re) = reg;
1664 }
1665 }
1666 if (result < 0) {
1667 if (regs == &regi)
1668 onig_region_free(regs, 0);
1669 if (result == ONIG_MISMATCH) {
1671 return false;
1672 }
1673 else {
1674 onig_error_code_to_str((UChar*)err, (int)result);
1675 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
1676 }
1677 }
1678
    /* No reusable match object: allocate one and copy the local registers
     * into it.  The inner `err` shadows the outer error buffer. */
1679 if (NIL_P(match)) {
1680 int err;
1681 match = match_alloc(rb_cMatch);
1682 err = rb_reg_region_copy(RMATCH_REGS(match), regs);
1683 onig_region_free(regs, 0);
1684 if (err) rb_memerror();
1685 }
1686
1687 RMATCH(match)->str = rb_str_new4(str);
1688
1689 RMATCH(match)->regexp = re;
1690 rb_backref_set(match);
1691
1692 return true;
1693}
1694
/*
 * Tells whether register `nth` of `match` is set: Qtrue when its start is
 * not -1, Qfalse when it is, and Qnil for a nil match or an out-of-range
 * index.
 * NOTE(review): listing line 1696 (function name and parameter list) is
 * missing from this copy.
 */
1695VALUE
1697{
1698 struct re_registers *regs;
1699 if (NIL_P(match)) return Qnil;
1700 match_check(match);
1701 regs = RMATCH_REGS(match);
1702 if (nth >= regs->num_regs) {
1703 return Qnil;
1704 }
    /* Negative indices count back from num_regs; landing on 0 or below is rejected. */
1705 if (nth < 0) {
1706 nth += regs->num_regs;
1707 if (nth <= 0) return Qnil;
1708 }
1709 if (BEG(nth) == -1) return Qfalse;
1710 return Qtrue;
1711}
1712
/*
 * Returns the substring of the matched string covered by register `nth`, or
 * Qnil when `match` is nil, the index is out of range, or the register's
 * start is -1.
 * NOTE(review): listing line 1714 (function name and parameter list) is
 * missing from this copy.
 */
1713VALUE
1715{
1716 VALUE str;
1717 long start, end, len;
1718 struct re_registers *regs;
1719
1720 if (NIL_P(match)) return Qnil;
1721 match_check(match);
1722 regs = RMATCH_REGS(match);
1723 if (nth >= regs->num_regs) {
1724 return Qnil;
1725 }
    /* Negative indices count back from num_regs; landing on 0 or below is rejected. */
1726 if (nth < 0) {
1727 nth += regs->num_regs;
1728 if (nth <= 0) return Qnil;
1729 }
1730 start = BEG(nth);
1731 if (start == -1) return Qnil;
1732 end = END(nth);
1733 len = end - start;
1734 str = rb_str_subseq(RMATCH(match)->str, start, len);
1735 return str;
1736}
1737
/* Returns register 0 of `match`, i.e. the result of rb_reg_nth_match(0, match).
 * NOTE(review): listing line 1739 (function name and parameter list) is
 * missing from this copy. */
1738VALUE
1740{
1741 return rb_reg_nth_match(0, match);
1742}
1743
1744
1745/*
1746 * call-seq:
1747 * mtch.pre_match -> str
1748 *
1749 * Returns the portion of the original string before the current match.
1750 * Equivalent to the special variable <code>$`</code>.
1751 *
1752 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1753 * m.pre_match #=> "T"
1754 */
1755
/* NOTE(review): listing line 1757 (function name and parameter list) is
 * missing from this copy. */
1756VALUE
1758{
1759 VALUE str;
1760 struct re_registers *regs;
1761
1762 if (NIL_P(match)) return Qnil;
1763 match_check(match);
1764 regs = RMATCH_REGS(match);
1765 if (BEG(0) == -1) return Qnil;
    /* Everything in the matched string before the start of register 0. */
1766 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
1767 return str;
1768}
1769
1770
1771/*
1772 * call-seq:
1773 * mtch.post_match -> str
1774 *
1775 * Returns the portion of the original string after the current match.
1776 * Equivalent to the special variable <code>$'</code>.
1777 *
1778 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
1779 * m.post_match #=> ": The Movie"
1780 */
1781
/* NOTE(review): listing line 1783 (function name and parameter list) is
 * missing from this copy. */
1782VALUE
1784{
1785 VALUE str;
1786 long pos;
1787 struct re_registers *regs;
1788
1789 if (NIL_P(match)) return Qnil;
1790 match_check(match);
1791 regs = RMATCH_REGS(match);
1792 if (BEG(0) == -1) return Qnil;
1793 str = RMATCH(match)->str;
    /* Everything in the matched string from the end of register 0 onward. */
1794 pos = END(0);
1795 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1796 return str;
1797}
1798
/*
 * Returns the string of the highest-numbered register (above 0) whose start
 * is not -1, or Qnil when there is none, `match` is nil, or register 0 is
 * unset.
 * NOTE(review): listing line 1800 (function name and parameter list) is
 * missing from this copy.
 */
1799VALUE
1801{
1802 int i;
1803 struct re_registers *regs;
1804
1805 if (NIL_P(match)) return Qnil;
1806 match_check(match);
1807 regs = RMATCH_REGS(match);
1808 if (BEG(0) == -1) return Qnil;
1809
    /* Walk down from the last register until a set one (or register 0) is hit. */
1810 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
1811 ;
1812 if (i == 0) return Qnil;
1813 return rb_reg_nth_match(i, match);
1814}
1815
/* NOTE(review): the body (listing line 1819) is missing from this copy. */
1816static VALUE
1817last_match_getter(ID _x, VALUE *_y)
1818{
1820}
1821
/* NOTE(review): the body (listing line 1825) is missing from this copy. */
1822static VALUE
1823prematch_getter(ID _x, VALUE *_y)
1824{
1826}
1827
/* NOTE(review): the body (listing line 1831) is missing from this copy. */
1828static VALUE
1829postmatch_getter(ID _x, VALUE *_y)
1830{
1832}
1833
/* NOTE(review): the body (listing line 1837) is missing from this copy. */
1834static VALUE
1835last_paren_match_getter(ID _x, VALUE *_y)
1836{
1838}
1839
1840static VALUE
1841match_array(VALUE match, int start)
1842{
1843 struct re_registers *regs;
1844 VALUE ary;
1845 VALUE target;
1846 int i;
1847
1848 match_check(match);
1849 regs = RMATCH_REGS(match);
1850 ary = rb_ary_new2(regs->num_regs);
1851 target = RMATCH(match)->str;
1852
1853 for (i=start; i<regs->num_regs; i++) {
1854 if (regs->beg[i] == -1) {
1855 rb_ary_push(ary, Qnil);
1856 }
1857 else {
1858 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1859 rb_ary_push(ary, str);
1860 }
1861 }
1862 return ary;
1863}
1864
1865
1866/*
1867 * call-seq:
1868 * mtch.to_a -> anArray
1869 *
1870 * Returns the array of matches.
1871 *
1872 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1873 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
1874 *
1875 * Because <code>to_a</code> is called when expanding
1876 * <code>*</code><em>variable</em>, there's a useful assignment
1877 * shortcut for extracting matched fields. This is slightly slower than
1878 * accessing the fields directly (as an intermediate array is
1879 * generated).
1880 *
1881 * all,f1,f2,f3 = * /(.)(.)(\d+)(\d)/.match("THX1138.")
1882 * all #=> "HX1138"
1883 * f1 #=> "H"
1884 * f2 #=> "X"
1885 * f3 #=> "113"
1886 */
1887
1888static VALUE
1889match_to_a(VALUE match)
1890{
1891 return match_array(match, 0);
1892}
1893
1894
1895/*
1896 * call-seq:
1897 * mtch.captures -> array
1898 *
1899 * Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
1900 *
1901 * f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
1902 * f1 #=> "H"
1903 * f2 #=> "X"
1904 * f3 #=> "113"
1905 * f4 #=> "8"
1906 */
1907static VALUE
1908match_captures(VALUE match)
1909{
1910 return match_array(match, 1);
1911}
1912
/* Resolves the capture name in [name, name_end) to a register number for
 * `regexp`; returns -1 when `regexp` is nil.
 * NOTE(review): listing line 1917 (the start of the return statement, i.e.
 * the callee that performs the lookup) is missing from this copy. */
1913static int
1914name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
1915{
1916 if (NIL_P(regexp)) return -1;
1918 (const unsigned char *)name, (const unsigned char *)name_end, regs);
1919}
1920
/* Evaluates to 0 when `re` is nil or when the regexp source and `name` are
 * not encoding-compatible; otherwise to the result of
 * name_to_backref_number().  `re` and `name` are expanded more than once. */
1921#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end) \
1922 (NIL_P(re) ? 0 : \
1923 !rb_enc_compatible(RREGEXP_SRC(re), (name)) ? 0 : \
1924 name_to_backref_number((regs), (re), (name_ptr), (name_end)))
1925
/*
 * Resolves a capture name given as a Symbol or String to a register number.
 * Returns -1 when `name` is neither; calls name_to_backref_error() when the
 * lookup yields a number below 1.
 * NOTE(review): listing line 1938 (the remaining arguments of the
 * NAME_TO_NUMBER invocation) is missing from this copy.
 */
1926static int
1927namev_to_backref_number(struct re_registers *regs, VALUE re, VALUE name)
1928{
1929 int num;
1930
1931 if (SYMBOL_P(name)) {
1932 name = rb_sym2str(name);
1933 }
1934 else if (!RB_TYPE_P(name, T_STRING)) {
1935 return -1;
1936 }
1937 num = NAME_TO_NUMBER(regs, re, name,
1939 if (num < 1) {
1940 name_to_backref_error(name);
1941 }
1942 return num;
1943}
1944
1945static VALUE
1946match_ary_subseq(VALUE match, long beg, long len, VALUE result)
1947{
1948 long olen = RMATCH_REGS(match)->num_regs;
1949 long j, end = olen < beg+len ? olen : beg+len;
1950 if (NIL_P(result)) result = rb_ary_new_capa(len);
1951 if (len == 0) return result;
1952
1953 for (j = beg; j < end; j++) {
1954 rb_ary_push(result, rb_reg_nth_match((int)j, match));
1955 }
1956 if (beg + len > j) {
1957 rb_ary_resize(result, RARRAY_LEN(result) + (beg + len) - j);
1958 }
1959 return result;
1960}
1961
1962static VALUE
1963match_ary_aref(VALUE match, VALUE idx, VALUE result)
1964{
1965 long beg, len;
1966 int num_regs = RMATCH_REGS(match)->num_regs;
1967
1968 /* check if idx is Range */
1969 switch (rb_range_beg_len(idx, &beg, &len, (long)num_regs, !NIL_P(result))) {
1970 case Qfalse:
1971 if (NIL_P(result)) return rb_reg_nth_match(NUM2INT(idx), match);
1972 rb_ary_push(result, rb_reg_nth_match(NUM2INT(idx), match));
1973 return result;
1974 case Qnil:
1975 return Qnil;
1976 default:
1977 return match_ary_subseq(match, beg, len, result);
1978 }
1979}
1980
1981/*
1982 * call-seq:
1983 * mtch[i] -> str or nil
1984 * mtch[start, length] -> array
1985 * mtch[range] -> array
1986 * mtch[name] -> str or nil
1987 *
1988 * Match Reference -- MatchData acts as an array, and may be accessed
1989 * using the normal array indexing techniques. <code>mtch[0]</code>
1990 * is equivalent to the special variable <code>$&</code>, and returns
1991 * the entire matched string. <code>mtch[1]</code>,
1992 * <code>mtch[2]</code>, and so on return the values of the matched
1993 * backreferences (portions of the pattern between parentheses).
1994 *
1995 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
1996 * m #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
1997 * m[0] #=> "HX1138"
1998 * m[1, 2] #=> ["H", "X"]
1999 * m[1..3] #=> ["H", "X", "113"]
2000 * m[-3, 2] #=> ["X", "113"]
2001 *
2002 * m = /(?<foo>a+)b/.match("ccaaab")
2003 * m #=> #<MatchData "aaab" foo:"aaa">
2004 * m["foo"] #=> "aaa"
2005 * m[:foo] #=> "aaa"
2006 */
2007
2008static VALUE
2009match_aref(int argc, VALUE *argv, VALUE match)
2010{
2011 VALUE idx, length;
2012
2013 match_check(match);
2014 rb_scan_args(argc, argv, "11", &idx, &length);
2015
2016 if (NIL_P(length)) {
2017 if (FIXNUM_P(idx)) {
2018 return rb_reg_nth_match(FIX2INT(idx), match);
2019 }
2020 else {
2021 int num = namev_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp, idx);
2022 if (num >= 0) {
2023 return rb_reg_nth_match(num, match);
2024 }
2025 else {
2026 return match_ary_aref(match, idx, Qnil);
2027 }
2028 }
2029 }
2030 else {
2031 long beg = NUM2LONG(idx);
2032 long len = NUM2LONG(length);
2033 long num_regs = RMATCH_REGS(match)->num_regs;
2034 if (len < 0) {
2035 return Qnil;
2036 }
2037 if (beg < 0) {
2038 beg += num_regs;
2039 if (beg < 0) return Qnil;
2040 }
2041 else if (beg > num_regs) {
2042 return Qnil;
2043 }
2044 else if (beg+len > num_regs) {
2045 len = num_regs - beg;
2046 }
2047 return match_ary_subseq(match, beg, len, Qnil);
2048 }
2049}
2050
2051/*
2052 * call-seq:
2053 *
2054 * mtch.values_at(index, ...) -> array
2055 *
2056 * Uses each <i>index</i> to access the matching values, returning an array of
2057 * the corresponding matches.
2058 *
2059 * m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
2060 * m.to_a #=> ["HX1138", "H", "X", "113", "8"]
2061 * m.values_at(0, 2, -2) #=> ["HX1138", "X", "113"]
2062 *
2063 * m = /(?<a>\d+) *(?<op>[+\-*\/]) *(?<b>\d+)/.match("1 + 2")
2064 * m.to_a #=> ["1 + 2", "1", "+", "2"]
2065 * m.values_at(:a, :b, :op) #=> ["1", "2", "+"]
2066 */
2067
2068static VALUE
2069match_values_at(int argc, VALUE *argv, VALUE match)
2070{
2071 VALUE result;
2072 int i;
2073
2074 match_check(match);
2075 result = rb_ary_new2(argc);
2076
2077 for (i=0; i<argc; i++) {
2078 if (FIXNUM_P(argv[i])) {
2079 rb_ary_push(result, rb_reg_nth_match(FIX2INT(argv[i]), match));
2080 }
2081 else {
2082 int num = namev_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp, argv[i]);
2083 if (num >= 0) {
2084 rb_ary_push(result, rb_reg_nth_match(num, match));
2085 }
2086 else {
2087 match_ary_aref(match, argv[i], result);
2088 }
2089 }
2090 }
2091 return result;
2092}
2093
2094
2095/*
2096 * call-seq:
2097 * mtch.to_s -> str
2098 *
2099 * Returns the entire matched string.
2100 *
2101 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2102 * m.to_s #=> "HX1138"
2103 */
2104
2105static VALUE
2106match_to_s(VALUE match)
2107{
2108 VALUE str = rb_reg_last_match(match);
2109
2110 match_check(match);
2111 if (NIL_P(str)) str = rb_str_new(0,0);
2112 return str;
2113}
2114
2115static int
2116match_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
2117 int back_num, int *back_refs, OnigRegex regex, void *arg) {
2118 struct MEMO *memo = MEMO_CAST(arg);
2119 VALUE hash = memo->v1;
2120 VALUE match = memo->v2;
2121
2122 VALUE key = rb_enc_str_new((const char *)name, name_end-name, regex->enc);
2123 VALUE value;
2124
2125 int i;
2126 int found = 0;
2127
2128 for (i = 0; i < back_num; i++) {
2129 value = rb_reg_nth_match(back_refs[i], match);
2130 if (RTEST(value)) {
2131 rb_hash_aset(hash, key, value);
2132 found = 1;
2133 }
2134 }
2135
2136 if (found == 0) {
2137 rb_hash_aset(hash, key, Qnil);
2138 }
2139
2140 return 0;
2141}
2142
2143/*
2144 * call-seq:
2145 * mtch.named_captures -> hash
2146 *
2147 * Returns a Hash of the named captures.
2148 *
2149 * Each key of the hash is the name of a named capture group.
2150 * Each value is the string of the last successful capture of the
2151 * corresponding group, or nil if no capture of that group succeeded.
2152 *
2153 * m = /(?<a>.)(?<b>.)/.match("01")
2154 * m.named_captures #=> {"a" => "0", "b" => "1"}
2155 *
2156 * m = /(?<a>.)(?<b>.)?/.match("0")
2157 * m.named_captures #=> {"a" => "0", "b" => nil}
2158 *
2159 * m = /(?<a>.)(?<a>.)/.match("01")
2160 * m.named_captures #=> {"a" => "1"}
2161 *
2162 * m = /(?<a>x)|(?<a>y)/.match("x")
2163 * m.named_captures #=> {"a" => "x"}
2164 */
2165
2166static VALUE
2167match_named_captures(VALUE match)
2168{
2169 VALUE hash;
2170 struct MEMO *memo;
2171
2172 match_check(match);
2173 if (NIL_P(RMATCH(match)->regexp))
2174 return rb_hash_new();
2175
2176 hash = rb_hash_new();
2177 memo = MEMO_NEW(hash, match, 0);
2178
2179 onig_foreach_name(RREGEXP(RMATCH(match)->regexp)->ptr, match_named_captures_iter, (void*)memo);
2180
2181 return hash;
2182}
2183
2184/*
2185 * call-seq:
2186 * mtch.string -> str
2187 *
2188 * Returns a frozen copy of the string passed in to <code>match</code>.
2189 *
2190 * m = /(.)(.)(\d+)(\d)/.match("THX1138.")
2191 * m.string #=> "THX1138."
2192 */
2193
2194static VALUE
2195match_string(VALUE match)
2196{
2197 match_check(match);
2198 return RMATCH(match)->str; /* str is frozen */
2199}
2200
/* NOTE(review): the opening line of this struct (listing line 2201) is
 * missing from this copy; later code refers to it as
 * `struct backref_name_tag`.  `name` points at the start of a group name and
 * `len` is its length in bytes. */
2202 const UChar *name;
2203 long len;
2204};
2205
2206static int
2207match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
2208 int back_num, int *back_refs, OnigRegex regex, void *arg0)
2209{
2210 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
2211 int i;
2212
2213 for (i = 0; i < back_num; i++) {
2214 arg[back_refs[i]].name = name;
2215 arg[back_refs[i]].len = name_end - name;
2216 }
2217 return 0;
2218}
2219
2220/*
2221 * call-seq:
2222 * mtch.inspect -> str
2223 *
2224 * Returns a printable version of <i>mtch</i>.
2225 *
2226 * puts /.$/.match("foo").inspect
2227 * #=> #<MatchData "o">
2228 *
2229 * puts /(.)(.)(.)/.match("foo").inspect
2230 * #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
2231 *
2232 * puts /(.)(.)?(.)/.match("fo").inspect
2233 * #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
2234 *
2235 * puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
2236 * #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
2237 *
2238 */
2239
/* NOTE(review): listing lines 2262 (the start of the call that fills `names`
 * through match_inspect_name_iter) and 2283 (the else-branch that appends a
 * non-nil value) are missing from this copy. */
2240static VALUE
2241match_inspect(VALUE match)
2242{
2243 VALUE cname = rb_class_path(rb_obj_class(match));
2244 VALUE str;
2245 int i;
2246 struct re_registers *regs = RMATCH_REGS(match);
2247 int num_regs = regs->num_regs;
2248 struct backref_name_tag *names;
2249 VALUE regexp = RMATCH(match)->regexp;
2250
    /* regexp == 0: print only class and address.  regexp nil: print class
     * and the whole-match string. */
2251 if (regexp == 0) {
2252 return rb_sprintf("#<%"PRIsVALUE":%p>", cname, (void*)match);
2253 }
2254 else if (NIL_P(regexp)) {
2255 return rb_sprintf("#<%"PRIsVALUE": %"PRIsVALUE">",
2256 cname, rb_reg_nth_match(0, match));
2257 }
2258
    /* One zeroed name slot per register, allocated with ALLOCA_N. */
2259 names = ALLOCA_N(struct backref_name_tag, num_regs);
2260 MEMZERO(names, struct backref_name_tag, num_regs);
2261
2263 match_inspect_name_iter, names);
2264
2265 str = rb_str_buf_new2("#<");
2266 rb_str_append(str, cname);
2267
2268 for (i = 0; i < num_regs; i++) {
2269 VALUE v;
2270 rb_str_buf_cat2(str, " ");
    /* Registers above 0 are prefixed with their name, or their number when unnamed. */
2271 if (0 < i) {
2272 if (names[i].name)
2273 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
2274 else {
2275 rb_str_catf(str, "%d", i);
2276 }
2277 rb_str_buf_cat2(str, ":");
2278 }
2279 v = rb_reg_nth_match(i, match);
2280 if (v == Qnil)
2281 rb_str_buf_cat2(str, "nil");
2282 else
2284 }
2285 rb_str_buf_cat2(str, ">");
2286
2287 return str;
2288}
2289
2291
2292static int
2293read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
2294{
2295 const char *p = *pp;
2296 int code;
2297 int meta_prefix = 0, ctrl_prefix = 0;
2298 size_t len;
2299
2300 if (p == end || *p++ != '\\') {
2301 errcpy(err, "too short escaped multibyte character");
2302 return -1;
2303 }
2304
2305again:
2306 if (p == end) {
2307 errcpy(err, "too short escape sequence");
2308 return -1;
2309 }
2310 switch (*p++) {
2311 case '\\': code = '\\'; break;
2312 case 'n': code = '\n'; break;
2313 case 't': code = '\t'; break;
2314 case 'r': code = '\r'; break;
2315 case 'f': code = '\f'; break;
2316 case 'v': code = '\013'; break;
2317 case 'a': code = '\007'; break;
2318 case 'e': code = '\033'; break;
2319
2320 /* \OOO */
2321 case '0': case '1': case '2': case '3':
2322 case '4': case '5': case '6': case '7':
2323 p--;
2324 code = scan_oct(p, end < p+3 ? end-p : 3, &len);
2325 p += len;
2326 break;
2327
2328 case 'x': /* \xHH */
2329 code = scan_hex(p, end < p+2 ? end-p : 2, &len);
2330 if (len < 1) {
2331 errcpy(err, "invalid hex escape");
2332 return -1;
2333 }
2334 p += len;
2335 break;
2336
2337 case 'M': /* \M-X, \M-\C-X, \M-\cX */
2338 if (meta_prefix) {
2339 errcpy(err, "duplicate meta escape");
2340 return -1;
2341 }
2342 meta_prefix = 1;
2343 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
2344 if (*p == '\\') {
2345 p++;
2346 goto again;
2347 }
2348 else {
2349 code = *p++;
2350 break;
2351 }
2352 }
2353 errcpy(err, "too short meta escape");
2354 return -1;
2355
2356 case 'C': /* \C-X, \C-\M-X */
2357 if (p == end || *p++ != '-') {
2358 errcpy(err, "too short control escape");
2359 return -1;
2360 }
2361 case 'c': /* \cX, \c\M-X */
2362 if (ctrl_prefix) {
2363 errcpy(err, "duplicate control escape");
2364 return -1;
2365 }
2366 ctrl_prefix = 1;
2367 if (p < end && (*p & 0x80) == 0) {
2368 if (*p == '\\') {
2369 p++;
2370 goto again;
2371 }
2372 else {
2373 code = *p++;
2374 break;
2375 }
2376 }
2377 errcpy(err, "too short control escape");
2378 return -1;
2379
2380 default:
2381 errcpy(err, "unexpected escape sequence");
2382 return -1;
2383 }
2384 if (code < 0 || 0xff < code) {
2385 errcpy(err, "invalid escape code");
2386 return -1;
2387 }
2388
2389 if (ctrl_prefix)
2390 code &= 0x1f;
2391 if (meta_prefix)
2392 code |= 0x80;
2393
2394 *pp = p;
2395 return code;
2396}
2397
/*
 * Reads one character written as a run of byte escapes, using
 * read_escaped_byte() for each byte, and appends it to `buf`.  A multibyte
 * or high-bit character is appended raw and fixes *encp to `enc`; a single
 * 7-bit byte is appended as a "\xHH" escape.  Returns 0 on success and -1
 * (with `err` filled in) on failure; *pp is advanced only on success.
 * NOTE(review): listing line 2400 (the rest of the parameter list) is
 * missing from this copy; the body uses `buf`, `encp` and `err`.
 */
2398static int
2399unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
2401{
2402 const char *p = *pp;
2403 int chmaxlen = rb_enc_mbmaxlen(enc);
2404 unsigned char *area = ALLOCA_N(unsigned char, chmaxlen);
2405 char *chbuf = (char *)area;
2406 int chlen = 0;
2407 int byte;
2408 int l;
2409
2410 memset(chbuf, 0, chmaxlen);
2411
2412 byte = read_escaped_byte(&p, end, err);
2413 if (byte == -1) {
2414 return -1;
2415 }
2416
2417 area[chlen++] = byte;
    /* Keep reading escaped bytes while the encoding reports that more bytes
     * are needed and the per-character maximum has not been reached. */
2418 while (chlen < chmaxlen &&
2419 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
2420 byte = read_escaped_byte(&p, end, err);
2421 if (byte == -1) {
2422 return -1;
2423 }
2424 area[chlen++] = byte;
2425 }
2426
2427 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
2428 if (MBCLEN_INVALID_P(l)) {
2429 errcpy(err, "invalid multibyte escape");
2430 return -1;
2431 }
2432 if (1 < chlen || (area[0] & 0x80)) {
2433 rb_str_buf_cat(buf, chbuf, chlen);
2434
2435 if (*encp == 0)
2436 *encp = enc;
2437 else if (*encp != enc) {
2438 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
2439 return -1;
2440 }
2441 }
2442 else {
2443 char escbuf[5];
2444 snprintf(escbuf, sizeof(escbuf), "\\x%02X", area[0]&0xff);
2445 rb_str_buf_cat(buf, escbuf, 4);
2446 }
2447 *pp = p;
2448 return 0;
2449}
2450
2451static int
2452check_unicode_range(unsigned long code, onig_errmsg_buffer err)
2453{
2454 if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
2455 0x10ffff < code) {
2456 errcpy(err, "invalid Unicode range");
2457 return -1;
2458 }
2459 return 0;
2460}
2461
/*
 * Appends code point `uv` to `buf`: values below 0x80 as a "\xHH" escape,
 * anything else as UTF-8 bytes, which also fixes *encp to UTF-8.  Returns 0
 * on success and -1 (with `err` filled in) when `uv` is out of range or
 * *encp is already fixed to a different encoding.
 * NOTE(review): listing line 2464 (the rest of the parameter list) is
 * missing from this copy; the body uses `buf`, `encp` and `err`.
 */
2462static int
2463append_utf8(unsigned long uv,
2465{
2466 if (check_unicode_range(uv, err) != 0)
2467 return -1;
2468 if (uv < 0x80) {
2469 char escbuf[5];
2470 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
2471 rb_str_buf_cat(buf, escbuf, 4);
2472 }
2473 else {
2474 int len;
2475 char utf8buf[6];
2476 len = rb_uv_to_utf8(utf8buf, uv);
2477 rb_str_buf_cat(buf, utf8buf, len);
2478
2479 if (*encp == 0)
2480 *encp = rb_utf8_encoding();
2481 else if (*encp != rb_utf8_encoding()) {
2482 errcpy(err, "UTF-8 character in non UTF-8 regexp");
2483 return -1;
2484 }
2485 }
2486 return 0;
2487}
2488
/*
 * Parses a whitespace-separated list of hexadecimal code points starting at
 * *pp and appends each one through append_utf8().  At least one code point
 * is required and each may have at most six hex digits.  Returns 0 and
 * advances *pp on success; returns -1 with `err` filled in on failure.
 * NOTE(review): listing line 2491 (the rest of the parameter list) is
 * missing from this copy; the body uses `buf`, `encp` and `err`.
 */
2489static int
2490unescape_unicode_list(const char **pp, const char *end,
2492{
2493 const char *p = *pp;
2494 int has_unicode = 0;
2495 unsigned long code;
2496 size_t len;
2497
2498 while (p < end && ISSPACE(*p)) p++;
2499
2500 while (1) {
2501 code = ruby_scan_hex(p, end-p, &len);
    /* No hex digit at this position: the list is over. */
2502 if (len == 0)
2503 break;
2504 if (6 < len) { /* max 10FFFF */
2505 errcpy(err, "invalid Unicode range");
2506 return -1;
2507 }
2508 p += len;
2509 if (append_utf8(code, buf, encp, err) != 0)
2510 return -1;
2511 has_unicode = 1;
2512
2513 while (p < end && ISSPACE(*p)) p++;
2514 }
2515
2516 if (has_unicode == 0) {
2517 errcpy(err, "invalid Unicode list");
2518 return -1;
2519 }
2520
2521 *pp = p;
2522
2523 return 0;
2524}
2525
/*
 * Parses exactly four hexadecimal digits at *pp as one code point and
 * appends it through append_utf8().  Returns 0 and advances *pp by four on
 * success; returns -1 with `err` filled in on failure.
 * NOTE(review): listing line 2528 (the rest of the parameter list) is
 * missing from this copy; the body uses `buf`, `encp` and `err`.
 */
2526static int
2527unescape_unicode_bmp(const char **pp, const char *end,
2529{
2530 const char *p = *pp;
2531 size_t len;
2532 unsigned long code;
2533
2534 if (end < p+4) {
2535 errcpy(err, "invalid Unicode escape");
2536 return -1;
2537 }
2538 code = ruby_scan_hex(p, 4, &len);
2539 if (len != 4) {
2540 errcpy(err, "invalid Unicode escape");
2541 return -1;
2542 }
2543 if (append_utf8(code, buf, encp, err) != 0)
2544 return -1;
2545 *pp = p + 4;
2546 return 0;
2547}
2548
/*
 * Copies the regexp source in [p, end) into `buf`, rewriting escapes that
 * denote non-ASCII bytes or Unicode code points into the characters
 * themselves, and tracks in *encp the encoding those characters force.
 * Sets *has_property when a \p / \P escape is seen while *encp is still
 * unset.  Returns 0 on success and -1 with `err` filled in on failure.
 * NOTE(review): listing line 2552 (the last parameter, presumably `err`) is
 * missing from this copy.
 */
2549static int
2550unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2551 VALUE buf, rb_encoding **encp, int *has_property,
2553{
2554 unsigned char c;
2555 char smallbuf[2];
2556
2557 while (p < end) {
2558 int chlen = rb_enc_precise_mbclen(p, end, enc);
2559 if (!MBCLEN_CHARFOUND_P(chlen)) {
2560 invalid_multibyte:
2561 errcpy(err, "invalid multibyte character");
2562 return -1;
2563 }
2564 chlen = MBCLEN_CHARFOUND_LEN(chlen);
    /* Multibyte or high-bit character: copy it raw and fix the encoding. */
2565 if (1 < chlen || (*p & 0x80)) {
2566 multibyte:
2567 rb_str_buf_cat(buf, p, chlen);
2568 p += chlen;
2569 if (*encp == 0)
2570 *encp = enc;
2571 else if (*encp != enc) {
2572 errcpy(err, "non ASCII character in UTF-8 regexp");
2573 return -1;
2574 }
2575 continue;
2576 }
2577
2578 switch (c = *p++) {
2579 case '\\':
2580 if (p == end) {
2581 errcpy(err, "too short escape sequence");
2582 return -1;
2583 }
2584 chlen = rb_enc_precise_mbclen(p, end, enc);
2585 if (!MBCLEN_CHARFOUND_P(chlen)) {
2586 goto invalid_multibyte;
2587 }
2588 if ((chlen = MBCLEN_CHARFOUND_LEN(chlen)) > 1) {
2589 /* include the previous backslash */
2590 --p;
2591 ++chlen;
2592 goto multibyte;
2593 }
2594 switch (c = *p++) {
2595 case '1': case '2': case '3':
2596 case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
2597 {
2598 size_t len = end-(p-1), octlen;
2599 if (ruby_scan_oct(p-1, len < 3 ? len : 3, &octlen) <= 0177) {
2600 /* backref or 7bit octal.
2601 no need to unescape anyway.
2602 re-escaping may break backref */
2603 goto escape_asis;
2604 }
2605 }
2606 /* xxx: How about more than 199 subexpressions? */
    /* octal values above 0177 fall through to the byte-escape handling below */
2607
2608 case '0': /* \0, \0O, \0OO */
2609
2610 case 'x': /* \xHH */
2611 case 'c': /* \cX, \c\M-X */
2612 case 'C': /* \C-X, \C-\M-X */
2613 case 'M': /* \M-X, \M-\C-X, \M-\cX */
    /* Rewind to the backslash so the helpers see the whole escape. */
2614 p = p-2;
2615 if (enc == rb_usascii_encoding()) {
2616 const char *pbeg = p;
2617 int byte = read_escaped_byte(&p, end, err);
2618 if (byte == -1) return -1;
2619 c = byte;
2620 rb_str_buf_cat(buf, pbeg, p-pbeg);
2621 }
2622 else {
2623 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
2624 return -1;
2625 }
2626 break;
2627
2628 case 'u':
2629 if (p == end) {
2630 errcpy(err, "too short escape sequence");
2631 return -1;
2632 }
2633 if (*p == '{') {
2634 /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
2635 p++;
2636 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
2637 return -1;
2638 if (p == end || *p++ != '}') {
2639 errcpy(err, "invalid Unicode list");
2640 return -1;
2641 }
2642 break;
2643 }
2644 else {
2645 /* \uHHHH */
2646 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
2647 return -1;
2648 break;
2649 }
2650
2651 case 'p': /* \p{Hiragana} */
2652 case 'P':
2653 if (!*encp) {
2654 *has_property = 1;
2655 }
2656 goto escape_asis;
2657
2658 default: /* \n, \\, \d, \9, etc. */
2659escape_asis:
2660 smallbuf[0] = '\\';
2661 smallbuf[1] = c;
2662 rb_str_buf_cat(buf, smallbuf, 2);
2663 break;
2664 }
2665 break;
2666
2667 default:
2668 rb_str_buf_cat(buf, (char *)&c, 1);
2669 break;
2670 }
2671 }
2672
2673 return 0;
2674}
2675
2676static VALUE
2677rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
2678 rb_encoding **fixed_enc, onig_errmsg_buffer err)
2679{
2680 VALUE buf;
2681 int has_property = 0;
2682
2683 buf = rb_str_buf_new(0);
2684
2685 if (rb_enc_asciicompat(enc))
2686 *fixed_enc = 0;
2687 else {
2688 *fixed_enc = enc;
2689 rb_enc_associate(buf, enc);
2690 }
2691
2692 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
2693 return Qnil;
2694
2695 if (has_property && !*fixed_enc) {
2696 *fixed_enc = enc;
2697 }
2698
2699 if (*fixed_enc) {
2700 rb_enc_associate(buf, *fixed_enc);
2701 }
2702
2703 return buf;
2704}
2705
/*
 * Runs rb_reg_preprocess() over the bytes of str using str's own encoding.
 * Returns Qnil when preprocessing succeeds; otherwise returns the value of
 * rb_reg_error_desc(str, 0, err).
 * NOTE(review): the line naming the function and its parameter, and the
 * declaration of err, are not present in this listing (numbering skips
 * 2707, 2710, 2715 and 2721) -- confirm against the original file.
 */
2706VALUE
2708{
2709 rb_encoding *fixed_enc = 0;
2711 VALUE buf;
2712 char *p, *end;
2713 rb_encoding *enc;
2714
2716 p = RSTRING_PTR(str);
2717 end = p + RSTRING_LEN(str);
2718 enc = rb_enc_get(str);
2719
/* only success/failure of the preprocessing is used; buf itself is discarded */
2720 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
2722
2723 if (buf == Qnil) {
2724 return rb_reg_error_desc(str, 0, err);
2725 }
2726 return Qnil;
2727}
2728
/*
 * Checks every string in ary with rb_reg_preprocess() and concatenates the
 * original (not the unescaped) strings into one result string.
 * The fixed encodings reported for the pieces must agree; the agreed
 * encoding, if any, is associated with the result.
 * Raises ArgumentError for an empty ary or a preprocessing failure, and
 * RegexpError for an encoding mismatch or a non 7bit piece under
 * ARG_ENCODING_NONE.
 * NOTE(review): err is used below but its declaration is not present in
 * this listing (numbering skips 2734 and 2758) -- confirm against the
 * original file.
 */
2729static VALUE
2730rb_reg_preprocess_dregexp(VALUE ary, int options)
2731{
2732 rb_encoding *fixed_enc = 0;
2733 rb_encoding *regexp_enc = 0;
2735 int i;
2736 VALUE result = 0;
2737 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2738
2739 if (RARRAY_LEN(ary) == 0) {
2740 rb_raise(rb_eArgError, "no arguments given");
2741 }
2742
2743 for (i = 0; i < RARRAY_LEN(ary); i++) {
2744 VALUE str = RARRAY_AREF(ary, i);
2745 VALUE buf;
2746 char *p, *end;
2747 rb_encoding *src_enc;
2748
2749 src_enc = rb_enc_get(str);
/* with the "none" option a 7bit piece is treated as ASCII-8BIT */
2750 if (options & ARG_ENCODING_NONE &&
2751 src_enc != ascii8bit) {
2752 if (str_coderange(str) != ENC_CODERANGE_7BIT)
2753 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2754 else
2755 src_enc = ascii8bit;
2756 }
2757
2759 p = RSTRING_PTR(str);
2760 end = p + RSTRING_LEN(str);
2761
2762 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
2763
2764 if (buf == Qnil)
2765 rb_raise(rb_eArgError, "%s", err);
2766
/* every piece that fixes an encoding must fix the same one */
2767 if (fixed_enc != 0) {
2768 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
2769 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
2770 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
2771 }
2772 regexp_enc = fixed_enc;
2773 }
2774
/* the result is built from the original piece, not from buf */
2775 if (!result)
2776 result = rb_str_new3(str);
2777 else
2778 rb_str_buf_append(result, str);
2779 }
2780 if (regexp_enc) {
2781 rb_enc_associate(result, regexp_enc);
2782 }
2783
2784 return result;
2785}
2786
/*
 * Compiles the pattern bytes s[0, len) into the Regexp object obj.
 * The pattern is preprocessed with rb_reg_preprocess(), the effective
 * encoding and the KCODE_FIXED / REG_ENCODING_NONE flags are settled, and
 * re->ptr is built by make_regexp().
 * Returns 0 on success and -1 on failure (a message is copied into err on
 * the paths that call errcpy()).  Raises SecurityError for a literal
 * regexp and TypeError when re->ptr is already set.
 * NOTE(review): a_enc is used below but its declaration is not present in
 * this listing (numbering skips 2795 and 2797) -- confirm against the
 * original file.
 */
2787static int
2788rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
2789 int options, onig_errmsg_buffer err,
2790 const char *sourcefile, int sourceline)
2791{
2792 struct RRegexp *re = RREGEXP(obj);
2793 VALUE unescaped;
2794 rb_encoding *fixed_enc = 0;
2796
2798 if (FL_TEST(obj, REG_LITERAL))
2799 rb_raise(rb_eSecurityError, "can't modify literal regexp");
2800 if (re->ptr)
2801 rb_raise(rb_eTypeError, "already initialized regexp");
2802 re->ptr = 0;
2803
2804 if (rb_enc_dummy_p(enc)) {
2805 errcpy(err, "can't make regexp with dummy encoding");
2806 return -1;
2807 }
2808
2809 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
2810 if (unescaped == Qnil)
2811 return -1;
2812
/* fixed_enc is the encoding the pattern contents force, if any */
2813 if (fixed_enc) {
2814 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
2815 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
2816 errcpy(err, "incompatible character encoding");
2817 return -1;
2818 }
2819 if (fixed_enc != a_enc) {
2820 options |= ARG_ENCODING_FIXED;
2821 enc = fixed_enc;
2822 }
2823 }
/* nothing forced by the pattern and no fixed-encoding request: use US-ASCII */
2824 else if (!(options & ARG_ENCODING_FIXED)) {
2825 enc = rb_usascii_encoding();
2826 }
2827
2828 rb_enc_associate((VALUE)re, enc);
2829 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
2830 re->basic.flags |= KCODE_FIXED;
2831 }
2832 if (options & ARG_ENCODING_NONE) {
2833 re->basic.flags |= REG_ENCODING_NONE;
2834 }
2835
2836 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
2837 options & ARG_REG_OPTION_MASK, err,
2838 sourcefile, sourceline);
2839 if (!re->ptr) return -1;
/* presumably keeps unescaped alive while its raw pointer was in use above */
2840 RB_GC_GUARD(unescaped);
2841 return 0;
2842}
2843
2844static void
2845reg_set_source(VALUE reg, VALUE str, rb_encoding *enc)
2846{
2847 rb_encoding *regenc = rb_enc_get(reg);
2848 if (regenc != enc) {
2849 str = rb_enc_associate(rb_str_dup(str), enc = regenc);
2850 }
2851 RB_OBJ_WRITE(reg, &RREGEXP(reg)->src, rb_fstring(str));
2852}
2853
2854static int
2855rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
2856 const char *sourcefile, int sourceline)
2857{
2858 int ret;
2859 rb_encoding *str_enc = rb_enc_get(str), *enc = str_enc;
2860 if (options & ARG_ENCODING_NONE) {
2861 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
2862 if (enc != ascii8bit) {
2863 if (str_coderange(str) != ENC_CODERANGE_7BIT) {
2864 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
2865 return -1;
2866 }
2867 enc = ascii8bit;
2868 }
2869 }
2870 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
2871 options, err, sourcefile, sourceline);
2872 if (ret == 0) reg_set_source(obj, str, str_enc);
2873 return ret;
2874}
2875
/*
 * Allocator: returns a Regexp object whose ptr, src and usecnt fields are
 * cleared.
 * NOTE(review): the statement that creates re (presumably using klass) is
 * not present in this listing (numbering skips 2879) -- confirm against
 * the original file.
 */
2876static VALUE
2877rb_reg_s_alloc(VALUE klass)
2878{
2880
2881 re->ptr = 0;
2882 RB_OBJ_WRITE(re, &re->src, 0);
2883 re->usecnt = 0;
2884
2885 return (VALUE)re;
2886}
2887
/*
 * Allocates an uninitialized instance of rb_cRegexp via rb_reg_s_alloc().
 * NOTE(review): the line naming this function is not present in this
 * listing (numbering skips 2889) -- confirm against the original file.
 */
2888VALUE
2890{
2891 return rb_reg_s_alloc(rb_cRegexp);
2892}
2893
2894VALUE
2895rb_reg_new_str(VALUE s, int options)
2896{
2897 return rb_reg_init_str(rb_reg_alloc(), s, options);
2898}
2899
/*
 * Initializes re from the string s; when rb_reg_initialize_str() fails the
 * error is reported through rb_reg_raise_str().  Returns re.
 * NOTE(review): the declaration of err is not present in this listing
 * (numbering skips 2903) -- confirm against the original file.
 */
2900VALUE
2901rb_reg_init_str(VALUE re, VALUE s, int options)
2902{
2904
2905 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
2906 rb_reg_raise_str(s, options, err);
2907 }
2908
2909 return re;
2910}
2911
/*
 * Like rb_reg_init_str(), but compiles the bytes of s with the explicitly
 * given encoding enc and then records s as the source via reg_set_source().
 * NOTE(review): the declaration of err is not present in this listing
 * (numbering skips 2915) -- confirm against the original file.
 */
2912static VALUE
2913rb_reg_init_str_enc(VALUE re, VALUE s, rb_encoding *enc, int options)
2914{
2916
2917 if (rb_reg_initialize(re, RSTRING_PTR(s), RSTRING_LEN(s),
2918 enc, options, err, NULL, 0) != 0) {
2919 rb_reg_raise_str(s, options, err);
2920 }
2921 reg_set_source(re, s, enc);
2922
2923 return re;
2924}
2925
2928{
/*
 * Joins the pieces in ary with rb_reg_preprocess_dregexp() and builds a
 * new Regexp from the joined string.
 * NOTE(review): the lines with the return type, name and parameters (ary,
 * opt) are not present in this listing (numbering skips 2926-2927).
 */
2929 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
2930}
2931
/*
 * Creates a Regexp from the bytes s[0, len) in encoding enc.  A failure of
 * rb_reg_initialize() is reported through rb_enc_reg_raise().  The source
 * is stored as a new string built from the same bytes and encoding.
 * NOTE(review): the declaration of err is not present in this listing
 * (numbering skips 2936) -- confirm against the original file.
 */
2932VALUE
2933rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
2934{
2935 VALUE re = rb_reg_alloc();
2937
2938 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
2939 rb_enc_reg_raise(s, len, enc, options, err);
2940 }
2941 RB_OBJ_WRITE(re, &RREGEXP(re)->src, rb_fstring(rb_enc_str_new(s, len, enc)));
2942
2943 return re;
2944}
2945
2946VALUE
2947rb_reg_new(const char *s, long len, int options)
2948{
2949 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
2950}
2951
/*
 * Builds a Regexp from str and marks it with REG_LITERAL.  A null str is
 * replaced by an empty string.  On failure the error description is stored
 * with rb_set_errinfo() and Qnil is returned instead of raising.
 * NOTE(review): the declaration of err is not present in this listing
 * (numbering skips 2956) -- confirm against the original file.
 */
2952VALUE
2953rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
2954{
2955 VALUE re = rb_reg_alloc();
2957
2958 if (!str) str = rb_str_new(0,0);
2959 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
2960 rb_set_errinfo(rb_reg_error_desc(str, options, err));
2961 return Qnil;
2962 }
2963 FL_SET(re, REG_LITERAL);
2964 return re;
2965}
2966
2967static VALUE reg_cache;
2968
/*
 * Returns a Regexp for the string str, reusing the one-entry cache
 * reg_cache when its source has the same length, encoding index and bytes
 * as str; otherwise compiles a new Regexp with no options and caches it.
 * NOTE(review): the line naming the function and its parameter is not
 * present in this listing (numbering skips 2970) -- confirm against the
 * original file.
 */
2969VALUE
2971{
2972 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
2973 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
2974 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
2975 return reg_cache;
2976
2977 return reg_cache = rb_reg_new_str(str, 0);
2978}
2979
2980static st_index_t reg_hash(VALUE re);
2981/*
2982 * call-seq:
2983 * rxp.hash -> integer
2984 *
2985 * Produce a hash based on the text and options of this regular expression.
2986 *
2987 * See also Object#hash.
2988 */
2989
2990static VALUE
2991rb_reg_hash(VALUE re)
2992{
2993 st_index_t hashval = reg_hash(re);
2994 return ST2FIX(hashval);
2995}
2996
2997static st_index_t
2998reg_hash(VALUE re)
2999{
3000 st_index_t hashval;
3001
3002 rb_reg_check(re);
3003 hashval = RREGEXP_PTR(re)->options;
3004 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
3005 return rb_hash_end(hashval);
3006}
3007
3008
3009/*
3010 * call-seq:
3011 * rxp == other_rxp -> true or false
3012 * rxp.eql?(other_rxp) -> true or false
3013 *
3014 * Equality---Two regexps are equal if their patterns are identical, they have
3015 * the same character set code, and their <code>casefold?</code> values are the
3016 * same.
3017 *
3018 * /abc/ == /abc/x #=> false
3019 * /abc/ == /abc/i #=> false
3020 * /abc/ == /abc/u #=> false
3021 * /abc/u == /abc/n #=> false
3022 */
3023
3024static VALUE
3025rb_reg_equal(VALUE re1, VALUE re2)
3026{
3027 if (re1 == re2) return Qtrue;
3028 if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse;
3029 rb_reg_check(re1); rb_reg_check(re2);
3030 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
3031 if (RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options) return Qfalse;
3032 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
3033 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
3034 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
3035 return Qtrue;
3036 }
3037 return Qfalse;
3038}
3039
3040/*
3041 * call-seq:
3042 * mtch.hash -> integer
3043 *
3044 * Produce a hash based on the target string, regexp and matched
3045 * positions of this matchdata.
3046 *
3047 * See also Object#hash.
3048 */
3049
3050static VALUE
3051match_hash(VALUE match)
3052{
3053 const struct re_registers *regs;
3054 st_index_t hashval;
3055
3056 match_check(match);
3057 hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
3058 hashval = rb_hash_uint(hashval, reg_hash(match_regexp(match)));
3059 regs = RMATCH_REGS(match);
3060 hashval = rb_hash_uint(hashval, regs->num_regs);
3061 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
3062 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
3063 hashval = rb_hash_end(hashval);
3064 return ST2FIX(hashval);
3065}
3066
3067/*
3068 * call-seq:
3069 * mtch == mtch2 -> true or false
3070 * mtch.eql?(mtch2) -> true or false
3071 *
3072 * Equality---Two matchdata are equal if their target strings,
3073 * patterns, and matched positions are identical.
3074 */
3075
3076static VALUE
3077match_equal(VALUE match1, VALUE match2)
3078{
3079 const struct re_registers *regs1, *regs2;
3080
3081 if (match1 == match2) return Qtrue;
3082 if (!RB_TYPE_P(match2, T_MATCH)) return Qfalse;
3083 if (!RMATCH(match1)->regexp || !RMATCH(match2)->regexp) return Qfalse;
3084 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
3085 if (!rb_reg_equal(match_regexp(match1), match_regexp(match2))) return Qfalse;
3086 regs1 = RMATCH_REGS(match1);
3087 regs2 = RMATCH_REGS(match2);
3088 if (regs1->num_regs != regs2->num_regs) return Qfalse;
3089 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
3090 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
3091 return Qtrue;
3092}
3093
3094static VALUE
3095reg_operand(VALUE s, int check)
3096{
3097 if (SYMBOL_P(s)) {
3098 return rb_sym2str(s);
3099 }
3100 else if (RB_TYPE_P(s, T_STRING)) {
3101 return s;
3102 }
3103 else {
3104 return check ? rb_str_to_str(s) : rb_check_string_type(s);
3105 }
3106}
3107
/*
 * Searches re in *strp starting at character position pos and returns the
 * value of rb_reg_search(), or a negative value when *strp is nil or when
 * a negative pos still lies before the start after adding the length.
 * *strp is replaced by the operand converted with reg_operand().
 * NOTE(review): one statement of the nil branch is not present in this
 * listing (numbering skips 3114) -- confirm against the original file.
 */
3108static long
3109reg_match_pos(VALUE re, VALUE *strp, long pos)
3110{
3111 VALUE str = *strp;
3112
3113 if (NIL_P(str)) {
3115 return -1;
3116 }
3117 *strp = str = reg_operand(str, TRUE);
3118 if (pos != 0) {
3119 if (pos < 0) {
3120 VALUE l = rb_str_length(str);
/* NOTE(review): NUM2INT here, while the same adjustment elsewhere in this
 * file uses NUM2LONG(rb_str_length(str)); pos is a long -- confirm whether
 * the int conversion is intended. */
3121 pos += NUM2INT(l);
3122 if (pos < 0) {
3123 return pos;
3124 }
3125 }
/* character position -> offset as computed by rb_str_offset() */
3126 pos = rb_str_offset(str, pos);
3127 }
3128 return rb_reg_search(re, str, pos, 0);
3129}
3130
3131/*
3132 * call-seq:
3133 * rxp =~ str -> integer or nil
3134 *
3135 * Match---Matches <i>rxp</i> against <i>str</i>.
3136 *
3137 * /at/ =~ "input data" #=> 7
3138 * /ax/ =~ "input data" #=> nil
3139 *
3140 * If <code>=~</code> is used with a regexp literal with named captures,
3141 * captured strings (or nil) are assigned to local variables named by
3142 * the capture names.
3143 *
3144 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y "
3145 * p lhs #=> "x"
3146 * p rhs #=> "y"
3147 *
3148 * If it is not matched, nil is assigned for the variables.
3149 *
3150 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = "
3151 * p lhs #=> nil
3152 * p rhs #=> nil
3153 *
3154 * This assignment is implemented in the Ruby parser.
3155 * The parser detects 'regexp-literal =~ expression' for the assignment.
3156 * The regexp must be a literal without interpolation and placed at left hand side.
3157 *
3158 * The assignment does not occur if the regexp is not a literal.
3159 *
3160 * re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3161 * re =~ " x = y "
3162 * p lhs # undefined local variable
3163 * p rhs # undefined local variable
3164 *
3165 * A regexp interpolation, <code>#{}</code>, also disables
3166 * the assignment.
3167 *
3168 * rhs_pat = /(?<rhs>\w+)/
3169 * /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
3170 * p lhs # undefined local variable
3171 *
3172 * The assignment does not occur if the regexp is placed at the right hand side.
3173 *
3174 * " x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
3175 * p lhs, rhs # undefined local variable
3176 *
3177 */
3178
3179VALUE
3181{
/*
 * reg_match_pos() yields an offset that rb_str_sublen() converts before
 * it is returned as a Fixnum; a negative offset means no match (nil).
 * NOTE(review): the line naming the function and its parameters (re, str)
 * is not present in this listing (numbering skips 3180).
 */
3182 long pos = reg_match_pos(re, &str, 0);
3183 if (pos < 0) return Qnil;
3184 pos = rb_str_sublen(str, pos);
3185 return LONG2FIX(pos);
3186}
3187
3188/*
3189 * call-seq:
3190 * rxp === str -> true or false
3191 *
3192 * Case Equality---Used in case statements.
3193 *
3194 * a = "HELLO"
3195 * case a
3196 * when /\A[a-z]*\z/; print "Lower case\n"
3197 * when /\A[A-Z]*\z/; print "Upper case\n"
3198 * else; print "Mixed case\n"
3199 * end
3200 * #=> "Upper case"
3201 *
3202 * Following a regular expression literal with the #=== operator allows you to
3203 * compare against a String.
3204 *
3205 * /^[a-z]*$/ === "HELLO" #=> false
3206 * /^[A-Z]*$/ === "HELLO" #=> true
3207 */
3208
3209VALUE
3211{
/*
 * The operand is converted without raising (reg_operand with FALSE); a
 * nil result means "not a string-like value" and yields false.
 * NOTE(review): the line naming the function and its parameters, and one
 * statement of the nil branch, are not present in this listing (numbering
 * skips 3210 and 3216).
 */
3212 long start;
3213
3214 str = reg_operand(str, FALSE);
3215 if (NIL_P(str)) {
3217 return Qfalse;
3218 }
3219 start = rb_reg_search(re, str, 0, 0);
3220 if (start < 0) {
3221 return Qfalse;
3222 }
3223 return Qtrue;
3224}
3225
3226
3227/*
3228 * call-seq:
3229 * ~ rxp -> integer or nil
3230 *
3231 * Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
3232 * Equivalent to <code><i>rxp</i> =~ $_</code>.
3233 *
3234 * $_ = "input data"
3235 * ~ /at/ #=> 7
3236 */
3237
3238VALUE
3240{
/*
 * Matches re against the value of rb_lastline_get(); a non-String last
 * line or a failed search yields nil, otherwise the match offset is
 * converted with rb_str_sublen() and returned as a Fixnum.
 * NOTE(review): the line naming the function and its parameter, and one
 * statement of the non-String branch, are not present in this listing
 * (numbering skips 3239 and 3245).
 */
3241 long start;
3242 VALUE line = rb_lastline_get();
3243
3244 if (!RB_TYPE_P(line, T_STRING)) {
3246 return Qnil;
3247 }
3248
3249 start = rb_reg_search(re, line, 0, 0);
3250 if (start < 0) {
3251 return Qnil;
3252 }
3253 start = rb_str_sublen(line, start);
3254 return LONG2FIX(start);
3255}
3256
3257
3258/*
3259 * call-seq:
3260 * rxp.match(str) -> matchdata or nil
3261 * rxp.match(str,pos) -> matchdata or nil
3262 *
3263 * Returns a MatchData object describing the match, or
3264 * <code>nil</code> if there was no match. This is equivalent to
3265 * retrieving the value of the special variable <code>$~</code>
3266 * following a normal match. If the second parameter is present, it
3267 * specifies the position in the string to begin the search.
3268 *
3269 * /(.)(.)(.)/.match("abc")[2] #=> "b"
3270 * /(.)(.)/.match("abc", 1)[2] #=> "c"
3271 *
3272 * If a block is given, the block is invoked with the MatchData when the match succeeds, so
3273 * that you can write
3274 *
3275 * /M(.*)/.match("Matz") do |m|
3276 * puts m[0]
3277 * puts m[1]
3278 * end
3279 *
3280 * instead of
3281 *
3282 * if m = /M(.*)/.match("Matz")
3283 * puts m[0]
3284 * puts m[1]
3285 * end
3286 *
3287 * The return value is a value from block execution in this case.
3288 */
3289
3290static VALUE
3291rb_reg_match_m(int argc, VALUE *argv, VALUE re)
3292{
/*
 * Accepts one required argument (the string) and one optional argument
 * (the start position, default 0).  On success the backref obtained from
 * rb_backref_get() is marked busy and either yielded to the block or
 * returned.
 * NOTE(review): one statement of the no-match branch is not present in
 * this listing (numbering skips 3305).
 */
3293 VALUE result, str, initpos;
3294 long pos;
3295
3296 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
3297 pos = NUM2LONG(initpos);
3298 }
3299 else {
3300 pos = 0;
3301 }
3302
3303 pos = reg_match_pos(re, &str, pos);
3304 if (pos < 0) {
3306 return Qnil;
3307 }
3308 result = rb_backref_get();
3309 rb_match_busy(result);
3310 if (!NIL_P(result) && rb_block_given_p()) {
3311 return rb_yield(result);
3312 }
3313 return result;
3314}
3315
3316/*
3317 * call-seq:
3318 * rxp.match?(str) -> true or false
3319 * rxp.match?(str,pos) -> true or false
3320 *
3321 * Returns <code>true</code> or <code>false</code> to indicate whether the
3322 * regexp matches, without updating $~ and other related variables.
3323 * If the second parameter is present, it specifies the position in the string
3324 * to begin the search.
3325 *
3326 * /R.../.match?("Ruby") #=> true
3327 * /R.../.match?("Ruby", 1) #=> false
3328 * /P.../.match?("Ruby") #=> false
3329 * $& #=> nil
3330 */
3331
3332static VALUE
3333rb_reg_match_m_p(int argc, VALUE *argv, VALUE re)
3334{
3335 long pos = rb_check_arity(argc, 1, 2) > 1 ? NUM2LONG(argv[1]) : 0;
3336 return rb_reg_match_p(re, argv[0], pos);
3337}
3338
/*
 * Tests whether re matches str at or after character position pos and
 * returns Qtrue or Qfalse.  A nil str, or a pos outside the string, gives
 * Qfalse.  A search error other than ONIG_MISMATCH is converted to a
 * message and reported through rb_reg_raise().
 * NOTE(review): several lines are not present in this listing (numbering
 * skips 3340, 3343, 3349, 3368 and 3375): the function name and
 * parameters, the declaration of err, the tail of the onig_search() call
 * and one statement of the tmpreg branch -- confirm against the original.
 */
3339VALUE
3341{
3342 regex_t *reg;
3344 OnigPosition result;
3345 const UChar *start, *end;
3346 int tmpreg;
3347
3348 if (NIL_P(str)) return Qfalse;
3350 if (pos) {
/* a negative pos counts from the end of the string */
3351 if (pos < 0) {
3352 pos += NUM2LONG(rb_str_length(str));
3353 if (pos < 0) return Qfalse;
3354 }
/* convert the character position into an offset from the string start */
3355 if (pos > 0) {
3356 long len = 1;
3357 const char *beg = rb_str_subpos(str, pos, &len);
3358 if (!beg) return Qfalse;
3359 pos = beg - RSTRING_PTR(str);
3360 }
3361 }
3362 reg = rb_reg_prepare_re0(re, str, err);
/* tmpreg: the prepared regex is not the one currently stored in re */
3363 tmpreg = reg != RREGEXP_PTR(re);
3364 if (!tmpreg) RREGEXP(re)->usecnt++;
3365 start = ((UChar*)RSTRING_PTR(str));
3366 end = start + RSTRING_LEN(str);
3367 result = onig_search(reg, start, end, start + pos, end,
3369 if (!tmpreg) RREGEXP(re)->usecnt--;
3370 if (tmpreg) {
/* stored regex still in use: drop the temporary; otherwise install it */
3371 if (RREGEXP(re)->usecnt) {
3372 onig_free(reg);
3373 }
3374 else {
3376 RREGEXP_PTR(re) = reg;
3377 }
3378 }
3379 if (result < 0) {
3380 if (result == ONIG_MISMATCH) {
3381 return Qfalse;
3382 }
3383 else {
3384 onig_error_code_to_str((UChar*)err, (int)result);
3385 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
3386 }
3387 }
3388 return Qtrue;
3389}
3390
3391/*
3392 * Document-method: compile
3393 *
3394 * Alias for Regexp.new
3395 */
3396
3397/*
3398 * call-seq:
3399 * Regexp.new(string, [options]) -> regexp
3400 * Regexp.new(regexp) -> regexp
3401 * Regexp.compile(string, [options]) -> regexp
3402 * Regexp.compile(regexp) -> regexp
3403 *
3404 * Constructs a new regular expression from +pattern+, which can be either a
3405 * String or a Regexp (in which case that regexp's options are propagated),
3406 * and new options may not be specified (a change as of Ruby 1.8).
3407 *
3408 * If +options+ is an Integer, it should be one or more of the constants
3409 * Regexp::EXTENDED, Regexp::IGNORECASE, and Regexp::MULTILINE,
3410 * <em>or</em>-ed together. Otherwise, if +options+ is not
3411 * +nil+ or +false+, the regexp will be case insensitive.
3412 *
3413 * r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
3414 * r2 = Regexp.new('cat', true) #=> /cat/i
3415 * r3 = Regexp.new(r2) #=> /cat/i
3416 * r4 = Regexp.new('dog', Regexp::EXTENDED | Regexp::IGNORECASE) #=> /dog/ix
3417 */
3418
3419static VALUE
3420rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
3421{
3422 int flags = 0;
3423 VALUE str;
3424 rb_encoding *enc = 0;
3425
3426 rb_check_arity(argc, 1, 3);
3427 if (RB_TYPE_P(argv[0], T_REGEXP)) {
3428 VALUE re = argv[0];
3429
3430 if (argc > 1) {
3431 rb_warn("flags ignored");
3432 }
3433 rb_reg_check(re);
3434 flags = rb_reg_options(re);
3435 str = RREGEXP_SRC(re);
3436 }
3437 else {
3438 if (argc >= 2) {
3439 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
3440 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
3441 }
3442 if (argc == 3 && !NIL_P(argv[2])) {
3443 char *kcode = StringValuePtr(argv[2]);
3444 if (kcode[0] == 'n' || kcode[0] == 'N') {
3445 enc = rb_ascii8bit_encoding();
3446 flags |= ARG_ENCODING_NONE;
3447 }
3448 else {
3449 rb_warn("encoding option is ignored - %s", kcode);
3450 }
3451 }
3452 str = StringValue(argv[0]);
3453 }
3454 if (enc && rb_enc_get(str) != enc)
3455 rb_reg_init_str_enc(self, str, enc, flags);
3456 else
3457 rb_reg_init_str(self, str, flags);
3458 return self;
3459}
3460
/*
 * Returns a string in which regexp metacharacters of str are escaped with
 * a backslash; space, tab, newline, CR, FF and VT become "\ ", "\t", "\n",
 * "\r", "\f" and "\v".  When no metacharacter is found a copy made with
 * rb_str_new3() is returned.
 * NOTE(review): several lines are not present in this listing (numbering
 * skips 3462, 3491, 3498 and 3505): the function name and parameter, the
 * bodies of both ascii_only branches, and the statement that copies the
 * prefix before the first metacharacter -- confirm against the original.
 */
3461VALUE
3463{
3464 rb_encoding *enc = rb_enc_get(str);
3465 char *s, *send, *t;
3466 VALUE tmp;
3467 int c, clen;
3468 int ascii_only = rb_enc_str_asciionly_p(str);
3469
3470 s = RSTRING_PTR(str);
3471 send = s + RSTRING_LEN(str);
/* first pass: find the first character that needs escaping */
3472 while (s < send) {
3473 c = rb_enc_ascget(s, send, &clen, enc);
3474 if (c == -1) {
3475 s += mbclen(s, send, enc);
3476 continue;
3477 }
3478 switch (c) {
3479 case '[': case ']': case '{': case '}':
3480 case '(': case ')': case '|': case '-':
3481 case '*': case '.': case '\\':
3482 case '?': case '+': case '^': case '$':
3483 case ' ': case '#':
3484 case '\t': case '\f': case '\v': case '\n': case '\r':
3485 goto meta_found;
3486 }
3487 s += clen;
3488 }
3489 tmp = rb_str_new3(str);
3490 if (ascii_only) {
3492 }
3493 return tmp;
3494
3495 meta_found:
/* twice the input length is reserved for the escaped output */
3496 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
3497 if (ascii_only) {
3499 }
3500 else {
3501 rb_enc_copy(tmp, str);
3502 }
3503 t = RSTRING_PTR(tmp);
3504 /* copy up to the metacharacter */
3506 t += s - RSTRING_PTR(str);
3507
/* second pass: s points at the first metacharacter, t at the output position */
3508 while (s < send) {
3509 c = rb_enc_ascget(s, send, &clen, enc);
3510 if (c == -1) {
3511 int n = mbclen(s, send, enc);
3512
3513 while (n--)
3514 *t++ = *s++;
3515 continue;
3516 }
3517 s += clen;
3518 switch (c) {
/* these get a backslash and then the character itself (written after the switch) */
3519 case '[': case ']': case '{': case '}':
3520 case '(': case ')': case '|': case '-':
3521 case '*': case '.': case '\\':
3522 case '?': case '+': case '^': case '$':
3523 case '#':
3524 t += rb_enc_mbcput('\\', t, enc);
3525 break;
/* whitespace is written as a two-character escape and the character is skipped */
3526 case ' ':
3527 t += rb_enc_mbcput('\\', t, enc);
3528 t += rb_enc_mbcput(' ', t, enc);
3529 continue;
3530 case '\t':
3531 t += rb_enc_mbcput('\\', t, enc);
3532 t += rb_enc_mbcput('t', t, enc);
3533 continue;
3534 case '\n':
3535 t += rb_enc_mbcput('\\', t, enc);
3536 t += rb_enc_mbcput('n', t, enc);
3537 continue;
3538 case '\r':
3539 t += rb_enc_mbcput('\\', t, enc);
3540 t += rb_enc_mbcput('r', t, enc);
3541 continue;
3542 case '\f':
3543 t += rb_enc_mbcput('\\', t, enc);
3544 t += rb_enc_mbcput('f', t, enc);
3545 continue;
3546 case '\v':
3547 t += rb_enc_mbcput('\\', t, enc);
3548 t += rb_enc_mbcput('v', t, enc);
3549 continue;
3550 }
3551 t += rb_enc_mbcput(c, t, enc);
3552 }
/* shrink to the number of bytes actually written */
3553 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
3554 return tmp;
3555}
3556
3557
3558/*
3559 * call-seq:
3560 * Regexp.escape(str) -> string
3561 * Regexp.quote(str) -> string
3562 *
3563 * Escapes any characters that would have special meaning in a regular
3564 * expression. Returns a new escaped string with the same or compatible
3565 * encoding. For any string,
3566 * <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
3567 *
3568 * Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
3569 *
3570 */
3571
3572static VALUE
3573rb_reg_s_quote(VALUE c, VALUE str)
3574{
3575 return rb_reg_quote(reg_operand(str, TRUE));
3576}
3577
/*
 * Returns the option bits of re: the compiled options masked with
 * ARG_REG_OPTION_MASK, plus ARG_ENCODING_FIXED / ARG_ENCODING_NONE when
 * the KCODE_FIXED / REG_ENCODING_NONE flags are set on the object.
 * NOTE(review): the line naming the function and its parameter is not
 * present in this listing (numbering skips 3579) -- confirm against the
 * original file.
 */
3578int
3580{
3581 int options;
3582
3583 rb_reg_check(re);
3584 options = RREGEXP_PTR(re)->options & ARG_REG_OPTION_MASK;
3585 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
3586 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
3587 return options;
3588}
3589
/*
 * Converts re with rb_check_convert_type() using the "to_regexp" method
 * and returns that call's result.
 * NOTE(review): the line naming the function and its parameter is not
 * present in this listing (numbering skips 3591) -- confirm against the
 * original file.
 */
3590VALUE
3592{
3593 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
3594}
3595
3596/*
3597 * call-seq:
3598 * Regexp.try_convert(obj) -> re or nil
3599 *
3600 * Try to convert <i>obj</i> into a Regexp, using to_regexp method.
3601 * Returns converted regexp or nil if <i>obj</i> cannot be converted
3602 * for any reason.
3603 *
3604 * Regexp.try_convert(/re/) #=> /re/
3605 * Regexp.try_convert("re") #=> nil
3606 *
3607 * o = Object.new
3608 * Regexp.try_convert(o) #=> nil
3609 * def o.to_regexp() /foo/ end
3610 * Regexp.try_convert(o) #=> /foo/
3611 *
3612 */
3613static VALUE
3614rb_reg_s_try_convert(VALUE dummy, VALUE re)
3615{
3616 return rb_check_regexp_type(re);
3617}
3618
/*
 * Builds the union of the patterns in the array args0.
 *   - no element:   a Regexp created from "(?!)"
 *   - one element:  re itself when it is not nil, else a Regexp built
 *                   from the quoted element
 *   - otherwise:    the pieces are joined with "|" into one source string
 *                   and a Regexp is created from it; the encodings of the
 *                   pieces must be compatible, else ArgumentError is raised.
 * NOTE(review): the statements that assign re and v are not present in
 * this listing (numbering skips 3631 and 3656) -- confirm against the
 * original file.
 */
3619static VALUE
3620rb_reg_s_union(VALUE self, VALUE args0)
3621{
3622 long argc = RARRAY_LEN(args0);
3623
3624 if (argc == 0) {
3625 VALUE args[1];
3626 args[0] = rb_str_new2("(?!)");
3627 return rb_class_new_instance(1, args, rb_cRegexp);
3628 }
3629 else if (argc == 1) {
3630 VALUE arg = rb_ary_entry(args0, 0);
3632 if (!NIL_P(re))
3633 return re;
3634 else {
3635 VALUE quoted;
3636 quoted = rb_reg_s_quote(Qnil, arg);
3637 return rb_reg_new_str(quoted, 0);
3638 }
3639 }
3640 else {
3641 int i;
3642 VALUE source = rb_str_buf_new(0);
3643 rb_encoding *result_enc;
3644
/* what kinds of pieces have been seen so far */
3645 int has_asciionly = 0;
3646 rb_encoding *has_ascii_compat_fixed = 0;
3647 rb_encoding *has_ascii_incompat = 0;
3648
3649 for (i = 0; i < argc; i++) {
3650 volatile VALUE v;
3651 VALUE e = rb_ary_entry(args0, i);
3652
3653 if (0 < i)
3654 rb_str_buf_cat_ascii(source, "|");
3655
/* non-nil v: the element is classified by the encoding of v */
3657 if (!NIL_P(v)) {
3658 rb_encoding *enc = rb_enc_get(v);
3659 if (!rb_enc_asciicompat(enc)) {
3660 if (!has_ascii_incompat)
3661 has_ascii_incompat = enc;
3662 else if (has_ascii_incompat != enc)
3663 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3664 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3665 }
3666 else if (rb_reg_fixed_encoding_p(v)) {
3667 if (!has_ascii_compat_fixed)
3668 has_ascii_compat_fixed = enc;
3669 else if (has_ascii_compat_fixed != enc)
3670 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3671 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3672 }
3673 else {
3674 has_asciionly = 1;
3675 }
3676 v = rb_reg_str_with_term(v, -1);
3677 }
/* otherwise the element is treated as a string and quoted */
3678 else {
3679 rb_encoding *enc;
3680 StringValue(e);
3681 enc = rb_enc_get(e);
3682 if (!rb_enc_asciicompat(enc)) {
3683 if (!has_ascii_incompat)
3684 has_ascii_incompat = enc;
3685 else if (has_ascii_incompat != enc)
3686 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3687 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
3688 }
3689 else if (rb_enc_str_asciionly_p(e)) {
3690 has_asciionly = 1;
3691 }
3692 else {
3693 if (!has_ascii_compat_fixed)
3694 has_ascii_compat_fixed = enc;
3695 else if (has_ascii_compat_fixed != enc)
3696 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3697 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
3698 }
3699 v = rb_reg_s_quote(Qnil, e);
3700 }
/* an ASCII incompatible piece cannot be mixed with any other kind */
3701 if (has_ascii_incompat) {
3702 if (has_asciionly) {
3703 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
3704 rb_enc_name(has_ascii_incompat));
3705 }
3706 if (has_ascii_compat_fixed) {
3707 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
3708 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
3709 }
3710 }
3711
3712 if (i == 0) {
3713 rb_enc_copy(source, v);
3714 }
3715 rb_str_append(source, v);
3716 }
3717
/* result encoding: incompatible one, else fixed compatible one, else ASCII-8BIT */
3718 if (has_ascii_incompat) {
3719 result_enc = has_ascii_incompat;
3720 }
3721 else if (has_ascii_compat_fixed) {
3722 result_enc = has_ascii_compat_fixed;
3723 }
3724 else {
3725 result_enc = rb_ascii8bit_encoding();
3726 }
3727
3728 rb_enc_associate(source, result_enc);
3729 return rb_class_new_instance(1, &source, rb_cRegexp);
3730 }
3731}
3732
3733/*
3734 * call-seq:
3735 * Regexp.union(pat1, pat2, ...) -> new_regexp
3736 * Regexp.union(pats_ary) -> new_regexp
3737 *
3738 * Return a Regexp object that is the union of the given
3739 * <em>pattern</em>s, i.e., will match any of its parts. The
3740 * <em>pattern</em>s can be Regexp objects, in which case their
3741 * options will be preserved, or Strings. If no patterns are given,
3742 * returns <code>/(?!)/</code>. The behavior is unspecified if any
3743 * given <em>pattern</em> contains capture.
3744 *
3745 * Regexp.union #=> /(?!)/
3746 * Regexp.union("penzance") #=> /penzance/
3747 * Regexp.union("a+b*c") #=> /a\+b\*c/
3748 * Regexp.union("skiing", "sledding") #=> /skiing|sledding/
3749 * Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
3750 * Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
3751 *
3752 * Note: ::union will try to convert each argument into a regular
3753 * expression via #to_regexp.
3754 */
3755static VALUE
3756rb_reg_s_union_m(VALUE self, VALUE args)
3757{
3758 VALUE v;
3759 if (RARRAY_LEN(args) == 1 &&
3760 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
3761 return rb_reg_s_union(self, v);
3762 }
3763 return rb_reg_s_union(self, args);
3764}
3765
3766/* :nodoc: */
3767static VALUE
3768rb_reg_init_copy(VALUE copy, VALUE re)
3769{
3770 if (!OBJ_INIT_COPY(copy, re)) return copy;
3771 rb_reg_check(re);
3772 return rb_reg_init_str(copy, RREGEXP_SRC(re), rb_reg_options(re));
3773}
3774
/*
 * Expands the backslash sequences of the replacement template str using
 * the match registers regs taken over the subject string src:
 * digits 1-9 (numbered groups), k<name> (named group), 0 and & (whole
 * match), ` (text before the match), ' (text after the match), + (last
 * matched group) and a doubled backslash (one literal backslash).
 * Returns str itself when nothing had to be expanded, otherwise a new
 * string.
 * NOTE(review): the line naming the function and declaring str, src, regs
 * and regexp is not present in this listing (numbering skips 3776) --
 * confirm against the original file.
 */
3775VALUE
3777{
3778 VALUE val = 0;
3779 char *p, *s, *e;
3780 int no, clen;
3781 rb_encoding *str_enc = rb_enc_get(str);
3782 rb_encoding *src_enc = rb_enc_get(src);
3783 int acompat = rb_enc_asciicompat(str_enc);
/* ASCGET: ASCII character at s or -1; byte-wise shortcut for ASCII compatible encodings */
3784#define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
3785
/* p: start of the text not yet copied to val; s: scan position; e: end */
3786 p = s = RSTRING_PTR(str);
3787 e = s + RSTRING_LEN(str);
3788
3789 while (s < e) {
3790 int c = ASCGET(s, e, &clen);
3791 char *ss;
3792
3793 if (c == -1) {
3794 s += mbclen(s, e, str_enc);
3795 continue;
3796 }
3797 ss = s;
3798 s += clen;
3799
/* only a backslash that is not the last character starts a sequence */
3800 if (c != '\\' || s == e) continue;
3801
/* val is created lazily, on the first backslash sequence */
3802 if (!val) {
3803 val = rb_str_buf_new(ss-p);
3804 }
3805 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
3806
3807 c = ASCGET(s, e, &clen);
3808 if (c == -1) {
3809 s += mbclen(s, e, str_enc);
3810 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3811 p = s;
3812 continue;
3813 }
3814 s += clen;
3815
3816 p = s;
3817 switch (c) {
3818 case '1': case '2': case '3': case '4':
3819 case '5': case '6': case '7': case '8': case '9':
3820 if (!NIL_P(regexp) && onig_noname_group_capture_is_active(RREGEXP_PTR(regexp))) {
3821 no = c - '0';
3822 }
3823 else {
3824 continue;
3825 }
3826 break;
3827
3828 case 'k':
3829 if (s < e && ASCGET(s, e, &clen) == '<') {
3830 char *name, *name_end;
3831
3832 name_end = name = s + clen;
3833 while (name_end < e) {
3834 c = ASCGET(name_end, e, &clen);
3835 if (c == '>') break;
3836 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
3837 }
3838 if (name_end < e) {
3839 VALUE n = rb_str_subseq(str, (long)(name - RSTRING_PTR(str)),
3840 (long)(name_end - name));
3841 if ((no = NAME_TO_NUMBER(regs, regexp, n, name, name_end)) < 1) {
3842 name_to_backref_error(n);
3843 }
3844 p = s = name_end + clen;
3845 break;
3846 }
3847 else {
3848 rb_raise(rb_eRuntimeError, "invalid group name reference format");
3849 }
3850 }
3851
/* "k" not followed by '<': the two characters are copied unchanged */
3852 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3853 continue;
3854
3855 case '0':
3856 case '&':
3857 no = 0;
3858 break;
3859
3860 case '`':
3861 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
3862 continue;
3863
3864 case '\'':
3865 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
3866 continue;
3867
3868 case '+':
/* highest numbered group that took part in the match; none means nothing is inserted */
3869 no = regs->num_regs-1;
3870 while (BEG(no) == -1 && no > 0) no--;
3871 if (no == 0) continue;
3872 break;
3873
3874 case '\\':
3875 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
3876 continue;
3877
3878 default:
3879 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
3880 continue;
3881 }
3882
/* insert group no when it exists and took part in the match */
3883 if (no >= 0) {
3884 if (no >= regs->num_regs) continue;
3885 if (BEG(no) == -1) continue;
3886 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
3887 }
3888 }
3889
3890 if (!val) return str;
/* copy the remaining text after the last sequence */
3891 if (p < e) {
3892 rb_enc_str_buf_cat(val, p, e-p, str_enc);
3893 }
3894
3895 return val;
3896}
3897
3898static VALUE
3899kcode_getter(ID _x, VALUE *_y)
3900{
3901 rb_warn("variable $KCODE is no longer effective");
3902 return Qnil;
3903}
3904
3905static void
3906kcode_setter(VALUE val, ID id, VALUE *_)
3907{
3908 rb_warn("variable $KCODE is no longer effective; ignored");
3909}
3910
3911static VALUE
3912ignorecase_getter(ID _x, VALUE *_y)
3913{
3914 rb_warn("variable $= is no longer effective");
3915 return Qfalse;
3916}
3917
3918static void
3919ignorecase_setter(VALUE val, ID id, VALUE *_)
3920{
3921 rb_warn("variable $= is no longer effective; ignored");
3922}
3923
3924static VALUE
3925match_getter(void)
3926{
3927 VALUE match = rb_backref_get();
3928
3929 if (NIL_P(match)) return Qnil;
3930 rb_match_busy(match);
3931 return match;
3932}
3933
3934static VALUE
3935get_LAST_MATCH_INFO(ID _x, VALUE *_y)
3936{
3937 return match_getter();
3938}
3939
3940static void
3941match_setter(VALUE val, ID _x, VALUE *_y)
3942{
3943 if (!NIL_P(val)) {
3944 Check_Type(val, T_MATCH);
3945 }
3946 rb_backref_set(val);
3947}
3948
3949/*
3950 * call-seq:
3951 * Regexp.last_match -> matchdata
3952 * Regexp.last_match(n) -> str
3953 *
3954 * The first form returns the MatchData object generated by the
3955 * last successful pattern match. Equivalent to reading the special global
3956 * variable <code>$~</code> (see Special global variables in Regexp for
3957 * details).
3958 *
3959 * The second form returns the <i>n</i>th field in this MatchData object.
3960 * _n_ can be a string or symbol to reference a named capture.
3961 *
3962 * Note that the last_match is local to the thread and method scope of the
3963 * method that did the pattern match.
3964 *
3965 * /c(.)t/ =~ 'cat' #=> 0
3966 * Regexp.last_match #=> #<MatchData "cat" 1:"a">
3967 * Regexp.last_match(0) #=> "cat"
3968 * Regexp.last_match(1) #=> "a"
3969 * Regexp.last_match(2) #=> nil
3970 *
3971 * /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
3972 * Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val">
3973 * Regexp.last_match(:lhs) #=> "var"
3974 * Regexp.last_match(:rhs) #=> "val"
3975 */
3976
3977static VALUE
3978rb_reg_s_last_match(int argc, VALUE *argv, VALUE _)
3979{
3980 if (rb_check_arity(argc, 0, 1) == 1) {
3981 VALUE match = rb_backref_get();
3982 int n;
3983 if (NIL_P(match)) return Qnil;
3984 n = match_backref_number(match, argv[0]);
3985 return rb_reg_nth_match(n, match);
3986 }
3987 return match_getter();
3988}
3989
/*
 * Warning callback that forwards a message to rb_warn().
 *
 * The text is passed through a "%s" format so that any '%' characters
 * in the message are printed literally rather than interpreted.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
3995
3996/*
3997 * Document-class: RegexpError
3998 *
3999 * Raised when given an invalid regular expression.
4000 *
4001 * Regexp.new("?")
4002 *
4003 * <em>raises the exception:</em>
4004 *
4005 * RegexpError: target of repeat operator is not specified: /?/
4006 */
4007
4008/*
4009 * Document-class: Regexp
4010 *
4011 * A Regexp holds a regular expression, used to match a pattern
4012 * against strings. Regexps are created using the <code>/.../</code>
4013 * and <code>%r{...}</code> literals, and by the Regexp::new
4014 * constructor.
4015 *
4016 * :include: doc/regexp.rdoc
4017 */
4018
/*
 * Initialization routine for the regexp subsystem: installs warning
 * callbacks, defines the match-related virtual global variables, and
 * registers the methods of Regexp and MatchData.
 *
 * NOTE(review): this listing is incomplete.  The embedded line numbers
 * skip 4020, 4022, 4024, 4038, 4040, 4052-4054, 4068, 4070, 4072, 4074,
 * 4076, 4082 and 4098, so the function-name line and several statements
 * are not visible here.  The symbol index at the end of this page lists
 * "void Init_Regexp(void)" at re.c:4020.  Restore the missing lines from
 * the real source before compiling.
 */
4019void
4021{
4023
/* Route regex-engine warnings (normal and verbose) through re_warn(). */
4025 onig_set_warn_func(re_warn);
4026 onig_set_verb_warn_func(re_warn);
4027
/* Match-related special variables; a 0 setter means no setter is supplied. */
4028 rb_define_virtual_variable("$~", get_LAST_MATCH_INFO, match_setter);
4029 rb_define_virtual_variable("$&", last_match_getter, 0);
4030 rb_define_virtual_variable("$`", prematch_getter, 0);
4031 rb_define_virtual_variable("$'", postmatch_getter, 0);
4032 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
4033
/* Obsolete variables: their hooks only warn (see kcode_* / ignorecase_*). */
4034 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
4035 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
4036 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
4037
/* Regexp class: allocator and singleton methods ("escape" shares "quote"'s implementation). */
4039 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
4041 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
4042 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
4043 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
4044 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
4045 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
4046
/* Regexp instance methods ("eql?" and "==" share rb_reg_equal). */
4047 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
4048 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
4049 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
4050 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
4051 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
4055 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
4056 rb_define_method(rb_cRegexp, "match?", rb_reg_match_m_p, -1);
4057 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
4058 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
4059 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
4060 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
4061 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
4062 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
4063 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
4064 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
4065 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
4066
/* NOTE(review): each comment below presumably documents a statement on the
 * following (missing) listing line -- confirm against the real source. */
4067 /* see Regexp.options and Regexp.new */
4069 /* see Regexp.options and Regexp.new */
4071 /* see Regexp.options and Regexp.new */
4073 /* see Regexp.options and Regexp.new */
4075 /* see Regexp.options and Regexp.new */
4077
/* Presumably registers reg_cache so the GC keeps it alive -- TODO confirm. */
4078 rb_global_variable(&reg_cache);
4079
/* MatchData class: allocator is installed, and "allocate" is removed from the
 * class object's own class. */
4080 rb_cMatch = rb_define_class("MatchData", rb_cObject);
4081 rb_define_alloc_func(rb_cMatch, match_alloc);
4083 rb_undef_method(CLASS_OF(rb_cMatch), "allocate");
4084
/* MatchData instance methods ("size"/"length" and "eql?"/"==" are aliases). */
4085 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
4086 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
4087 rb_define_method(rb_cMatch, "names", match_names, 0);
4088 rb_define_method(rb_cMatch, "size", match_size, 0);
4089 rb_define_method(rb_cMatch, "length", match_size, 0);
4090 rb_define_method(rb_cMatch, "offset", match_offset, 1);
4091 rb_define_method(rb_cMatch, "begin", match_begin, 1);
4092 rb_define_method(rb_cMatch, "end", match_end, 1);
4093 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
4094 rb_define_method(rb_cMatch, "[]", match_aref, -1);
4095 rb_define_method(rb_cMatch, "captures", match_captures, 0);
4096 rb_define_method(rb_cMatch, "named_captures", match_named_captures, 0);
4097 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
4099 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
4100 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
4101 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
4102 rb_define_method(rb_cMatch, "string", match_string, 0);
4103 rb_define_method(rb_cMatch, "hash", match_hash, 0);
4104 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
4105 rb_define_method(rb_cMatch, "==", match_equal, 1);
4106}
#define range(low, item, hi)
Definition: date_strftime.c:21
enum @73::@75::@76 mask
struct RIMemo * ptr
Definition: debug.c:65
#define ENCINDEX_Windows_31J
Definition: encindex.h:53
#define ENCINDEX_EUC_JP
Definition: encindex.h:52
st_table * names
Definition: encoding.c:59
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:131
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1032
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:866
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1328
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1316
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1512
int rb_utf8_encindex(void)
Definition: encoding.c:1334
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:872
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1322
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:521
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:990
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1427
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1004
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1340
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:116
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1044
#define ENC_CODERANGE_7BIT
Definition: encoding.h:104
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:222
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:217
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:107
int rb_enc_str_coderange(VALUE)
Definition: string.c:657
#define ENC_CODERANGE(obj)
Definition: encoding.h:108
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:103
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:796
#define rb_enc_name(enc)
Definition: encoding.h:177
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:181
#define ENCODING_GET(obj)
Definition: encoding.h:62
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:208
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Definition: string.c:2919
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:192
#define rb_enc_asciicompat(enc)
Definition: encoding.h:245
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2870
int rb_enc_str_asciionly_p(VALUE)
Definition: string.c:678
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:193
#define rb_enc_isprint(c, enc)
Definition: encoding.h:236
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:194
#define rb_enc_mbminlen(enc)
Definition: encoding.h:180
long rb_enc_strlen(const char *, const char *, rb_encoding *)
Definition: string.c:1740
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:106
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:567
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:191
#define rb_enc_isspace(c, enc)
Definition: encoding.h:237
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
void rb_memerror(void)
Definition: gc.c:9611
VALUE rb_define_class(const char *, VALUE)
Defines a top-level class.
Definition: class.c:662
void rb_undef_method(VALUE, const char *)
Definition: class.c:1593
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:898
VALUE rb_cObject
Object class.
Definition: ruby.h:2012
VALUE rb_eRegexpError
Definition: re.c:20
struct RBasic basic
Definition: ruby.h:1113
const VALUE src
Definition: ruby.h:1115
VALUE rb_cRegexp
Definition: re.c:2290
VALUE rb_cMatch
Definition: re.c:930
unsigned long usecnt
Definition: ruby.h:1116
struct re_pattern_buffer * ptr
Definition: ruby.h:1114
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2671
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:668
void rb_bug(const char *fmt,...)
Definition: error.c:636
VALUE rb_eStandardError
Definition: error.c:921
void rb_set_errinfo(VALUE err)
Sets the current exception ($!) to the given value.
Definition: eval.c:1896
VALUE rb_eTypeError
Definition: error.c:924
VALUE rb_eEncCompatError
Definition: error.c:931
VALUE rb_eRuntimeError
Definition: error.c:922
void rb_warn(const char *fmt,...)
Definition: error.c:315
VALUE rb_eArgError
Definition: error.c:925
VALUE rb_eIndexError
Definition: error.c:926
VALUE rb_eSecurityError
Definition: error.c:933
VALUE rb_check_convert_type(VALUE, int, const char *, const char *)
Tries to convert an object into another type.
Definition: object.c:2941
VALUE rb_any_to_s(VALUE)
Default implementation of #to_s.
Definition: object.c:527
VALUE rb_class_new_instance(int, const VALUE *, VALUE)
Allocates and initializes an instance of klass.
Definition: object.c:1955
VALUE rb_obj_class(VALUE)
Equivalent to Object#class in Ruby.
Definition: object.c:217
const char term
Definition: id.c:37
verbose(int level, const char *format,...)
Definition: mjit_worker.c:303
#define char_size(c2, c1)
Definition: nkf.c:3824
const char * name
Definition: nkf.c:208
#define ONIG_MISMATCH
Definition: onigmo.h:625
#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end)
Definition: onigmo.h:336
ONIG_EXTERN int onig_reg_init(OnigRegex reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType *syntax)
ONIG_EXTERN int onig_error_code_to_str(OnigUChar *s, OnigPosition err_code,...)
ONIG_EXTERN OnigUChar * onigenc_get_right_adjust_char_head(OnigEncoding enc, const OnigUChar *start, const OnigUChar *s, const OnigUChar *end)
unsigned char OnigUChar
Definition: onigmo.h:79
#define ONIG_MAX_ERROR_MESSAGE_LEN
Definition: onigmo.h:443
ONIG_EXTERN int onig_new(OnigRegex *, const OnigUChar *pattern, const OnigUChar *pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType *syntax, OnigErrorInfo *einfo)
ONIG_EXTERN int onig_region_resize(OnigRegion *region, int n)
Definition: regexec.c:248
ONIG_EXTERN void onig_region_free(OnigRegion *region, int free_self)
Definition: regexec.c:343
#define ONIG_OPTION_MULTILINE
Definition: onigmo.h:453
#define UChar
Definition: onigmo.h:76
ONIG_EXTERN OnigPosition onig_search(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *start, const OnigUChar *range, OnigRegion *region, OnigOptionType option)
ptrdiff_t OnigPosition
Definition: onigmo.h:83
#define ONIGENC_CASE_FOLD_DEFAULT
Definition: onigmo.h:131
#define ONIG_OPTION_IGNORECASE
Definition: onigmo.h:451
#define ONIGERR_MEMORY
Definition: onigmo.h:629
#define ONIG_ENCODING_ASCII
Definition: onigmo.h:225
ONIG_EXTERN void onig_free(OnigRegex)
ONIG_EXTERN int onigenc_set_default_encoding(OnigEncoding enc)
Definition: regenc.c:48
ONIG_EXTERN const OnigSyntaxType * OnigDefaultSyntax
Definition: onigmo.h:515
#define ONIGENC_MBC_MAXLEN(enc)
Definition: onigmo.h:362
ONIG_EXTERN void onig_set_verb_warn_func(OnigWarnFunc f)
Definition: regparse.c:106
ONIG_EXTERN OnigPosition onig_match(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *at, OnigRegion *region, OnigOptionType option)
unsigned int OnigOptionType
Definition: onigmo.h:445
ONIG_EXTERN int onig_foreach_name(OnigRegex reg, int(*func)(const OnigUChar *, const OnigUChar *, int, int *, OnigRegex, void *), void *arg)
ONIG_EXTERN void onig_region_copy(OnigRegion *to, const OnigRegion *from)
Definition: regexec.c:359
ONIG_EXTERN int onig_name_to_backref_number(OnigRegex reg, const OnigUChar *name, const OnigUChar *name_end, const OnigRegion *region)
ONIG_EXTERN int onig_noname_group_capture_is_active(const OnigRegexType *reg)
Definition: regparse.c:963
#define ONIG_OPTION_EXTEND
Definition: onigmo.h:452
ONIG_EXTERN int onig_number_of_names(const OnigRegexType *reg)
Definition: regparse.c:623
#define ONIG_OPTION_NONE
Definition: onigmo.h:450
ONIG_EXTERN void onig_set_warn_func(OnigWarnFunc f)
Definition: regparse.c:101
#define RARRAY_LEN(a)
void * memchr(const void *, int, size_t)
#define rb_str_new2
#define MEMCPY(p1, p2, type, n)
#define RREGEXP(obj)
VALUE rb_str_to_str(VALUE)
Definition: string.c:1382
#define NULL
int memcmp(const void *, const void *, size_t)
Definition: memcmp.c:7
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2709
use StringValue() instead")))
#define RSTRING_LEN(str)
#define rb_str_buf_cat2
#define _(args)
#define NEWOBJ_OF(obj, type, klass, flags)
#define RTEST(v)
void rb_define_virtual_variable(const char *, rb_gvar_getter_t *, rb_gvar_setter_t *)
Definition: variable.c:511
#define ALLOCA_N(type, n)
#define rb_hash_uint(h, i)
#define OBJ_INIT_COPY(obj, orig)
long rb_str_offset(VALUE, long)
Definition: string.c:2416
#define FL_TEST(x, f)
#define RBASIC(obj)
#define T_STRING
#define rb_hash_end(h)
long int ptrdiff_t
void * malloc(size_t) __attribute__((__malloc__)) __attribute__((__warn_unused_result__)) __attribute__((__alloc_size__(1)))
VALUE rb_assoc_new(VALUE, VALUE)
Definition: array.c:896
VALUE rb_backref_get(void)
Definition: vm.c:1304
#define StringValuePtr(v)
#define RREGEXP_SRC_LEN(r)
char * rb_str_subpos(VALUE, long, long *)
Definition: string.c:2497
#define LONG2FIX(i)
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1278
#define CHAR_BIT
#define RSTRING_END(str)
const VALUE VALUE obj
#define rb_check_frozen(obj)
#define FL_SET(x, f)
#define RSTRING_PTR(str)
#define rb_str_buf_new2
int snprintf(char *__restrict__, size_t, const char *__restrict__,...) __attribute__((__format__(__printf__
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1444
#define rb_str_new(str, len)
#define NIL_P(v)
#define rb_str_buf_cat
#define REALLOC_N(var, type, n)
const char size_t n
#define ruby_verbose
#define MEMZERO(p, type, n)
st_index_t rb_hash_start(st_index_t)
Definition: random.c:1438
#define MEMO_NEW(a, b, c)
unsigned long VALUE
VALUE rb_ary_push(VALUE, VALUE)
Definition: array.c:1195
__inline__ const void *__restrict__ src
VALUE rb_sym2str(VALUE)
Definition: symbol.c:784
VALUE rb_str_buf_new(long)
Definition: string.c:1315
VALUE rb_check_string_type(VALUE)
Definition: string.c:2314
VALUE rb_str_inspect(VALUE)
Definition: string.c:5930
#define rp(obj)
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define FL_UNSET(x, f)
uint32_t i
double y0(double)
#define char
VALUE rb_fstring(VALUE)
Definition: string.c:312
__inline__ const void *__restrict__ size_t len
VALUE rb_class_path(VALUE)
Definition: variable.c:153
void rb_gc(void)
Definition: gc.c:8695
void rb_global_variable(VALUE *)
Definition: gc.c:7128
#define INT2NUM(x)
#define ZALLOC(type)
#define RB_OBJ_WRITE(a, slot, b)
VALUE rb_hash_new_with_size(st_index_t size)
Definition: hash.c:1529
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:5815
#define rb_str_new3
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2891
#define NUM2INT(x)
void rb_define_singleton_method(VALUE, const char *, VALUE(*)(), int)
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3267
#define RB_GC_GUARD(v)
#define RREGEXP_PTR(r)
#define PRIsVALUE
void * memset(void *, int, size_t)
#define FIX2INT(x)
int VALUE v
#define rb_scan_args(argc, argvp, fmt,...)
#define rb_exc_new3
VALUE rb_ary_new_capa(long capa)
Definition: array.c:717
VALUE rb_str_catf(VALUE, const char *,...) __attribute__((format(printf
void * bsearch(const void *__key, const void *__base, size_t __nmemb, size_t __size, __compar_fn_t _compar)
#define TRUE
#define FALSE
VALUE rb_ary_resize(VALUE ary, long len)
expands or shrinks ary to len elements.
Definition: array.c:1955
#define MEMO_CAST(m)
#define Qtrue
VALUE rb_str_subseq(VALUE, long, long)
Definition: string.c:2474
struct rb_call_cache buf
VALUE rb_str_append(VALUE, VALUE)
Definition: string.c:2965
VALUE rb_str_buf_cat_ascii(VALUE, const char *)
Definition: string.c:2926
#define Qnil
#define Qfalse
void * memcpy(void *__restrict__, const void *__restrict__, size_t)
VALUE rb_str_buf_append(VALUE, VALUE)
Definition: string.c:2950
#define RREGEXP_SRC_PTR(r)
void * memmem(const void *, size_t, const void *, size_t)
VALUE rb_lastline_get(void)
Definition: vm.c:1316
st_data_t st_index_t
#define RB_TYPE_P(obj, type)
#define FL_WB_PROTECTED
#define INT2FIX(i)
VALUE rb_check_array_type(VALUE)
Definition: array.c:909
#define MJIT_FUNC_EXPORTED
const VALUE * argv
#define T_MATCH
#define SYMBOL_P(x)
#define FIXNUM_P(f)
#define CLASS_OF(v)
#define Check_Type(v, t)
VALUE rb_hash_aset(VALUE, VALUE, VALUE)
Definition: hash.c:2852
#define rb_check_arity
long rb_str_sublen(VALUE, long)
Definition: string.c:2463
VALUE rb_str_dup(VALUE)
Definition: string.c:1516
VALUE rb_sprintf(const char *,...) __attribute__((format(printf
st_index_t rb_str_hash(VALUE)
Definition: string.c:3163
unsigned long ID
VALUE rb_yield(VALUE)
Definition: vm_eval.c:1237
size_t st_index_t h
#define ISSPACE(c)
#define RREGEXP_SRC(r)
#define RGENGC_WB_PROTECTED_REGEXP
#define NUM2LONG(x)
void rb_define_method(VALUE, const char *, VALUE(*)(), int)
#define rb_ary_new2
VALUE rb_str_length(VALUE)
Definition: string.c:1843
#define RARRAY_AREF(a, i)
#define SIZEOF_VALUE
void rb_backref_set(VALUE)
Definition: vm.c:1310
VALUE rb_hash_new(void)
Definition: hash.c:1523
#define ST2FIX(h)
void qsort(void *__base, size_t __nmemb, size_t __size, __compar_fn_t _compar)
int rb_uv_to_utf8(char[6], unsigned long)
Definition: pack.c:1651
void rb_ary_store(VALUE, long, VALUE)
Definition: array.c:1079
#define rb_str_new4
VALUE rb_ary_entry(VALUE, long)
Definition: array.c:1512
#define StringValueCStr(v)
#define T_REGEXP
#define LIKELY(x)
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1181
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1348
VALUE rb_reg_nth_defined(int nth, VALUE match)
Definition: re.c:1696
NORETURN(static void rb_reg_raise(const char *s, long len, const char *err, VALUE re))
VALUE rb_reg_match_last(VALUE match)
Definition: re.c:1800
int rb_reg_options(VALUE re)
Definition: re.c:3579
VALUE rb_reg_match(VALUE re, VALUE str)
Definition: re.c:3180
#define REG_ENCODING_NONE
Definition: re.c:276
VALUE rb_reg_eqq(VALUE re, VALUE str)
Definition: re.c:3210
#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end)
Definition: re.c:1921
regex_t * rb_reg_prepare_re(VALUE re, VALUE str)
Definition: re.c:1499
VALUE rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
Definition: re.c:3776
#define ARG_ENCODING_NONE
Definition: re.c:283
VALUE rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
Definition: re.c:2933
#define ARG_ENCODING_FIXED
Definition: re.c:282
VALUE rb_reg_match_post(VALUE match)
Definition: re.c:1783
VALUE rb_reg_last_match(VALUE match)
Definition: re.c:1739
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3340
void Init_Regexp(void)
Definition: re.c:4020
VALUE rb_reg_match_pre(VALUE match)
Definition: re.c:1757
#define BEG(no)
Definition: re.c:25
char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN]
Definition: re.c:22
#define END(no)
Definition: re.c:26
VALUE rb_reg_new(const char *s, long len, int options)
Definition: re.c:2947
bool rb_reg_start_with_p(VALUE re, VALUE str)
Definition: re.c:1626
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Definition: re.c:319
void rb_match_busy(VALUE match)
Definition: re.c:1295
#define REG_LITERAL
Definition: re.c:275
long rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
Definition: re.c:239
regex_t * rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err)
Definition: re.c:1461
#define ARG_REG_OPTION_MASK
Definition: re.c:280
#define MATCH_BUSY
Definition: re.c:1292
#define VALUE_MAX
long rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
Definition: re.c:1620
long rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
Definition: re.c:1506
VALUE rb_reg_regcomp(VALUE str)
Definition: re.c:2970
VALUE rb_reg_nth_match(int nth, VALUE match)
Definition: re.c:1714
#define errcpy(err, msg)
Definition: re.c:23
int rb_match_nth_defined(int nth, VALUE match)
Definition: re.c:1317
long rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
Definition: re.c:1538
MJIT_FUNC_EXPORTED VALUE rb_reg_new_ary(VALUE ary, int opt)
Definition: re.c:2927
int rb_memcicmp(const void *x, const void *y, long len)
Definition: re.c:80
VALUE rb_reg_init_str(VALUE re, VALUE s, int options)
Definition: re.c:2901
VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
Definition: re.c:2953
void rb_match_unbusy(VALUE match)
Definition: re.c:1301
int rb_match_count(VALUE match)
Definition: re.c:1307
VALUE rb_reg_match2(VALUE re)
Definition: re.c:3239
#define KCODE_FIXED
Definition: re.c:278
VALUE rb_reg_quote(VALUE str)
Definition: re.c:3462
VALUE rb_check_regexp_type(VALUE re)
Definition: re.c:3591
VALUE rb_reg_alloc(void)
Definition: re.c:2889
#define ASCGET(s, e, cl)
VALUE rb_reg_check_preprocess(VALUE str)
Definition: re.c:2707
VALUE rb_reg_new_str(VALUE s, int options)
Definition: re.c:2895
int rb_reg_region_copy(struct re_registers *to, const struct re_registers *from)
Definition: re.c:946
typedefRUBY_SYMBOL_EXPORT_BEGIN struct re_pattern_buffer Regexp
Definition: re.h:29
#define RMATCH(obj)
Definition: re.h:50
#define RMATCH_REGS(obj)
Definition: re.h:51
int onig_compile_ruby(regex_t *reg, const UChar *pattern, const UChar *pattern_end, OnigErrorInfo *einfo, const char *sourcefile, int sourceline)
Definition: regcomp.c:5713
#define mbclen(p, e, enc)
Definition: regex.h:33
#define IS_NULL(p)
Definition: regint.h:298
const VALUE v2
const VALUE v1
const VALUE value
Definition: re.h:43
VALUE regexp
Definition: re.h:47
struct rmatch * rmatch
Definition: re.h:46
VALUE str
Definition: re.h:45
Definition: ruby.h:1112
long len
Definition: re.c:2203
const UChar * name
Definition: re.c:2202
Definition: re.c:956
long char_pos
Definition: re.c:958
long byte_pos
Definition: re.c:957
OnigEncoding enc
Definition: onigmo.h:776
OnigOptionType options
Definition: onigmo.h:772
OnigPosition * beg
Definition: onigmo.h:719
int num_regs
Definition: onigmo.h:718
int allocated
Definition: onigmo.h:717
OnigPosition * end
Definition: onigmo.h:720
long beg
Definition: re.h:32
long end
Definition: re.h:33
Definition: re.h:36
int char_offset_num_allocated
Definition: re.h:40
struct rmatch_offset * char_offset
Definition: re.h:39
struct re_registers regs
Definition: re.h:37
unsigned long ruby_scan_oct(const char *, size_t, size_t *)
Definition: util.c:34
unsigned long ruby_scan_hex(const char *, size_t, size_t *)
Definition: util.c:52
#define scan_hex(s, l, e)
Definition: util.h:55
#define scan_oct(s, l, e)
Definition: util.h:53