Ruby 2.7.7p221 (2022-11-24 revision 168ec2b1e5ad0e4688e963d9de019557c78feed9)
string.c
Go to the documentation of this file.
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/encoding.h"
15#include "ruby/re.h"
16#include "internal.h"
17#include "encindex.h"
18#include "probes.h"
19#include "gc.h"
20#include "ruby_assert.h"
21#include "id.h"
22#include "debug_counter.h"
23#include "ruby/util.h"
24
/* Shorthands for the offsets stored at index `no` of regs->beg / regs->end
 * (presumably regexp match registers -- confirm at the use sites).  Both
 * expand to an access through a variable named `regs`, which therefore has
 * to be in scope wherever the macros are used. */
25#define BEG(no) (regs->beg[(no)])
26#define END(no) (regs->end[(no)])
27
28#include <errno.h>
29#include <math.h>
30#include <ctype.h>
31
32#ifdef HAVE_UNISTD_H
33#include <unistd.h>
34#endif
35
36#if defined HAVE_CRYPT_R
37# if defined HAVE_CRYPT_H
38# include <crypt.h>
39# endif
40#elif !defined HAVE_CRYPT
41# include "missing/crypt.h"
42# define HAVE_CRYPT_R 1
43#endif
44
45#undef rb_str_new
46#undef rb_usascii_str_new
47#undef rb_utf8_str_new
48#undef rb_enc_str_new
49#undef rb_str_new_cstr
50#undef rb_tainted_str_new_cstr
51#undef rb_usascii_str_new_cstr
52#undef rb_utf8_str_new_cstr
53#undef rb_enc_str_new_cstr
54#undef rb_external_str_new_cstr
55#undef rb_locale_str_new_cstr
56#undef rb_str_dup_frozen
57#undef rb_str_buf_new_cstr
58#undef rb_str_buf_cat
59#undef rb_str_buf_cat2
60#undef rb_str_cat2
61#undef rb_str_cat_cstr
62#undef rb_fstring_cstr
63
64static VALUE rb_str_clear(VALUE str);
65
68
69/* FLAGS of RString
70 *
71 * 1: RSTRING_NOEMBED
72 * 2: STR_SHARED (== ELTS_SHARED)
73 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
74 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
75 * other strings that rely on this string's buffer)
76 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
77 * early, specific to rb_str_tmp_frozen_{acquire,release})
78 * 7: STR_TMPLOCK
79 * 8-9: ENC_CODERANGE (2 bits)
80 * 10-16: ENCODING (7 bits == 128)
81 * 17: RSTRING_FSTR
82 * 18: STR_NOFREE
83 * 19: STR_FAKESTR
84 */
85
/* File-local flag aliases.  The FL_USERn numbers match the bit positions
 * listed in the "FLAGS of RString" table directly above.
 * NOTE(review): RUBY_MAX_CHAR_LEN is presumably an upper bound, in bytes,
 * for one encoded character -- confirm against its users. */
86#define RUBY_MAX_CHAR_LEN 16
87#define STR_SHARED_ROOT FL_USER5
88#define STR_BORROWED FL_USER6
89#define STR_TMPLOCK FL_USER7
90#define STR_NOFREE FL_USER18
91#define STR_FAKESTR FL_USER19
92
/* Mark str as heap-backed: set STR_NOEMBED, then zero the embedded-length
 * bits of the flags word.  According to the FLAGS table earlier in this file
 * those bits (2-6) share positions with STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED, so those flags end up cleared as well. */
93#define STR_SET_NOEMBED(str) do {\
94 FL_SET((str), STR_NOEMBED);\
95 STR_SET_EMBED_LEN((str), 0);\
96} while (0)
/* Mark str as embedded by clearing both STR_NOEMBED and STR_NOFREE.  The
 * embedded length is not set here; see STR_SET_EMBED_LEN. */
97#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Store n in the embedded-length bit field of str's flags word: the old
 * field is masked out and the new value shifted into place.  n is evaluated
 * once (into tmp_n); str is expanded twice. */
98#define STR_SET_EMBED_LEN(str, n) do { \
99 long tmp_n = (n);\
100 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
101 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
102} while (0)
103
/* Set the stored length of str to n: written to the flags bit field when the
 * string is embedded, to as.heap.len otherwise.  Only the length is updated;
 * the buffer contents and terminator are left as they are. */
104#define STR_SET_LEN(str, n) do { \
105 if (STR_EMBED_P(str)) {\
106 STR_SET_EMBED_LEN((str), (n));\
107 }\
108 else {\
109 RSTRING(str)->as.heap.len = (n);\
110 }\
111} while (0)
112
/* Decrement the stored length of str by one, using whichever representation
 * (embedded bit field or as.heap.len) is currently in effect.  No check is
 * made that the length is non-zero beforehand. */
113#define STR_DEC_LEN(str) do {\
114 if (STR_EMBED_P(str)) {\
115 long n = RSTRING_LEN(str);\
116 n--;\
117 STR_SET_EMBED_LEN((str), n);\
118 }\
119 else {\
120 RSTRING(str)->as.heap.len--;\
121 }\
122} while (0)
123
/* Terminator width for str: the minimum character length of the encoding
 * returned by rb_enc_get(str).  Used throughout this file as the number of
 * zero bytes reserved after the string contents. */
124#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Write a terminator of termlen zero bytes at ptr.  The first byte is always
 * stored directly; memset is only called in the uncommon case termlen > 1
 * (and then rewrites that first byte too).  Both arguments are evaluated
 * exactly once. */
125#define TERM_FILL(ptr, termlen) do {\
126 char *const term_fill_ptr = (ptr);\
127 const int term_fill_len = (termlen);\
128 *term_fill_ptr = '\0';\
129 if (UNLIKELY(term_fill_len > 1))\
130 memset(term_fill_ptr, 0, term_fill_len);\
131} while (0)
132
/* Resize the buffer of str to `capacity`, using the terminator width of the
 * string's current encoding (TERM_LEN).  See RESIZE_CAPA_TERM for details. */
133#define RESIZE_CAPA(str,capacity) do {\
134 const int termlen = TERM_LEN(str);\
135 RESIZE_CAPA_TERM(str,capacity,termlen);\
136} while (0)
/* Change the buffer capacity of str to `capacity` bytes plus `termlen`
 * terminator bytes.
 *  - Embedded string whose new capacity still fits embedded: nothing happens.
 *  - Embedded string that no longer fits: a heap buffer is allocated, the
 *    current contents are copied into it and str is switched to the heap
 *    representation.  The copy is done before any as.heap.* field is
 *    written (presumably because the heap fields overlay the embedded
 *    bytes -- confirm against struct RString).
 *  - Heap string: the buffer is reallocated; the string must not be
 *    STR_SHARED (asserted).
 * The stored length is preserved in every case and no terminator is written
 * here.  `str` and `capacity` are expanded more than once. */
137#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
138 if (STR_EMBED_P(str)) {\
139 if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
140 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
141 const long tlen = RSTRING_LEN(str);\
142 memcpy(tmp, RSTRING_PTR(str), tlen);\
143 RSTRING(str)->as.heap.ptr = tmp;\
144 RSTRING(str)->as.heap.len = tlen;\
145 STR_SET_NOEMBED(str);\
146 RSTRING(str)->as.heap.aux.capa = (capacity);\
147 }\
148 }\
149 else {\
150 assert(!FL_TEST((str), STR_SHARED)); \
151 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
152 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
153 RSTRING(str)->as.heap.aux.capa = (capacity);\
154 }\
155} while (0)
156
/* Record that str uses the buffer of shared_str: shared_str is stored in
 * as.heap.aux.shared (via RB_OBJ_WRITE), str is flagged STR_SHARED and
 * shared_str is flagged STR_SHARED_ROOT.  A root without a class
 * (RBASIC_CLASS == 0) is additionally flagged STR_BORROWED.  The whole macro
 * is a no-op when str is a STR_FAKESTR. */
157#define STR_SET_SHARED(str, shared_str) do { \
158 if (!FL_TEST(str, STR_FAKESTR)) { \
159 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
160 FL_SET((str), STR_SHARED); \
161 FL_SET((shared_str), STR_SHARED_ROOT); \
162 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
163 FL_SET_RAW((shared_str), STR_BORROWED); \
164 } \
165} while (0)
166
/* Direct accessors for the heap representation.  Neither macro checks that
 * str is actually heap-backed.  STR_HEAP_SIZE is the recorded capacity plus
 * the terminator width of the string's encoding. */
167#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
168#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
169
/* Encoding of str as resolved by get_encoding() in this file. */
170#define STR_ENC_GET(str) get_encoding(str)
171
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING off
 * (the default chosen here when it is not predefined) only a substring that
 * ends exactly at `end` qualifies, i.e. beg + len == end.  With it on, every
 * substring qualifies. */
172#if !defined SHARABLE_MIDDLE_SUBSTRING
173# define SHARABLE_MIDDLE_SUBSTRING 0
174#endif
175#if !SHARABLE_MIDDLE_SUBSTRING
176#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
177#else
178#define SHARABLE_SUBSTRING_P(beg, len, end) 1
179#endif
180
/* True when `len` content bytes plus a `termlen`-byte terminator fit into
 * RSTRING_EMBED_LEN_MAX + 1 bytes, i.e. len + termlen <= MAX + 1. */
181#define STR_EMBEDDABLE_P(len, termlen) \
182 ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
183
184static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
185static VALUE str_new_shared(VALUE klass, VALUE str);
186static VALUE str_new_frozen(VALUE klass, VALUE orig);
187static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
188static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
189static inline void str_modifiable(VALUE str);
190static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
191
/* Convenience wrapper around str_make_independent_expand(): passes the
 * string's current length, an expansion of 0 and the terminator width of
 * its encoding. */
192static inline void
193str_make_independent(VALUE str)
194{
195 long len = RSTRING_LEN(str);
196 int termlen = TERM_LEN(str);
197 str_make_independent_expand((str), len, 0L, termlen);
198}
199
200/* symbols for [up|down|swap]case/capitalize options */
201static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
202
203static rb_encoding *
204get_actual_encoding(const int encidx, VALUE str)
205{
206 const unsigned char *q;
207
208 switch (encidx) {
209 case ENCINDEX_UTF_16:
210 if (RSTRING_LEN(str) < 2) break;
211 q = (const unsigned char *)RSTRING_PTR(str);
212 if (q[0] == 0xFE && q[1] == 0xFF) {
214 }
215 if (q[0] == 0xFF && q[1] == 0xFE) {
217 }
218 return rb_ascii8bit_encoding();
219 case ENCINDEX_UTF_32:
220 if (RSTRING_LEN(str) < 4) break;
221 q = (const unsigned char *)RSTRING_PTR(str);
222 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
224 }
225 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
227 }
228 return rb_ascii8bit_encoding();
229 }
230 return rb_enc_from_index(encidx);
231}
232
/* Return the encoding of str: reads the string's encoding index with
 * ENCODING_GET and lets get_actual_encoding() resolve it (that function also
 * receives the string itself so it can inspect its bytes). */
233static rb_encoding *
234get_encoding(VALUE str)
235{
236 return get_actual_encoding(ENCODING_GET(str), str);
237}
238
/* Raise ArgumentError ("invalid byte sequence in <encoding name>") when
 * is_broken_string(str) reports the string as broken; otherwise return
 * without doing anything. */
239static void
240mustnot_broken(VALUE str)
241{
242 if (is_broken_string(str)) {
243 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
244 }
245}
246
247static void
248mustnot_wchar(VALUE str)
249{
251 if (rb_enc_mbminlen(enc) > 1) {
252 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
253 }
254}
255
256static int fstring_cmp(VALUE a, VALUE b);
257
258static VALUE register_fstring(VALUE str);
259
261 fstring_cmp,
263};
264
/* True when str does not have FL_EXIVAR set and its class is exactly
 * rb_cString (not a subclass). */
265#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
266
267static int
268fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
269{
270 VALUE *fstr = (VALUE *)arg;
271 VALUE str = (VALUE)*key;
272
273 if (existing) {
274 /* because of lazy sweep, str may be unmarked already and swept
275 * at next time */
276
278 *fstr = Qundef;
279 return ST_DELETE;
280 }
281
282 *fstr = str;
283 return ST_STOP;
284 }
285 else {
287 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
288 RSTRING(str)->as.heap.len,
291 }
292 else {
293 str = str_new_frozen(rb_cString, str);
294 if (STR_SHARED_P(str)) { /* str should not be shared */
295 /* shared substring */
296 str_make_independent(str);
298 }
299 if (!BARE_STRING_P(str)) {
300 str = str_new_frozen(rb_cString, str);
301 }
302 }
303 RBASIC(str)->flags |= RSTRING_FSTR;
304
305 *key = *value = *fstr = str;
306 return ST_CONTINUE;
307 }
308}
309
311VALUE
313{
314 VALUE fstr;
315 int bare;
316
318
320 return str;
321
322 bare = BARE_STRING_P(str);
323 if (!bare) {
324 if (STR_EMBED_P(str)) {
326 return str;
327 }
330 return str;
331 }
332 }
333
334 if (!OBJ_FROZEN(str))
336
337 fstr = register_fstring(str);
338
339 if (!bare) {
340 str_replace_shared_without_enc(str, fstr);
342 return str;
343 }
344 return fstr;
345}
346
347static VALUE
348register_fstring(VALUE str)
349{
350 VALUE ret;
351 st_table *frozen_strings = rb_vm_fstring_table();
352
353 do {
354 ret = str;
355 st_update(frozen_strings, (st_data_t)str,
356 fstr_update_callback, (st_data_t)&ret);
357 } while (ret == Qundef);
358
359 assert(OBJ_FROZEN(ret));
363 return ret;
364}
365
366static VALUE
367setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
368{
370 /* SHARED to be allocated by the callback */
371
372 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
373
375 fake_str->as.heap.len = len;
376 fake_str->as.heap.ptr = (char *)name;
377 fake_str->as.heap.aux.capa = len;
378 return (VALUE)fake_str;
379}
380
381/*
382 * Set up a fake string which refers to a static string literal.
 *
 * Delegates to setup_fake_str(), passing the encoding as the index returned
 * by rb_enc_to_index(enc), and returns that function's result (fake_str
 * cast to VALUE).  The storage for fake_str is provided by the caller, and
 * `name` is stored by pointer, not copied.
383 */
384VALUE
385rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
386{
387 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
388}
389
390/*
391 * The rb_fstring_new and rb_fstring_cstr family create or look up a frozen
392 * shared string which refers to a static string literal. `ptr` must
393 * point to a constant string.
394 */
396rb_fstring_new(const char *ptr, long len)
397{
398 struct RString fake_str;
399 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
400}
401
/* Wrap (ptr, len, enc) in a fake RString that lives on this function's
 * stack, hand it to register_fstring() and return whatever that returns. */
402VALUE
403rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
404{
405 struct RString fake_str;
406 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
407}
408
409VALUE
411{
412 return rb_fstring_new(ptr, strlen(ptr));
413}
414
415static int
416fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
417{
419 return ST_CONTINUE;
420}
421
/* Equality-style comparison of two strings: returns 0 when a and b have the
 * same length, the same encoding index and identical bytes, and a non-zero
 * value otherwise.  No ordering is implied by the result. */
422static int
423fstring_cmp(VALUE a, VALUE b)
424{
425 long alen, blen;
426 const char *aptr, *bptr;
427 RSTRING_GETMEM(a, aptr, alen);
428 RSTRING_GETMEM(b, bptr, blen);
 /* memcmp is only reached when the lengths (and encodings) already match,
  * so comparing alen bytes never reads past either buffer's length */
429 return (alen != blen ||
430 ENCODING_GET(a) != ENCODING_GET(b) ||
431 memcmp(aptr, bptr, alen) != 0);
432}
433
434static inline int
435single_byte_optimizable(VALUE str)
436{
437 rb_encoding *enc;
438
439 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
441 return 1;
442
443 enc = STR_ENC_GET(str);
444 if (rb_enc_mbmaxlen(enc) == 1)
445 return 1;
446
447 /* Conservative. Possibly single byte.
448 * "\xa1" in Shift_JIS for example. */
449 return 0;
450}
451
453
/* Return a pointer to the first byte in [p, e) that has its high bit (0x80)
 * set, or NULL when every byte in the range is 7-bit.
 *
 * The bulk of the range is examined one machine word at a time by masking
 * with NONASCII_MASK (0x80 repeated in every byte); the bytes before the
 * first word boundary and after the last whole word are checked one at a
 * time with fall-through switches. */
454static inline const char *
455search_nonascii(const char *p, const char *e)
456{
457 const uintptr_t *s, *t;
458
 /* NONASCII_MASK: 0x80 in each byte of a uintptr_t.  It is not #undef'd
  * after the function. */
459#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
460# if SIZEOF_UINTPTR_T == 8
461# define NONASCII_MASK UINT64_C(0x8080808080808080)
462# elif SIZEOF_UINTPTR_T == 4
463# define NONASCII_MASK UINT32_C(0x80808080)
464# else
465# error "don't know what to do."
466# endif
467#else
468# if SIZEOF_UINTPTR_T == 8
469# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
470# elif SIZEOF_UINTPTR_T == 4
471# define NONASCII_MASK 0x80808080UL /* or...? */
472# else
473# error "don't know what to do."
474# endif
475#endif
476
 /* Word-wise phase.  Without unaligned access it is only entered when at
  * least one word's worth of bytes is available. */
477 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
478#if !UNALIGNED_WORD_ACCESS
 /* Advance p to the next word boundary, checking the l skipped bytes in
  * address order; the cases fall through on purpose. */
479 if ((uintptr_t)p % SIZEOF_VOIDP) {
480 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
481 p += l;
482 switch (l) {
483 default: UNREACHABLE;
484#if SIZEOF_VOIDP > 4
485 case 7: if (p[-7]&0x80) return p-7;
486 case 6: if (p[-6]&0x80) return p-6;
487 case 5: if (p[-5]&0x80) return p-5;
488 case 4: if (p[-4]&0x80) return p-4;
489#endif
490 case 3: if (p[-3]&0x80) return p-3;
491 case 2: if (p[-2]&0x80) return p-2;
492 case 1: if (p[-1]&0x80) return p-1;
493 case 0: break;
494 }
495 }
496#endif
497#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
498#define aligned_ptr(value) \
499 __builtin_assume_aligned((value), sizeof(uintptr_t))
500#else
501#define aligned_ptr(value) (uintptr_t *)(value)
502#endif
503 s = aligned_ptr(p);
 /* s < t guarantees that the whole word at s lies inside [p, e) */
504 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
505#undef aligned_ptr
506 for (;s < t; s++) {
507 if (*s & NONASCII_MASK) {
 /* Locate the lowest-addressed flagged byte inside the word:
  * nlz_intptr/ntz_intptr presumably count leading/trailing zero
  * bits; >>3 turns the bit count into a byte offset. */
508#ifdef WORDS_BIGENDIAN
509 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
510#else
511 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
512#endif
513 }
514 }
515 p = (const char *)s;
516 }
517
 /* Fewer than SIZEOF_VOIDP bytes remain; check them in address order,
  * again relying on fall-through. */
518 switch (e - p) {
519 default: UNREACHABLE;
520#if SIZEOF_VOIDP > 4
521 case 7: if (e[-7]&0x80) return e-7;
522 case 6: if (e[-6]&0x80) return e-6;
523 case 5: if (e[-5]&0x80) return e-5;
524 case 4: if (e[-4]&0x80) return e-4;
525#endif
526 case 3: if (e[-3]&0x80) return e-3;
527 case 2: if (e[-2]&0x80) return e-2;
528 case 1: if (e[-1]&0x80) return e-1;
529 case 0: return NULL;
530 }
531}
532
533static int
534coderange_scan(const char *p, long len, rb_encoding *enc)
535{
536 const char *e = p + len;
537
539 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
540 p = search_nonascii(p, e);
542 }
543
544 if (rb_enc_asciicompat(enc)) {
545 p = search_nonascii(p, e);
546 if (!p) return ENC_CODERANGE_7BIT;
547 for (;;) {
548 int ret = rb_enc_precise_mbclen(p, e, enc);
550 p += MBCLEN_CHARFOUND_LEN(ret);
551 if (p == e) break;
552 p = search_nonascii(p, e);
553 if (!p) break;
554 }
555 }
556 else {
557 while (p < e) {
558 int ret = rb_enc_precise_mbclen(p, e, enc);
560 p += MBCLEN_CHARFOUND_LEN(ret);
561 }
562 }
563 return ENC_CODERANGE_VALID;
564}
565
566long
567rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
568{
569 const char *p = s;
570
571 if (*cr == ENC_CODERANGE_BROKEN)
572 return e - s;
573
575 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
576 if (*cr == ENC_CODERANGE_VALID) return e - s;
577 p = search_nonascii(p, e);
579 return e - s;
580 }
581 else if (rb_enc_asciicompat(enc)) {
582 p = search_nonascii(p, e);
583 if (!p) {
584 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
585 return e - s;
586 }
587 for (;;) {
588 int ret = rb_enc_precise_mbclen(p, e, enc);
589 if (!MBCLEN_CHARFOUND_P(ret)) {
591 return p - s;
592 }
593 p += MBCLEN_CHARFOUND_LEN(ret);
594 if (p == e) break;
595 p = search_nonascii(p, e);
596 if (!p) break;
597 }
598 }
599 else {
600 while (p < e) {
601 int ret = rb_enc_precise_mbclen(p, e, enc);
602 if (!MBCLEN_CHARFOUND_P(ret)) {
604 return p - s;
605 }
606 p += MBCLEN_CHARFOUND_LEN(ret);
607 }
608 }
610 return e - s;
611}
612
/* Give str1 the same encoding index as str2 by calling rb_enc_set_index()
 * with the value read from str2. */
613static inline void
614str_enc_copy(VALUE str1, VALUE str2)
615{
616 rb_enc_set_index(str1, ENCODING_GET(str2));
617}
618
619static void
620rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
621{
622 /* this function is designed for copying encoding and coderange
623 * from src to new string "dest" which is made from the part of src.
624 */
625 str_enc_copy(dest, src);
626 if (RSTRING_LEN(dest) == 0) {
629 else
631 return;
632 }
633 switch (ENC_CODERANGE(src)) {
636 break;
639 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
641 else
643 break;
644 default:
645 break;
646 }
647}
648
649static void
650rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
651{
652 str_enc_copy(dest, src);
654}
655
656int
658{
659 int cr = ENC_CODERANGE(str);
660
661 if (cr == ENC_CODERANGE_UNKNOWN) {
662 int encidx = ENCODING_GET(str);
663 rb_encoding *enc = rb_enc_from_index(encidx);
664 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
665 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
667 }
668 else {
669 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
670 enc);
671 }
673 }
674 return cr;
675}
676
677int
679{
681
682 if (!rb_enc_asciicompat(enc))
683 return FALSE;
685 return TRUE;
686 return FALSE;
687}
688
/* Raise RuntimeError "string modified" unless the current buffer pointer
 * and length of s are still equal to p and len. */
689static inline void
690str_mod_check(VALUE s, const char *p, long len)
691{
692 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
693 rb_raise(rb_eRuntimeError, "string modified");
694 }
695}
696
/* Return the number of content bytes str can hold, for a terminator of
 * termlen bytes:
 *  - embedded:          RSTRING_EMBED_LEN_MAX + 1 - termlen
 *  - shared or no-free: the current length (as.heap.len)
 *  - any other heap:    the recorded capacity (as.heap.aux.capa) */
697static size_t
698str_capacity(VALUE str, const int termlen)
699{
700 if (STR_EMBED_P(str)) {
701 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
702 }
703 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
 /* NOTE(review): aux.capa is not consulted here -- for shared strings
  * STR_SET_SHARED stores aux.shared, which presumably overlays capa;
  * confirm against struct RString */
704 return RSTRING(str)->as.heap.len;
705 }
706 else {
707 return RSTRING(str)->as.heap.aux.capa;
708 }
709}
710
711size_t
713{
714 return str_capacity(str, TERM_LEN(str));
715}
716
/* Raise ArgumentError "NULL pointer given" when ptr is NULL; otherwise
 * return without doing anything. */
717static inline void
718must_not_null(const char *ptr)
719{
720 if (!ptr) {
721 rb_raise(rb_eArgError, "NULL pointer given");
722 }
723}
724
725static inline VALUE
726str_alloc(VALUE klass)
727{
729 return (VALUE)str;
730}
731
/* Allocate a string object of class klass for an empty string: fires the
 * DTrace string-creation hook with length 0, then returns str_alloc(klass). */
732static inline VALUE
733empty_str_alloc(VALUE klass)
734{
735 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
736 return str_alloc(klass);
737}
738
739static VALUE
740str_new0(VALUE klass, const char *ptr, long len, int termlen)
741{
742 VALUE str;
743
744 if (len < 0) {
745 rb_raise(rb_eArgError, "negative string size (or size too big)");
746 }
747
749
750 str = str_alloc(klass);
751 if (!STR_EMBEDDABLE_P(len, termlen)) {
752 RSTRING(str)->as.heap.aux.capa = len;
753 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
755 }
756 else if (len == 0) {
758 }
759 if (ptr) {
761 }
763 TERM_FILL(RSTRING_PTR(str) + len, termlen);
764 return str;
765}
766
/* Create a string of class klass from (ptr, len) with a one-byte
 * terminator; simply str_new0() with termlen fixed to 1. */
767static VALUE
768str_new(VALUE klass, const char *ptr, long len)
769{
770 return str_new0(klass, ptr, len, 1);
771}
772
/* Public constructor: create an instance of rb_cString from (ptr, len) via
 * str_new().  No encoding is set explicitly here. */
773VALUE
774rb_str_new(const char *ptr, long len)
775{
776 return str_new(rb_cString, ptr, len);
777}
778
779VALUE
780rb_usascii_str_new(const char *ptr, long len)
781{
784 return str;
785}
786
787VALUE
788rb_utf8_str_new(const char *ptr, long len)
789{
790 VALUE str = str_new(rb_cString, ptr, len);
792 return str;
793}
794
/* Create an rb_cString from (ptr, len) in encoding enc.
 *
 * When enc is NULL this is the same as rb_str_new(ptr, len).  Otherwise the
 * string is created with a terminator as wide as the encoding's minimum
 * character length and is then associated with enc. */
795VALUE
796rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
797{
798 VALUE str;
799
800 if (!enc) return rb_str_new(ptr, len);
801
802 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
803 rb_enc_associate(str, enc);
804 return str;
805}
806
807VALUE
809{
810 must_not_null(ptr);
811 /* rb_str_new_cstr() can take pointer from non-malloc-generated
812 * memory regions, and that cannot be detected by the MSAN. Just
813 * trust the programmer that the argument passed here is a sane C
814 * string. */
816 return rb_str_new(ptr, strlen(ptr));
817}
818
819VALUE
821{
824 return str;
825}
826
827VALUE
829{
832 return str;
833}
834
835VALUE
837{
838 must_not_null(ptr);
839 if (rb_enc_mbminlen(enc) != 1) {
840 rb_raise(rb_eArgError, "wchar encoding given");
841 }
842 return rb_enc_str_new(ptr, strlen(ptr), enc);
843}
844
845static VALUE
846str_new_static(VALUE klass, const char *ptr, long len, int encindex)
847{
848 VALUE str;
849
850 if (len < 0) {
851 rb_raise(rb_eArgError, "negative string size (or size too big)");
852 }
853
854 if (!ptr) {
855 rb_encoding *enc = rb_enc_get_from_index(encindex);
856 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
857 }
858 else {
860 str = str_alloc(klass);
861 RSTRING(str)->as.heap.len = len;
862 RSTRING(str)->as.heap.ptr = (char *)ptr;
863 RSTRING(str)->as.heap.aux.capa = len;
865 RBASIC(str)->flags |= STR_NOFREE;
866 }
867 rb_enc_associate_index(str, encindex);
868 return str;
869}
870
/* Create an rb_cString through str_new_static() with encoding index 0.
 * NOTE(review): index 0 is presumably ASCII-8BIT -- confirm against
 * encindex.h. */
871VALUE
872rb_str_new_static(const char *ptr, long len)
873{
874 return str_new_static(rb_cString, ptr, len, 0);
875}
876
877VALUE
879{
880 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
881}
882
883VALUE
885{
886 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
887}
888
/* Create an rb_cString through str_new_static(), passing the encoding as
 * the index obtained from rb_enc_to_index(enc). */
889VALUE
890rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
891{
892 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
893}
894
/* Deprecated entry point: emits a deprecation message through rb_warning()
 * and then returns rb_str_new(ptr, len). */
895VALUE
896rb_tainted_str_new(const char *ptr, long len)
897{
898 rb_warning("rb_tainted_str_new is deprecated and will be removed in Ruby 3.2.");
899 return rb_str_new(ptr, len);
900}
901
902VALUE
904{
905 rb_warning("rb_tainted_str_new_cstr is deprecated and will be removed in Ruby 3.2.");
906 return rb_str_new_cstr(ptr);
907}
908
909static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
910 rb_encoding *from, rb_encoding *to,
911 int ecflags, VALUE ecopts);
912
913VALUE
915{
916 long len;
917 const char *ptr;
918 VALUE newstr;
919
920 if (!to) return str;
921 if (!from) from = rb_enc_get(str);
922 if (from == to) return str;
923 if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
924 to == rb_ascii8bit_encoding()) {
925 if (STR_ENC_GET(str) != to) {
926 str = rb_str_dup(str);
928 }
929 return str;
930 }
931
933 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
934 from, to, ecflags, ecopts);
935 if (NIL_P(newstr)) {
936 /* some error, return original */
937 return str;
938 }
939 return newstr;
940}
941
/* Write the bytes (ptr, len) into newstr starting at offset ofs, converting
 * them from encoding `from` to the encoding of newstr.
 *
 * ofs must lie in [-length, length] of newstr, otherwise IndexError
 * ("index %ld out of string") is raised; a negative ofs counts from the end.
 * When `from` is NULL no conversion takes place: newstr is cut to ofs and
 * the raw bytes are appended with rb_str_cat().  Otherwise newstr is made
 * modifiable and the work is delegated to str_cat_conv_enc_opts() with
 * ecflags/ecopts passed through.
 *
 * Returns the result of rb_str_cat() or str_cat_conv_enc_opts(); the latter
 * returns Qnil on failure, so callers have to check for nil. */
942VALUE
943rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
944 rb_encoding *from, int ecflags, VALUE ecopts)
945{
946 long olen;
947
948 olen = RSTRING_LEN(newstr);
949 if (ofs < -olen || olen < ofs)
950 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
951 if (ofs < 0) ofs += olen;
952 if (!from) {
953 STR_SET_LEN(newstr, ofs);
954 return rb_str_cat(newstr, ptr, len);
955 }
956
957 rb_str_modify(newstr);
958 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
959 rb_enc_get(newstr),
960 ecflags, ecopts);
961}
962
963VALUE
964rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
965{
966 STR_SET_LEN(str, 0);
967 rb_enc_associate(str, enc);
969 return str;
970}
971
972static VALUE
973str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
974 rb_encoding *from, rb_encoding *to,
975 int ecflags, VALUE ecopts)
976{
977 rb_econv_t *ec;
979 long olen;
980 VALUE econv_wrapper;
981 const unsigned char *start, *sp;
982 unsigned char *dest, *dp;
983 size_t converted_output = (size_t)ofs;
984
985 olen = rb_str_capacity(newstr);
986
987 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
988 RBASIC_CLEAR_CLASS(econv_wrapper);
989 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
990 if (!ec) return Qnil;
991 DATA_PTR(econv_wrapper) = ec;
992
993 sp = (unsigned char*)ptr;
994 start = sp;
995 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
996 (dp = dest + converted_output),
997 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
999 /* destination buffer short */
1000 size_t converted_input = sp - start;
1001 size_t rest = len - converted_input;
1002 converted_output = dp - dest;
1003 rb_str_set_len(newstr, converted_output);
1004 if (converted_input && converted_output &&
1005 rest < (LONG_MAX / converted_output)) {
1006 rest = (rest * converted_output) / converted_input;
1007 }
1008 else {
1009 rest = olen;
1010 }
1011 olen += rest < 2 ? 2 : rest;
1012 rb_str_resize(newstr, olen);
1013 }
1014 DATA_PTR(econv_wrapper) = 0;
1015 rb_econv_close(ec);
1016 rb_gc_force_recycle(econv_wrapper);
1017 switch (ret) {
1018 case econv_finished:
1019 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1020 rb_str_set_len(newstr, len);
1021 rb_enc_associate(newstr, to);
1022 return newstr;
1023
1024 default:
1025 return Qnil;
1026 }
1027}
1028
1029VALUE
1031{
1032 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1033}
1034
1035VALUE
1037{
1038 rb_encoding *ienc;
1039 VALUE str;
1040 const int eidx = rb_enc_to_index(eenc);
1041
1042 if (!ptr) {
1043 return rb_enc_str_new(ptr, len, eenc);
1044 }
1045
1046 /* ASCII-8BIT case, no conversion */
1047 if ((eidx == rb_ascii8bit_encindex()) ||
1048 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1049 return rb_str_new(ptr, len);
1050 }
1051 /* no default_internal or same encoding, no conversion */
1053 if (!ienc || eenc == ienc) {
1054 return rb_enc_str_new(ptr, len, eenc);
1055 }
1056 /* ASCII compatible, and ASCII only string, no conversion in
1057 * default_internal */
1058 if ((eidx == rb_ascii8bit_encindex()) ||
1059 (eidx == rb_usascii_encindex()) ||
1060 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1061 return rb_enc_str_new(ptr, len, ienc);
1062 }
1063 /* convert from the given encoding to default_internal */
1064 str = rb_enc_str_new(NULL, 0, ienc);
1065 /* when the conversion failed for some reason, just ignore the
1066 * default_internal and result in the given encoding as-is. */
1067 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1068 rb_str_initialize(str, ptr, len, eenc);
1069 }
1070 return str;
1071}
1072
1073VALUE
1075{
1076 int eidx = rb_enc_to_index(eenc);
1077 if (eidx == rb_usascii_encindex() &&
1080 return str;
1081 }
1084}
1085
1086VALUE
1087rb_external_str_new(const char *ptr, long len)
1088{
1090}
1091
1092VALUE
1094{
1096}
1097
1098VALUE
1099rb_locale_str_new(const char *ptr, long len)
1100{
1102}
1103
1104VALUE
1106{
1108}
1109
1110VALUE
1112{
1114}
1115
1116VALUE
1118{
1120}
1121
1122VALUE
1124{
1126}
1127
1128VALUE
1130{
1132}
1133
1134VALUE
1136{
1137 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1138}
1139
1140static VALUE
1141str_replace_shared_without_enc(VALUE str2, VALUE str)
1142{
1143 const int termlen = TERM_LEN(str);
1144 char *ptr;
1145 long len;
1146
1148 if (STR_EMBEDDABLE_P(len, termlen)) {
1149 char *ptr2 = RSTRING(str2)->as.ary;
1150 STR_SET_EMBED(str2);
1151 memcpy(ptr2, RSTRING_PTR(str), len);
1152 STR_SET_EMBED_LEN(str2, len);
1153 TERM_FILL(ptr2+len, termlen);
1154 }
1155 else {
1156 VALUE root;
1157 if (STR_SHARED_P(str)) {
1158 root = RSTRING(str)->as.heap.aux.shared;
1160 }
1161 else {
1162 root = rb_str_new_frozen(str);
1163 RSTRING_GETMEM(root, ptr, len);
1164 }
1165 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1166 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1167 rb_fatal("about to free a possible shared root");
1168 }
1169 char *ptr2 = STR_HEAP_PTR(str2);
1170 if (ptr2 != ptr) {
1171 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1172 }
1173 }
1174 FL_SET(str2, STR_NOEMBED);
1175 RSTRING(str2)->as.heap.len = len;
1176 RSTRING(str2)->as.heap.ptr = ptr;
1177 STR_SET_SHARED(str2, root);
1178 }
1179 return str2;
1180}
1181
/* Make str2 take over the contents of str via
 * str_replace_shared_without_enc(), then copy the encoding information from
 * str to str2 with rb_enc_cr_str_exact_copy().  Returns str2. */
1182static VALUE
1183str_replace_shared(VALUE str2, VALUE str)
1184{
1185 str_replace_shared_without_enc(str2, str);
1186 rb_enc_cr_str_exact_copy(str2, str);
1187 return str2;
1188}
1189
/* Allocate a fresh string object of class klass and fill it from str with
 * str_replace_shared(); returns the new object. */
1190static VALUE
1191str_new_shared(VALUE klass, VALUE str)
1192{
1193 return str_replace_shared(str_alloc(klass), str);
1194}
1195
1196VALUE
1198{
1199 return str_new_shared(rb_obj_class(str), str);
1200}
1201
1202VALUE
1204{
1205 if (OBJ_FROZEN(orig)) return orig;
1206 return str_new_frozen(rb_obj_class(orig), orig);
1207}
1208
1209VALUE
1211{
1212 if (OBJ_FROZEN_RAW(orig)) return orig;
1213 return str_new_frozen(0, orig);
1214}
1215
1216void
1218{
1219 if (RBASIC_CLASS(tmp) != 0)
1220 return;
1221
1222 if (STR_EMBED_P(tmp)) {
1223 assert(OBJ_FROZEN_RAW(tmp));
1225 }
1226 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1228 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1229
1230 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1231 FL_UNSET_RAW(orig, STR_SHARED);
1232 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1233 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1234 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1235 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1236 assert(OBJ_FROZEN_RAW(tmp));
1238 }
1239 }
1240}
1241
1242static VALUE
1243str_new_frozen(VALUE klass, VALUE orig)
1244{
1245 VALUE str;
1246
1247 if (STR_EMBED_P(orig)) {
1248 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1249 }
1250 else {
1251 if (FL_TEST_RAW(orig, STR_SHARED)) {
1252 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1253 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1254 long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1257
1258 if ((ofs > 0) || (rest > 0) ||
1259 (klass != RBASIC(shared)->klass) ||
1260 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1261 str = str_new_shared(klass, shared);
1262 RSTRING(str)->as.heap.ptr += ofs;
1263 RSTRING(str)->as.heap.len -= ofs + rest;
1264 }
1265 else {
1266 if (RBASIC_CLASS(shared) == 0)
1268 return shared;
1269 }
1270 }
1271 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1272 str = str_alloc(klass);
1277 }
1278 else {
1279 str = str_alloc(klass);
1281 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1282 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1283 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1284 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1285 RBASIC(orig)->flags &= ~STR_NOFREE;
1286 STR_SET_SHARED(orig, str);
1287 if (klass == 0)
1289 }
1290 }
1291
1292 rb_enc_cr_str_exact_copy(str, orig);
1293 OBJ_FREEZE(str);
1294 return str;
1295}
1296
1297VALUE
1299{
1300 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1301}
1302
1303static VALUE
1304str_new_empty(VALUE str)
1305{
1307 rb_enc_copy(v, str);
1308 return v;
1309}
1310
1311#define STR_BUF_MIN_SIZE 63
1313
1314VALUE
1316{
1317 VALUE str = str_alloc(rb_cString);
1318
1319 if (capa < STR_BUF_MIN_SIZE) {
1321 }
1323 RSTRING(str)->as.heap.aux.capa = capa;
1324 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1325 RSTRING(str)->as.heap.ptr[0] = '\0';
1326
1327 return str;
1328}
1329
1330VALUE
1332{
1333 VALUE str;
1334 long len = strlen(ptr);
1335
1338
1339 return str;
1340}
1341
1342VALUE
1344{
1345 return str_new(0, 0, len);
1346}
1347
1348void
1350{
1351 if (FL_TEST(str, RSTRING_FSTR)) {
1352 st_data_t fstr = (st_data_t)str;
1354 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1355 }
1356
1357 if (STR_EMBED_P(str)) {
1358 RB_DEBUG_COUNTER_INC(obj_str_embed);
1359 }
1360 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1363 }
1364 else {
1365 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1367 }
1368}
1369
1370RUBY_FUNC_EXPORTED size_t
1372{
1374 return STR_HEAP_SIZE(str);
1375 }
1376 else {
1377 return 0;
1378 }
1379}
1380
1381VALUE
1383{
1384 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1385}
1386
1387static inline void str_discard(VALUE str);
1388static void str_shared_replace(VALUE str, VALUE str2);
1389
1390void
1392{
1393 if (str != str2) str_shared_replace(str, str2);
1394}
1395
1396static void
1397str_shared_replace(VALUE str, VALUE str2)
1398{
1399 rb_encoding *enc;
1400 int cr;
1401 int termlen;
1402
1403 RUBY_ASSERT(str2 != str);
1404 enc = STR_ENC_GET(str2);
1405 cr = ENC_CODERANGE(str2);
1406 str_discard(str);
1407 termlen = rb_enc_mbminlen(enc);
1408
1409 if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1411 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1413 rb_enc_associate(str, enc);
1415 }
1416 else {
1419 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1420 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1421
1422 if (FL_TEST(str2, STR_SHARED)) {
1423 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1425 }
1426 else {
1427 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1428 }
1429
1430 /* abandon str2 */
1431 STR_SET_EMBED(str2);
1432 RSTRING_PTR(str2)[0] = 0;
1433 STR_SET_EMBED_LEN(str2, 0);
1434 rb_enc_associate(str, enc);
1436 }
1437}
1438
1439VALUE
1441{
1442 VALUE str;
1443
1444 if (RB_TYPE_P(obj, T_STRING)) {
1445 return obj;
1446 }
1447 str = rb_funcall(obj, idTo_s, 0);
1449}
1450
1453{
1454 if (!RB_TYPE_P(str, T_STRING))
1455 return rb_any_to_s(obj);
1456 return str;
1457}
1458
/*
 * Makes str refer to the contents of str2 and returns str.
 * When str2 is already a shared string, str is pointed at the same bytes
 * (length and pointer copied, encoding/code range copied exactly);
 * otherwise the work is delegated to str_replace_shared().
 * NOTE(review): embedded lines 1467-1468 and 1471 are missing from this
 * listing; the local `shared` is read but unused in the visible text, so
 * the statements that consume it are among the missing ones.
 */
1459static VALUE
1460str_replace(VALUE str, VALUE str2)
1461{
1462 long len;
1463
1464 len = RSTRING_LEN(str2);
1465 if (STR_SHARED_P(str2)) {
1466 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1469 RSTRING(str)->as.heap.len = len;
1470 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1472 rb_enc_cr_str_exact_copy(str, str2);
1473 }
1474 else {
1475 str_replace_shared(str, str2);
1476 }
1477
1478 return str;
1479}
1480
/*
 * Allocates a new string of class klass that duplicates str.
 * The embedded area of str is copied wholesale first.  If str is a
 * non-embedded string, the duplicate shares a buffer instead: it reuses
 * str's existing shared source, or (for an unfrozen str) creates a frozen
 * copy with str_new_frozen() to share from.  The FL_FREEZE bit is never
 * copied onto the duplicate.
 * NOTE(review): embedded lines 1486-1487 are missing from this listing, so
 * flag_mask is shown with only FL_FREEZE; the code below tests
 * STR_NOEMBED in `flags`, so the mask must contain more bits upstream.
 */
1481static inline VALUE
1482str_duplicate(VALUE klass, VALUE str)
1483{
1484 enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1485 const VALUE flag_mask =
1488 FL_FREEZE
1489 ;
1490 VALUE flags = FL_TEST_RAW(str, flag_mask);
1491 VALUE dup = str_alloc(klass);
/* Raw copy of the embedded storage area. */
1492 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1493 char, embed_size);
1494 if (flags & STR_NOEMBED) {
1495 if (FL_TEST_RAW(str, STR_SHARED)) {
1496 str = RSTRING(str)->as.heap.aux.shared;
1497 }
1498 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1499 str = str_new_frozen(klass, str);
1500 flags = FL_TEST_RAW(str, flag_mask);
1501 }
/* The frozen copy may have become embedded; re-check before sharing. */
1502 if (flags & STR_NOEMBED) {
1503 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1504 flags |= STR_SHARED;
1505 }
1506 else {
1507 MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1508 char, embed_size);
1509 }
1510 }
1511 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1512 return dup;
1513}
1514
/*
 * Duplicates str with str_duplicate(), giving the copy str's own class
 * (rb_obj_class(str)).
 * NOTE(review): the declarator (embedded line 1516) is missing from this
 * listing; presumably rb_str_dup(VALUE str) -- confirm.
 */
1515VALUE
1517{
1518 return str_duplicate(rb_obj_class(str), str);
1519}
1520
/*
 * Duplicates str with str_duplicate(), always using rb_cString as the
 * class of the copy.
 * NOTE(review): embedded lines 1522 (the declarator) and 1524 (one
 * statement before the return) are missing from this listing.
 */
1521VALUE
1523{
1525 return str_duplicate(rb_cString, str);
1526}
1527
1528/*
1529 * call-seq:
1530 * String.new(str="") -> new_str
1531 * String.new(str="", encoding: enc) -> new_str
1532 * String.new(str="", capacity: size) -> new_str
1533 *
1534 * Returns a new string object containing a copy of <i>str</i>.
1535 *
1536 * The optional <i>encoding</i> keyword argument specifies the encoding
1537 * of the new string.
1538 * If not specified, the encoding of <i>str</i> is used
1539 * (or ASCII-8BIT, if <i>str</i> is not specified).
1540 *
1541 * The optional <i>capacity</i> keyword argument specifies the size
1542 * of the internal buffer.
1543 * This may improve performance, when the string will be concatenated many
1544 * times (causing many realloc calls).
1545 */
1546
1547static VALUE
1548rb_str_init(int argc, VALUE *argv, VALUE str)
1549{
/*
 * Implementation notes: accepts an optional source string plus the
 * keyword arguments encoding: and capacity:.  With capacity: the receiver
 * is forced into heap (non-embedded) form with a buffer of that capacity
 * (never smaller than the source string) and the source bytes are copied
 * in; without it the source simply replaces the receiver.  An explicit
 * encoding is associated last.
 * NOTE(review): embedded lines 1575, 1597, 1610 and 1618 are missing from
 * this listing (e.g. the body of the `capa < STR_BUF_MIN_SIZE` test is
 * empty as shown) -- restore from upstream before editing.
 */
1550 static ID keyword_ids[2];
1551 VALUE orig, opt, venc, vcapa;
1552 VALUE kwargs[2];
1553 rb_encoding *enc = 0;
1554 int n;
1555
/* Keyword ids are looked up once and cached in the static array. */
1556 if (!keyword_ids[0]) {
1557 keyword_ids[0] = rb_id_encoding();
1558 CONST_ID(keyword_ids[1], "capacity");
1559 }
1560
1561 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1562 if (!NIL_P(opt)) {
1563 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1564 venc = kwargs[0];
1565 vcapa = kwargs[1];
1566 if (venc != Qundef && !NIL_P(venc)) {
1567 enc = rb_to_encoding(venc);
1568 }
1569 if (vcapa != Qundef && !NIL_P(vcapa)) {
1570 long capa = NUM2LONG(vcapa);
1571 long len = 0;
1572 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1573
1574 if (capa < STR_BUF_MIN_SIZE) {
1576 }
1577 if (n == 1) {
1578 StringValue(orig);
1579 len = RSTRING_LEN(orig);
1580 if (capa < len) {
1581 capa = len;
1582 }
/* Initialising from itself: nothing to copy later. */
1583 if (orig == str) n = 0;
1584 }
1585 str_modifiable(str);
1586 if (STR_EMBED_P(str)) { /* make noembed always */
1587 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1588 memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1);
1589 RSTRING(str)->as.heap.ptr = new_ptr;
1590 }
1591 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/* Buffer is not owned by str: allocate a private one and copy. */
1592 const size_t size = (size_t)capa + termlen;
1593 const char *const old_ptr = RSTRING_PTR(str);
1594 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1595 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1596 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1598 RSTRING(str)->as.heap.ptr = new_ptr;
1599 }
1600 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1601 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1602 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1603 }
1604 RSTRING(str)->as.heap.len = len;
1605 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1606 if (n == 1) {
1607 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1608 rb_enc_cr_str_exact_copy(str, orig);
1609 }
1611 RSTRING(str)->as.heap.aux.capa = capa;
1612 }
1613 else if (n == 1) {
1614 rb_str_replace(str, orig);
1615 }
1616 if (enc) {
1617 rb_enc_associate(str, enc);
1619 }
1620 }
1621 else if (n == 1) {
1622 rb_str_replace(str, orig);
1623 }
1624 return str;
1625}
1626
1627#ifdef NONASCII_MASK
/* Non-zero unless the top two bits of c are 10, i.e. unless (c & 0xC0) == 0x80. */
1628#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1629
1630/*
1631 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1632 * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
1633 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1634 *
1635 * if (!(byte & 0x80))
1636 * byte |= 0x40; // turn on bit6
1637 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1638 *
1639 * This function calculates whether a byte is leading or not for all bytes
1640 * in the argument word by concurrently using the above logic, and then
1641 * adds up the number of leading bytes in the word.
1642 */
1643static inline uintptr_t
1644count_utf8_lead_bytes_with_word(const uintptr_t *s)
1645{
1646 uintptr_t d = *s;
1647
1648 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1649 d = (d>>6) | (~d>>7);
1650 d &= NONASCII_MASK >> 7;
1651
1652 /* Gather all bytes. */
1653#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1654 /* use only if it can use POPCNT */
1655 return rb_popcount_intptr(d);
1656#else
1657 d += (d>>8);
1658 d += (d>>16);
1659# if SIZEOF_VOIDP == 8
1660 d += (d>>32);
1661# endif
1662 return (d&0xF);
1663#endif
1664}
1665#endif
1666
/*
 * Counts the characters in the byte range [p, e) for encoding enc, picking
 * a strategy from the encoding and the caller-supplied code range cr:
 *  - fixed-width encodings (mbmaxlen == mbminlen): byte count divided by
 *    the character width, rounded up when bytes are left over;
 *  - UTF-8 with cr == ENC_CODERANGE_VALID (only when NONASCII_MASK is
 *    defined): counts lead bytes, a machine word at a time where possible;
 *  - other ASCII-compatible encodings: skips ASCII runs with
 *    search_nonascii() and steps over each remaining character;
 *  - everything else: one rb_enc_mbclen() step per character.
 */
1667static inline long
1668enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1669{
1670 long c;
1671 const char *q;
1672
1673 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1674 long diff = (long)(e - p);
/* `!!(remainder)` counts a trailing partial character as one. */
1675 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1676 }
1677#ifdef NONASCII_MASK
1678 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1679 uintptr_t len = 0;
/* Word-wise counting only pays off for more than two words of input. */
1680 if ((int)sizeof(uintptr_t) * 2 < e - p) {
1681 const uintptr_t *s, *t;
1682 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
1683 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1684 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Unaligned head, byte by byte. */
1685 while (p < (const char *)s) {
1686 if (is_utf8_lead_byte(*p)) len++;
1687 p++;
1688 }
/* Aligned middle, one word per step. */
1689 while (s < t) {
1690 len += count_utf8_lead_bytes_with_word(s);
1691 s++;
1692 }
1693 p = (const char *)s;
1694 }
/* Remaining tail (or the whole input when it was short). */
1695 while (p < e) {
1696 if (is_utf8_lead_byte(*p)) len++;
1697 p++;
1698 }
1699 return (long)len;
1700 }
1701#endif
1702 else if (rb_enc_asciicompat(enc)) {
1703 c = 0;
/* Clean code range: the faster rb_enc_fast_mbclen() is used. */
1704 if (ENC_CODERANGE_CLEAN_P(cr)) {
1705 while (p < e) {
1706 if (ISASCII(*p)) {
1707 q = search_nonascii(p, e);
/* Nothing but ASCII left: every remaining byte is a character. */
1708 if (!q)
1709 return c + (e - p);
1710 c += q - p;
1711 p = q;
1712 }
1713 p += rb_enc_fast_mbclen(p, e, enc);
1714 c++;
1715 }
1716 }
1717 else {
1718 while (p < e) {
1719 if (ISASCII(*p)) {
1720 q = search_nonascii(p, e);
1721 if (!q)
1722 return c + (e - p);
1723 c += q - p;
1724 p = q;
1725 }
1726 p += rb_enc_mbclen(p, e, enc);
1727 c++;
1728 }
1729 }
1730 return c;
1731 }
1732
1733 for (c=0; p<e; c++) {
1734 p += rb_enc_mbclen(p, e, enc);
1735 }
1736 return c;
1737}
1738
1739long
1740rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1741{
1742 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1743}
1744
1745/* To get strlen with cr
1746 * Note that given cr is not used.
1747 */
1748long
1749rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1750{
1751 long c;
1752 const char *q;
1753 int ret;
1754
1755 *cr = 0;
1756 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1757 long diff = (long)(e - p);
1758 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1759 }
1760 else if (rb_enc_asciicompat(enc)) {
1761 c = 0;
1762 while (p < e) {
1763 if (ISASCII(*p)) {
1764 q = search_nonascii(p, e);
1765 if (!q) {
1766 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1767 return c + (e - p);
1768 }
1769 c += q - p;
1770 p = q;
1771 }
1772 ret = rb_enc_precise_mbclen(p, e, enc);
1773 if (MBCLEN_CHARFOUND_P(ret)) {
1774 *cr |= ENC_CODERANGE_VALID;
1775 p += MBCLEN_CHARFOUND_LEN(ret);
1776 }
1777 else {
1779 p++;
1780 }
1781 c++;
1782 }
1783 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1784 return c;
1785 }
1786
1787 for (c=0; p<e; c++) {
1788 ret = rb_enc_precise_mbclen(p, e, enc);
1789 if (MBCLEN_CHARFOUND_P(ret)) {
1790 *cr |= ENC_CODERANGE_VALID;
1791 p += MBCLEN_CHARFOUND_LEN(ret);
1792 }
1793 else {
1795 if (p + rb_enc_mbminlen(enc) <= e)
1796 p += rb_enc_mbminlen(enc);
1797 else
1798 p = e;
1799 }
1800 }
1801 if (!*cr) *cr = ENC_CODERANGE_7BIT;
1802 return c;
1803}
1804
1805/* enc must be str's enc or rb_enc_check(str, str2) */
1806static long
1807str_strlen(VALUE str, rb_encoding *enc)
1808{
1809 const char *p, *e;
1810 int cr;
1811
1812 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1813 if (!enc) enc = STR_ENC_GET(str);
1814 p = RSTRING_PTR(str);
1815 e = RSTRING_END(str);
1816 cr = ENC_CODERANGE(str);
1817
1818 if (cr == ENC_CODERANGE_UNKNOWN) {
1819 long n = rb_enc_strlen_cr(p, e, enc, &cr);
1820 if (cr) ENC_CODERANGE_SET(str, cr);
1821 return n;
1822 }
1823 else {
1824 return enc_strlen(p, e, enc, cr);
1825 }
1826}
1827
/*
 * Character length of str as a C long; passes a NULL encoding so that
 * str_strlen() looks the encoding up from str itself.
 * NOTE(review): the declarator (embedded line 1829) is missing from this
 * listing -- confirm the name and parameter list against upstream.
 */
1828long
1830{
1831 return str_strlen(str, NULL);
1832}
1833
1834/*
1835 * call-seq:
1836 * str.length -> integer
1837 * str.size -> integer
1838 *
1839 * Returns the character length of <i>str</i>.
1840 */
1841
1842VALUE
1844{
/*
 * Boxes str_strlen(str, NULL) with LONG2NUM.
 * NOTE(review): the declarator (embedded line 1843) is missing from this
 * listing -- confirm the name and parameter list against upstream.
 */
1845 return LONG2NUM(str_strlen(str, NULL));
1846}
1847
1848/*
1849 * call-seq:
1850 * str.bytesize -> integer
1851 *
1852 * Returns the length of +str+ in bytes.
1853 *
1854 * "\x80\u3042".bytesize #=> 4
1855 * "hello".bytesize #=> 5
1856 */
1857
1858static VALUE
1859rb_str_bytesize(VALUE str)
1860{
1861 return LONG2NUM(RSTRING_LEN(str));
1862}
1863
1864/*
1865 * call-seq:
1866 * str.empty? -> true or false
1867 *
1868 * Returns <code>true</code> if <i>str</i> has a length of zero.
1869 *
1870 * "hello".empty? #=> false
1871 * " ".empty? #=> false
1872 * "".empty? #=> true
1873 */
1874
1875static VALUE
1876rb_str_empty(VALUE str)
1877{
1878 if (RSTRING_LEN(str) == 0)
1879 return Qtrue;
1880 return Qfalse;
1881}
1882
1883/*
1884 * call-seq:
1885 * str + other_str -> new_str
1886 *
1887 * Concatenation---Returns a new String containing
1888 * <i>other_str</i> concatenated to <i>str</i>.
1889 *
1890 * "Hello from " + self.to_s #=> "Hello from main"
1891 */
1892
1893VALUE
1895{
/*
 * Builds a new rb_cString holding str1's bytes followed by str2's.
 * str2 is coerced with StringValue(), the two encodings must be compatible
 * (rb_enc_check_str), and an ArgumentError is raised when the combined
 * length would exceed LONG_MAX.
 * NOTE(review): embedded lines 1894 (the declarator) and 1916-1917 (two
 * statements after TERM_FILL) are missing from this listing.
 */
1896 VALUE str3;
1897 rb_encoding *enc;
1898 char *ptr1, *ptr2, *ptr3;
1899 long len1, len2;
1900 int termlen;
1901
1902 StringValue(str2);
1903 enc = rb_enc_check_str(str1, str2);
1904 RSTRING_GETMEM(str1, ptr1, len1);
1905 RSTRING_GETMEM(str2, ptr2, len2);
1906 termlen = rb_enc_mbminlen(enc);
/* Overflow guard for len1 + len2. */
1907 if (len1 > LONG_MAX - len2) {
1908 rb_raise(rb_eArgError, "string size too big");
1909 }
1910 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1911 ptr3 = RSTRING_PTR(str3);
1912 memcpy(ptr3, ptr1, len1);
1913 memcpy(ptr3+len1, ptr2, len2);
1914 TERM_FILL(&ptr3[len1+len2], termlen);
1915
/* Keep both operands alive while their raw pointers are in use. */
1918 RB_GC_GUARD(str1);
1919 RB_GC_GUARD(str2);
1920 return str3;
1921}
1922
1923/* A variant of rb_str_plus that does not raise but return Qundef instead. */
/*
 * Both operands are asserted to be plain rb_cString instances.  Qundef is
 * returned when either encoding index is negative, when the two indexes
 * differ, or when the combined length would exceed LONG_MAX; otherwise the
 * work is delegated to rb_str_plus().
 * NOTE(review): the header (embedded lines 1924-1925) is missing from this
 * listing -- restore the return type, name and parameters from upstream.
 */
1926{
1927 assert(RBASIC_CLASS(str1) == rb_cString);
1928 assert(RBASIC_CLASS(str2) == rb_cString);
1929 long len1, len2;
1930 MAYBE_UNUSED(char) *ptr1, *ptr2;
1931 RSTRING_GETMEM(str1, ptr1, len1);
1932 RSTRING_GETMEM(str2, ptr2, len2);
1933 int enc1 = rb_enc_get_index(str1);
1934 int enc2 = rb_enc_get_index(str2);
1935
1936 if (enc1 < 0) {
1937 return Qundef;
1938 }
1939 else if (enc2 < 0) {
1940 return Qundef;
1941 }
1942 else if (enc1 != enc2) {
1943 return Qundef;
1944 }
1945 else if (len1 > LONG_MAX - len2) {
1946 return Qundef;
1947 }
1948 else {
1949 return rb_str_plus(str1, str2);
1950 }
1951
1952}
1953
1954/*
1955 * call-seq:
1956 * str * integer -> new_str
1957 *
1958 * Copy --- Returns a new String containing +integer+ copies of the receiver.
1959 * +integer+ must be greater than or equal to 0.
1960 *
1961 * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1962 * "Ho! " * 0 #=> ""
1963 */
1964
1965VALUE
1967{
/*
 * Returns a new string made of `times` copies of str, in str's class.
 * Fast paths: times == 1 duplicates str; times == 0 yields an empty string
 * with str's encoding; a single NUL byte is repeated by allocating a
 * zero-filled buffer.  A negative count or an overflowing product raises
 * ArgumentError.  The general case copies str once and then doubles the
 * filled prefix until the buffer is full.
 * NOTE(review): the declarator (embedded line 1966) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
1968 VALUE str2;
1969 long n, len;
1970 char *ptr2;
1971 int termlen;
1972
1973 if (times == INT2FIX(1)) {
1974 return rb_str_dup(str);
1975 }
1976 if (times == INT2FIX(0)) {
1977 str2 = str_alloc(rb_obj_class(str));
1978 rb_enc_copy(str2, str);
1979 return str2;
1980 }
1981 len = NUM2LONG(times);
1982 if (len < 0) {
1983 rb_raise(rb_eArgError, "negative argument");
1984 }
/* Repeating a lone NUL byte: a zeroed allocation is already the result. */
1985 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
1986 str2 = str_alloc(rb_obj_class(str));
1987 if (!STR_EMBEDDABLE_P(len, 1)) {
1988 RSTRING(str2)->as.heap.aux.capa = len;
1989 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
1990 STR_SET_NOEMBED(str2);
1991 }
1992 STR_SET_LEN(str2, len);
1993 rb_enc_copy(str2, str);
1994 return str2;
1995 }
/* Overflow guard for len * RSTRING_LEN(str). */
1996 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1997 rb_raise(rb_eArgError, "argument too big");
1998 }
1999
2000 len *= RSTRING_LEN(str);
2001 termlen = TERM_LEN(str);
2002 str2 = str_new0(rb_obj_class(str), 0, len, termlen);
2003 ptr2 = RSTRING_PTR(str2);
2004 if (len) {
2005 n = RSTRING_LEN(str);
2006 memcpy(ptr2, RSTRING_PTR(str), n);
/* Double the filled prefix while a whole doubling still fits... */
2007 while (n <= len/2) {
2008 memcpy(ptr2 + n, ptr2, n);
2009 n *= 2;
2010 }
/* ...then copy whatever remains. */
2011 memcpy(ptr2 + n, ptr2, len-n);
2012 }
2013 STR_SET_LEN(str2, len);
2014 TERM_FILL(&ptr2[len], termlen);
2015 rb_enc_cr_str_copy_for_substr(str2, str);
2016
2017 return str2;
2018}
2019
2020/*
2021 * call-seq:
2022 * str % arg -> new_str
2023 *
2024 * Format---Uses <i>str</i> as a format specification, and returns
2025 * the result of applying it to <i>arg</i>. If the format
2026 * specification contains more than one substitution, then <i>arg</i>
2027 * must be an Array or Hash containing the values to be
2028 * substituted. See Kernel#sprintf for details of the format string.
2029 *
2030 * "%05d" % 123 #=> "00123"
2031 * "%-5s: %016x" % [ "ID", self.object_id ] #=> "ID : 00002b054ec93168"
2032 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
2033 */
2034
2035static VALUE
2036rb_str_format_m(VALUE str, VALUE arg)
2037{
2039
2040 if (!NIL_P(tmp)) {
2041 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2042 }
2043 return rb_str_format(1, &arg, str);
2044}
2045
2046static inline void
2047rb_check_lockedtmp(VALUE str)
2048{
2049 if (FL_TEST(str, STR_TMPLOCK)) {
2050 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2051 }
2052}
2053
/*
 * Raises unless str may be modified.  The visible text only performs the
 * temporary-lock check.
 * NOTE(review): embedded line 2058 (a second statement) is missing from
 * this listing -- restore it from upstream.
 */
2054static inline void
2055str_modifiable(VALUE str)
2056{
2057 rb_check_lockedtmp(str);
2059}
2060
/*
 * Predicate returning 0 or 1 for str.
 * NOTE(review): embedded line 2064 (the `if` condition that selects
 * between the two results) is missing from this listing, so the exact
 * test cannot be read from here -- restore it from upstream.
 */
2061static inline int
2062str_dependent_p(VALUE str)
2063{
2065 return 0;
2066 }
2067 else {
2068 return 1;
2069 }
2070}
2071
2072static inline int
2073str_independent(VALUE str)
2074{
2075 str_modifiable(str);
2076 return !str_dependent_p(str);
2077}
2078
/*
 * Gives str a private copy of its first len bytes with room for `expand`
 * more, terminated with termlen zero bytes.  If the new capacity fits the
 * embedded area the bytes are copied there; otherwise a fresh heap buffer
 * of capa + termlen bytes is allocated and installed.
 * NOTE(review): embedded lines 2090, 2093, 2102 and 2105-2106 are missing
 * from this listing (the braces around the xfree() call are unbalanced as
 * shown) -- restore from upstream before editing.
 */
2079static void
2080str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2081{
2082 char *ptr;
2083 char *oldptr;
2084 long capa = len + expand;
2085
/* Only reachable with a negative expand: never copy more than capa. */
2086 if (len > capa) len = capa;
2087
2088 if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
2089 ptr = RSTRING(str)->as.heap.ptr;
2091 memcpy(RSTRING(str)->as.ary, ptr, len);
2092 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2094 return;
2095 }
2096
2097 ptr = ALLOC_N(char, (size_t)capa + termlen);
2098 oldptr = RSTRING_PTR(str);
2099 if (oldptr) {
2100 memcpy(ptr, oldptr, len);
2101 }
2103 xfree(oldptr);
2104 }
2107 TERM_FILL(ptr + len, termlen);
2108 RSTRING(str)->as.heap.ptr = ptr;
2109 RSTRING(str)->as.heap.len = len;
2110 RSTRING(str)->as.heap.aux.capa = capa;
2111}
2112
/*
 * Prepares str for in-place modification: when str_independent() reports
 * that str is not independent, str_make_independent() is called.
 * NOTE(review): embedded lines 2114 (the declarator) and 2118 (a trailing
 * statement) are missing from this listing.
 */
2113void
2115{
2116 if (!str_independent(str))
2117 str_make_independent(str);
2119}
2120
/*
 * Prepares str for modification with room for `expand` additional bytes.
 * Raises ArgumentError for a negative expand or when len + expand would
 * reach LONG_MAX.  A dependent string gets a private, enlarged copy; an
 * independent one is resized only when expand > 0.
 * NOTE(review): embedded lines 2122 (the declarator) and 2140 (a trailing
 * statement) are missing from this listing.
 */
2121void
2123{
2124 int termlen = TERM_LEN(str);
2125 long len = RSTRING_LEN(str);
2126
2127 if (expand < 0) {
2128 rb_raise(rb_eArgError, "negative expanding string size");
2129 }
2130 if (expand >= LONG_MAX - len) {
2131 rb_raise(rb_eArgError, "string size too big");
2132 }
2133
2134 if (!str_independent(str)) {
2135 str_make_independent_expand(str, len, expand, termlen);
2136 }
2137 else if (expand > 0) {
2138 RESIZE_CAPA_TERM(str, len + expand, termlen);
2139 }
2141}
2142
2143/* As rb_str_modify(), but don't clear coderange */
/*
 * Makes str independent when it is not already.
 * NOTE(review): embedded lines 2149 and 2151 are missing from this
 * listing; the statement that the "Force re-scan later" comment refers to
 * is not visible here.
 */
2144static void
2145str_modify_keep_cr(VALUE str)
2146{
2147 if (!str_independent(str))
2148 str_make_independent(str);
2150 /* Force re-scan later */
2152}
2153
/*
 * Drops str's current contents after checking that it is modifiable; the
 * visible statements reset the heap pointer and length to 0.
 * NOTE(review): embedded lines 2158-2159 (the condition opening the block
 * and, presumably, the release of the old buffer) are missing from this
 * listing -- restore from upstream.
 */
2154static inline void
2155str_discard(VALUE str)
2156{
2157 str_modifiable(str);
2160 RSTRING(str)->as.heap.ptr = 0;
2161 RSTRING(str)->as.heap.len = 0;
2162 }
2163}
2164
/*
 * Raises EncCompatError, naming the encoding, when str's encoding is not
 * ASCII-compatible; returns normally otherwise.
 * NOTE(review): the declarator (embedded line 2166) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2165void
2167{
2168 rb_encoding *enc = rb_enc_get(str);
2169 if (!rb_enc_asciicompat(enc)) {
2170 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2171 }
2172}
2173
/*
 * Ensures *ptr holds a T_STRING: a non-string value is converted with
 * rb_str_to_str() and written back through ptr.  Returns the string.
 * NOTE(review): the declarator (embedded line 2175) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2174VALUE
2176{
2177 VALUE s = *ptr;
2178 if (!RB_TYPE_P(s, T_STRING)) {
2179 s = rb_str_to_str(s);
2180 *ptr = s;
2181 }
2182 return s;
2183}
2184
/*
 * Returns the byte pointer of a string value.
 * NOTE(review): embedded lines 2186 (the declarator) and 2188 (the
 * statement that defines `str`) are missing from this listing.
 */
2185char *
2187{
2189 return RSTRING_PTR(str);
2190}
2191
/*
 * Returns 1 when the first n bytes at s are all zero and 0 as soon as a
 * non-zero byte is found.  A count of zero or less reads nothing and
 * yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2200
2201static const char *
2202str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2203{
2204 const char *e = s + len;
2205
2206 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2207 if (zero_filled(s, minlen)) return s;
2208 }
2209 return 0;
2210}
2211
2212static char *
2213str_fill_term(VALUE str, char *s, long len, int termlen)
2214{
2215 /* This function assumes that (capa + termlen) bytes of memory
2216 * is allocated, like many other functions in this file.
2217 */
2218 if (str_dependent_p(str)) {
2219 if (!zero_filled(s + len, termlen))
2220 str_make_independent_expand(str, len, 0L, termlen);
2221 }
2222 else {
2223 TERM_FILL(s + len, termlen);
2224 return s;
2225 }
2226 return RSTRING_PTR(str);
2227}
2228
/*
 * Adjusts str after its terminator width changes from oldtermlen to
 * termlen bytes.  When the existing allocation has no room for the new
 * terminator the string is reallocated as an independent copy; a dependent
 * string is copied only when the terminator grows; otherwise the recorded
 * capacity is adjusted in place and a wider terminator is zero-filled.
 * NOTE(review): embedded line 2247 is missing from this listing (between
 * the "modify capa" comment and the capa assignment).
 */
2229void
2230rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2231{
/* Total bytes available for payload plus terminator. */
2232 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2233 long len = RSTRING_LEN(str);
2234
2235 assert(capa >= len);
2236 if (capa - len < termlen) {
2237 rb_check_lockedtmp(str);
2238 str_make_independent_expand(str, len, 0L, termlen);
2239 }
2240 else if (str_dependent_p(str)) {
2241 if (termlen > oldtermlen)
2242 str_make_independent_expand(str, len, 0L, termlen);
2243 }
2244 else {
2245 if (!STR_EMBED_P(str)) {
2246 /* modify capa instead of realloc */
2248 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2249 }
2250 if (termlen > oldtermlen) {
2251 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2252 }
2253 }
2254
2255 return;
2256}
2257
2258static char *
2259str_null_check(VALUE str, int *w)
2260{
2261 char *s = RSTRING_PTR(str);
2262 long len = RSTRING_LEN(str);
2263 rb_encoding *enc = rb_enc_get(str);
2264 const int minlen = rb_enc_mbminlen(enc);
2265
2266 if (minlen > 1) {
2267 *w = 1;
2268 if (str_null_char(s, len, minlen, enc)) {
2269 return NULL;
2270 }
2271 return str_fill_term(str, s, len, minlen);
2272 }
2273 *w = 0;
2274 if (!s || memchr(s, 0, len)) {
2275 return NULL;
2276 }
2277 if (s[len]) {
2278 s = str_fill_term(str, s, len, minlen);
2279 }
2280 return s;
2281}
2282
/*
 * Returns str_null_check()'s result for str and discards the "wide" flag:
 * a terminated buffer, or NULL when the contents contain a NUL.
 * NOTE(review): the declarator (embedded line 2284) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2283char *
2285{
2286 int w;
2287 return str_null_check(str, &w);
2288}
2289
/*
 * Returns a NUL-terminated buffer for a string value, raising
 * ArgumentError when the contents contain a NUL ("null char" when the
 * wide-character check was used, "null byte" otherwise).
 * NOTE(review): embedded lines 2291 (the declarator) and 2293 (the
 * statement that defines `str`) are missing from this listing.
 */
2290char *
2292{
2294 int w;
2295 char *s = str_null_check(str, &w);
2296 if (!s) {
2297 if (w) {
2298 rb_raise(rb_eArgError, "string contains null char");
2299 }
2300 rb_raise(rb_eArgError, "string contains null byte");
2301 }
2302 return s;
2303}
2304
2305char *
2306rb_str_fill_terminator(VALUE str, const int newminlen)
2307{
2308 char *s = RSTRING_PTR(str);
2309 long len = RSTRING_LEN(str);
2310 return str_fill_term(str, s, len, newminlen);
2311}
2312
/*
 * Returns str after a conversion step.
 * NOTE(review): embedded lines 2314 (the declarator) and 2316 (the
 * statement that performs the conversion) are missing from this listing;
 * only the final return is visible.
 */
2313VALUE
2315{
2317 return str;
2318}
2319
2320/*
2321 * call-seq:
2322 * String.try_convert(obj) -> string or nil
2323 *
2324 * Try to convert <i>obj</i> into a String, using to_str method.
2325 * Returns converted string or nil if <i>obj</i> cannot be converted
2326 * for any reason.
2327 *
2328 * String.try_convert("str") #=> "str"
2329 * String.try_convert(/re/) #=> nil
2330 */
2331static VALUE
2332rb_str_s_try_convert(VALUE dummy, VALUE str)
2333{
2334 return rb_check_string_type(str);
2335}
2336
/*
 * Advances from p by *nthp characters of encoding enc without going past
 * e, and returns the resulting position.
 *
 * What is stored back into *nthp depends on the branch taken:
 *  - single-byte and fixed-width encodings: *nthp is left unchanged;
 *  - ASCII-compatible encodings: the number of characters that could not
 *    be skipped (0 when the target was reached);
 *  - other encodings: the loop counter after its post-decrement, which is
 *    -1 when every requested character was skipped.
 */
2337static char*
2338str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2339{
2340 long nth = *nthp;
2341 if (rb_enc_mbmaxlen(enc) == 1) {
2342 p += nth;
2343 }
2344 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2345 p += nth * rb_enc_mbmaxlen(enc);
2346 }
2347 else if (rb_enc_asciicompat(enc)) {
2348 const char *p2, *e2;
2349 int n;
2350
2351 while (p < e && 0 < nth) {
/* e2: where we would land if every remaining character were one byte. */
2352 e2 = p + nth;
2353 if (e < e2) {
2354 *nthp = nth;
2355 return (char *)e;
2356 }
2357 if (ISASCII(*p)) {
2358 p2 = search_nonascii(p, e2);
/* All ASCII up to e2: the target is exactly e2. */
2359 if (!p2) {
2360 nth -= e2 - p;
2361 *nthp = nth;
2362 return (char *)e2;
2363 }
2364 nth -= p2 - p;
2365 p = p2;
2366 }
2367 n = rb_enc_mbclen(p, e, enc);
2368 p += n;
2369 nth--;
2370 }
2371 *nthp = nth;
2372 if (nth != 0) {
2373 return (char *)e;
2374 }
2375 return (char *)p;
2376 }
2377 else {
2378 while (p < e && nth--) {
2379 p += rb_enc_mbclen(p, e, enc);
2380 }
2381 }
/* The fixed-width branches may have overshot; clamp to the end. */
2382 if (p > e) p = e;
2383 *nthp = nth;
2384 return (char*)p;
2385}
2386
2387char*
2388rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2389{
2390 return str_nth_len(p, e, &nth, enc);
2391}
2392
2393static char*
2394str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2395{
2396 if (singlebyte)
2397 p += nth;
2398 else {
2399 p = str_nth_len(p, e, &nth, enc);
2400 }
2401 if (!p) return 0;
2402 if (p > e) p = e;
2403 return (char *)p;
2404}
2405
2406/* char offset to byte offset */
2407static long
2408str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2409{
2410 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2411 if (!pp) return e - p;
2412 return pp - p;
2413}
2414
/*
 * Converts the character index pos into a byte offset within str, using
 * str's own encoding and the single-byte shortcut when it applies.
 * NOTE(review): the declarator (embedded line 2416) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2415long
2417{
2418 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2419 STR_ENC_GET(str), single_byte_optimizable(str));
2420}
2421
2422#ifdef NONASCII_MASK
/*
 * Locates the character with index *nthp in [p, e) by counting UTF-8 lead
 * bytes, a machine word at a time while both the input and the remaining
 * count are large enough.  Returns the position where scanning stopped
 * (e when the input ran out) and stores the number of characters still to
 * skip in *nthp.
 */
2423static char *
2424str_utf8_nth(const char *p, const char *e, long *nthp)
2425{
2426 long nth = *nthp;
2427 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2428 const uintptr_t *s, *t;
2429 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2430 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2431 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2432 while (p < (const char *)s) {
2433 if (is_utf8_lead_byte(*p)) nth--;
2434 p++;
2435 }
/*
 * A word holds at most SIZEOF_VOIDP lead bytes, so continuing only while
 * that many characters remain keeps nth from going negative.
 */
2436 do {
2437 nth -= count_utf8_lead_bytes_with_word(s);
2438 s++;
2439 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2440 p = (char *)s;
2441 }
/* Byte-wise tail: stop on the lead byte of the wanted character. */
2442 while (p < e) {
2443 if (is_utf8_lead_byte(*p)) {
2444 if (nth == 0) break;
2445 nth--;
2446 }
2447 p++;
2448 }
2449 *nthp = nth;
2450 return (char *)p;
2451}
2452
/*
 * Byte offset from p of the nth character in the UTF-8 range [p, e), as
 * located by str_utf8_nth(); the leftover count is discarded.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
2459#endif
2460
2461/* byte offset to char offset */
/*
 * Converts the byte offset pos into a character count for str.  Negative
 * offsets and single-byte-optimizable strings are returned unchanged;
 * otherwise the characters in the first pos bytes are counted.
 * NOTE(review): the declarator (embedded line 2463) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2462long
2464{
2465 if (single_byte_optimizable(str) || pos < 0)
2466 return pos;
2467 else {
2468 char *p = RSTRING_PTR(str);
2469 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2470 }
2471}
2472
/*
 * Returns a new string covering len bytes of str starting at byte offset
 * beg.  One branch adjusts the heap pointer and length of a string that
 * reuses str's buffer; the other builds the result some other way.
 * NOTE(review): embedded lines 2479 (rest of the condition), 2481 (where
 * str2 is created for the first branch) and 2487-2488 (the whole else
 * body) are missing from this listing -- restore from upstream.
 */
2473VALUE
2474rb_str_subseq(VALUE str, long beg, long len)
2475{
2476 VALUE str2;
2477
2478 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2480 long olen;
2482 RSTRING(str2)->as.heap.ptr += beg;
2483 olen = RSTRING(str2)->as.heap.len;
2484 if (olen > len) RSTRING(str2)->as.heap.len = len;
2485 }
2486 else {
2489 }
2490
2491 rb_enc_cr_str_copy_for_substr(str2, str);
2492
2493 return str2;
2494}
2495
/*
 * Resolves a character-based substring request on str into bytes.
 * beg is the starting character index (negative counts from the end) and
 * *lenp the requested character count.  On success the start pointer is
 * returned and *lenp is overwritten with the length in bytes; 0 is
 * returned when the request is out of range (negative length, or a start
 * outside the string).
 * NOTE(review): embedded line 2577 (one statement just before the final
 * return) is missing from this listing.
 */
2496char *
2497rb_str_subpos(VALUE str, long beg, long *lenp)
2498{
2499 long len = *lenp;
2500 long slen = -1L;
2501 long blen = RSTRING_LEN(str);
2502 rb_encoding *enc = STR_ENC_GET(str);
2503 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2504
2505 if (len < 0) return 0;
2506 if (!blen) {
2507 len = 0;
2508 }
/* Single-byte strings: character indexes are byte indexes. */
2509 if (single_byte_optimizable(str)) {
2510 if (beg > blen) return 0;
2511 if (beg < 0) {
2512 beg += blen;
2513 if (beg < 0) return 0;
2514 }
2515 if (len > blen - beg)
2516 len = blen - beg;
2517 if (len < 0) return 0;
2518 p = s + beg;
2519 goto end;
2520 }
2521 if (beg < 0) {
2522 if (len > -beg) len = -beg;
/* Near the end of a long string: walk backwards instead of counting all. */
2523 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2524 beg = -beg;
2525 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2526 p = e;
2527 if (!p) return 0;
2528 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2529 if (!p) return 0;
2530 len = e - p;
2531 goto end;
2532 }
2533 else {
2534 slen = str_strlen(str, enc);
2535 beg += slen;
2536 if (beg < 0) return 0;
2537 p = s + beg;
2538 if (len == 0) goto end;
2539 }
2540 }
/* A character index can never exceed the byte length. */
2541 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2542 return 0;
2543 }
2544 if (len == 0) {
2545 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2546 p = s + beg;
2547 }
2548#ifdef NONASCII_MASK
2549 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2550 enc == rb_utf8_encoding()) {
2551 p = str_utf8_nth(s, e, &beg);
2552 if (beg > 0) return 0;
2553 len = str_utf8_offset(p, e, len);
2554 }
2555#endif
2556 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2557 int char_sz = rb_enc_mbmaxlen(enc);
2558
2559 p = s + beg * char_sz;
2560 if (p > e) {
2561 return 0;
2562 }
2563 else if (len * char_sz > e - p)
2564 len = e - p;
2565 else
2566 len *= char_sz;
2567 }
2568 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2569 if (beg > 0) return 0;
2570 len = 0;
2571 }
2572 else {
2573 len = str_offset(p, e, len, enc, 0);
2574 }
2575 end:
2576 *lenp = len;
2578 return p;
2579}
2580
2581static VALUE str_substr(VALUE str, long beg, long len, int empty);
2582
2583VALUE
2584rb_str_substr(VALUE str, long beg, long len)
2585{
2586 return str_substr(str, beg, len, TRUE);
2587}
2588
/*
 * Builds the substring of str described by character index beg and
 * character count len.  Returns Qnil when rb_str_subpos() rejects the
 * request, or when the result would be empty and `empty` is false.
 * Depending on the (partly missing) condition the result either shares a
 * frozen copy of str's buffer or is a fresh copy of the bytes.
 * NOTE(review): embedded lines 2597 (rest of the condition) and 2608 (one
 * statement in the else branch) are missing from this listing.
 */
2589static VALUE
2590str_substr(VALUE str, long beg, long len, int empty)
2591{
2592 VALUE str2;
2593 char *p = rb_str_subpos(str, beg, &len);
2594
2595 if (!p) return Qnil;
2596 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2598 long ofs = p - RSTRING_PTR(str);
2599 str2 = rb_str_new_frozen(str);
2600 str2 = str_new_shared(rb_obj_class(str2), str2);
2601 RSTRING(str2)->as.heap.ptr += ofs;
2602 RSTRING(str2)->as.heap.len = len;
2603 ENC_CODERANGE_CLEAR(str2);
2604 }
2605 else {
2606 if (!len && !empty) return Qnil;
2607 str2 = rb_str_new_with_class(str, p, len);
2609 }
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2611
2612 return str2;
2613}
2614
/*
 * Freezes str: an already frozen string is returned as is, otherwise
 * rb_obj_freeze() is applied.
 * NOTE(review): embedded lines 2616 (the declarator) and 2619 (a statement
 * executed before freezing) are missing from this listing.
 */
2615VALUE
2617{
2618 if (OBJ_FROZEN(str)) return str;
2620 return rb_obj_freeze(str);
2621}
2622
2623
2624/*
2625 * call-seq:
2626 * +str -> str (mutable)
2627 *
2628 * If the string is frozen, then return duplicated mutable string.
2629 *
2630 * If the string is not frozen, then return the string itself.
2631 */
2632static VALUE
2633str_uplus(VALUE str)
2634{
2635 if (OBJ_FROZEN(str)) {
2636 return rb_str_dup(str);
2637 }
2638 else {
2639 return str;
2640 }
2641}
2642
2643/*
2644 * call-seq:
2645 * -str -> str (frozen)
2646 *
2647 * Returns a frozen, possibly pre-existing copy of the string.
2648 *
2649 * The string will be deduplicated as long as it does not have
2650 * any instance variables set on it.
2651 */
2652static VALUE
2653str_uminus(VALUE str)
2654{
2655 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2656 str = rb_str_dup(str);
2657 }
2658 return rb_fstring(str);
2659}
2660
/*
 * Makes rb_str_dup_frozen an alias for rb_str_new_frozen from here on.
 * NOTE(review): embedded line 2661, directly above, is missing from this
 * listing.
 */
2662#define rb_str_dup_frozen rb_str_new_frozen
2663
/*
 * Temporarily locks str; raises RuntimeError when STR_TMPLOCK is already
 * set.  Returns str.
 * NOTE(review): embedded lines 2665 (the declarator) and 2670 (presumably
 * the statement that sets the flag) are missing from this listing.
 */
2664VALUE
2666{
2667 if (FL_TEST(str, STR_TMPLOCK)) {
2668 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2669 }
2671 return str;
2672}
2673
/*
 * Releases a temporary lock on str; raises RuntimeError when STR_TMPLOCK
 * is not set.  Returns str.
 * NOTE(review): embedded lines 2675 (the declarator) and 2680 (presumably
 * the statement that clears the flag) are missing from this listing.
 */
2674VALUE
2676{
2677 if (!FL_TEST(str, STR_TMPLOCK)) {
2678 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2679 }
2681 return str;
2682}
2683
/*
 * Runs func(arg) under rb_ensure() so that rb_str_unlocktmp(str) is called
 * afterwards whether or not func raises.
 * NOTE(review): embedded lines 2684-2685 (the header) and 2687 (a
 * statement before the rb_ensure call) are missing from this listing.
 */
2686{
2688 return rb_ensure(func, arg, rb_str_unlocktmp, str);
2689}
2690
/*
 * Sets the length of str to len without reallocating.  Raises RuntimeError
 * for a shared string and reports an interpreter bug (rb_bug) when len is
 * negative or exceeds the current capacity; the new end is then
 * terminated.
 * NOTE(review): embedded lines 2692 (the declarator) and 2704 (presumably
 * the statement that stores the new length) are missing from this listing.
 */
2691void
2693{
2694 long capa;
2695 const int termlen = TERM_LEN(str);
2696
2697 str_modifiable(str);
2698 if (STR_SHARED_P(str)) {
2699 rb_raise(rb_eRuntimeError, "can't set length of shared string");
2700 }
2701 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2702 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2703 }
2705 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2706}
2707
/*
 * Resizes str to len bytes and returns str.  A negative len raises
 * ArgumentError.  Depending on the current representation the string stays
 * embedded, moves from heap to embedded storage, becomes an independent
 * heap copy, or has its heap buffer reallocated (when it is too small, or
 * when the slack exceeds min(len, 1024) bytes).
 * NOTE(review): embedded lines 2709 (the declarator), 2719, 2728, 2736 and
 * 2740 are missing from this listing -- e.g. the statements that record
 * the new embedded length are not visible.
 */
2708VALUE
2710{
2711 long slen;
2712 int independent;
2713
2714 if (len < 0) {
2715 rb_raise(rb_eArgError, "negative string size (or size too big)");
2716 }
2717
2718 independent = str_independent(str);
2720 slen = RSTRING_LEN(str);
2721
2722 {
2723 long capa;
2724 const int termlen = TERM_LEN(str);
2725 if (STR_EMBED_P(str)) {
2726 if (len == slen) return str;
2727 if (STR_EMBEDDABLE_P(len, termlen)) {
2729 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2730 return str;
2731 }
2732 str_make_independent_expand(str, slen, len - slen, termlen);
2733 }
2734 else if (STR_EMBEDDABLE_P(len, termlen)) {
/* Heap string that now fits the embedded area: move the bytes back. */
2735 char *ptr = STR_HEAP_PTR(str);
2737 if (slen > len) slen = len;
2738 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2739 TERM_FILL(RSTRING(str)->as.ary + len, termlen);
/* Only a buffer owned by str may be freed. */
2741 if (independent) ruby_xfree(ptr);
2742 return str;
2743 }
2744 else if (!independent) {
2745 if (len == slen) return str;
2746 str_make_independent_expand(str, slen, len - slen, termlen);
2747 }
2748 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2749 (capa - len) > (len < 1024 ? len : 1024)) {
2750 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2751 (size_t)len + termlen, STR_HEAP_SIZE(str));
2752 RSTRING(str)->as.heap.aux.capa = len;
2753 }
2754 else if (len == slen) return str;
2755 RSTRING(str)->as.heap.len = len;
2756 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2757 }
2758 return str;
2759}
2760
/*
 * Appends len bytes from ptr to str, growing the buffer geometrically when
 * needed, and returns str (0 when len is 0).  ptr may point into str's own
 * buffer: its offset is remembered up front and re-applied after a
 * reallocation.  Raises ArgumentError when the combined length would
 * exceed LONG_MAX.
 * NOTE(review): embedded line 2773 (a statement between the aliasing check
 * and the len == 0 test) is missing from this listing.
 */
2761static VALUE
2762str_buf_cat(VALUE str, const char *ptr, long len)
2763{
2764 long capa, total, olen, off = -1;
2765 char *sptr;
2766 const int termlen = TERM_LEN(str);
2767 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2768
2769 RSTRING_GETMEM(str, sptr, olen);
/* Source lies inside str itself: remember it as an offset. */
2770 if (ptr >= sptr && ptr <= sptr + olen) {
2771 off = ptr - sptr;
2772 }
2774 if (len == 0) return 0;
2775 if (STR_EMBED_P(str)) {
2776 capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2777 sptr = RSTRING(str)->as.ary;
2778 olen = RSTRING_EMBED_LEN(str);
2779 }
2780 else {
2781 capa = RSTRING(str)->as.heap.aux.capa;
2782 sptr = RSTRING(str)->as.heap.ptr;
2783 olen = RSTRING(str)->as.heap.len;
2784 }
2785 if (olen > LONG_MAX - len) {
2786 rb_raise(rb_eArgError, "string sizes too big");
2787 }
2788 total = olen + len;
2789 if (capa < total) {
/* Doubling could overflow for huge totals: allocate exactly instead. */
2790 if (total >= LONG_MAX / 2) {
2791 capa = total;
2792 }
2793 while (total > capa) {
2794 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2795 }
2796 RESIZE_CAPA_TERM(str, capa, termlen);
2797 sptr = RSTRING_PTR(str);
2798 }
/* Re-derive a self-referencing source after a possible move. */
2799 if (off != -1) {
2800 ptr = sptr + off;
2801 }
2802 memcpy(sptr + olen, ptr, len);
2803 STR_SET_LEN(str, total);
2804 TERM_FILL(sptr + total, termlen); /* sentinel */
2805
2806 return str;
2807}
2808
/* Appends the NUL-terminated C string ptr to str; ptr is evaluated twice. */
2809#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2810
2811VALUE
2812rb_str_cat(VALUE str, const char *ptr, long len)
2813{
2814 if (len == 0) return str;
2815 if (len < 0) {
2816 rb_raise(rb_eArgError, "negative string size (or size too big)");
2817 }
2818 return str_buf_cat(str, ptr, len);
2819}
2820
/*
 * Appends the NUL-terminated C string ptr to str after must_not_null()
 * has checked the pointer.
 * NOTE(review): the declarator (embedded line 2822) is missing from this
 * listing -- confirm the name and parameters against upstream.
 */
2821VALUE
2823{
2824 must_not_null(ptr);
2825 return rb_str_buf_cat(str, ptr, strlen(ptr));
2826}
2827
2831
/*
 * Append `len` bytes at `ptr` to `str`, taking encodings and code ranges
 * into account.
 *
 *   ptr_encindex  encoding index of the bytes at `ptr`
 *   ptr_cr        code range of those bytes (may be ENC_CODERANGE_UNKNOWN)
 *   ptr_cr_ret    if non-NULL, receives the code range of `ptr` as known
 *                 after any scan performed here
 *
 * On success the bytes are appended with str_buf_cat() and the resulting
 * encoding index / code range are stored on `str`, which is returned.
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when the two
 * encodings cannot be mixed, and ArgumentError when `len` is negative.
 *
 * NOTE(review): listing lines 2841 and 2855 are missing from this extract.
 * `str_cr` is read below without a visible assignment, so line 2841
 * presumably initializes it; line 2855 presumably performs the append for
 * the "str is empty" early return -- confirm against the original string.c.
 */
2832 static VALUE
2833 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2834 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2835 {
2836 int str_encindex = ENCODING_GET(str);
2837 int res_encindex;
2838 int str_cr, res_cr;
2839 rb_encoding *str_enc, *ptr_enc;
2840
2842
2843 if (str_encindex == ptr_encindex) {
     /* Same encoding: scanning ptr only pays off when str's range is known,
      * because an unknown str range yields an unknown result anyway. */
2844 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2845 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2846 }
2847 }
2848 else {
2849 str_enc = rb_enc_from_index(str_encindex);
2850 ptr_enc = rb_enc_from_index(ptr_encindex);
2851 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
     /* At least one side is not ASCII-compatible: only an empty operand
      * on either side is tolerated. */
2852 if (len == 0)
2853 return str;
2854 if (RSTRING_LEN(str) == 0) {
     /* str is empty: it takes over ptr's encoding and code range. */
2856 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2857 return str;
2858 }
2859 goto incompatible;
2860 }
2861 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2862 ptr_cr = coderange_scan(ptr, len, ptr_enc);
2863 }
2864 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2865 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2866 str_cr = rb_enc_str_coderange(str);
2867 }
2868 }
2869 }
2870 if (ptr_cr_ret)
2871 *ptr_cr_ret = ptr_cr;
2872
     /* Different encodings are only mixable when at least one side is 7bit. */
2873 if (str_encindex != ptr_encindex &&
2874 str_cr != ENC_CODERANGE_7BIT &&
2875 ptr_cr != ENC_CODERANGE_7BIT) {
2876 str_enc = rb_enc_from_index(str_encindex);
2877 ptr_enc = rb_enc_from_index(ptr_encindex);
2878 incompatible:
2879 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2880 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
2881 }
2882
     /* Derive the encoding index and code range of the concatenation. */
2883 if (str_cr == ENC_CODERANGE_UNKNOWN) {
2884 res_encindex = str_encindex;
2885 res_cr = ENC_CODERANGE_UNKNOWN;
2886 }
2887 else if (str_cr == ENC_CODERANGE_7BIT) {
2888 if (ptr_cr == ENC_CODERANGE_7BIT) {
2889 res_encindex = str_encindex;
2890 res_cr = ENC_CODERANGE_7BIT;
2891 }
2892 else {
     /* 7bit str + non-7bit ptr: the result follows ptr. */
2893 res_encindex = ptr_encindex;
2894 res_cr = ptr_cr;
2895 }
2896 }
2897 else if (str_cr == ENC_CODERANGE_VALID) {
2898 res_encindex = str_encindex;
2899 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
2900 res_cr = str_cr;
2901 else
2902 res_cr = ptr_cr;
2903 }
2904 else { /* str_cr == ENC_CODERANGE_BROKEN */
2905 res_encindex = str_encindex;
2906 res_cr = str_cr;
     /* Appending bytes to a broken string resets the range to UNKNOWN --
      * presumably because the new bytes may complete a partial character. */
2907 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2908 }
2909
     /* The length is validated only after the encoding checks above. */
2910 if (len < 0) {
2911 rb_raise(rb_eArgError, "negative string size (or size too big)");
2912 }
2913 str_buf_cat(str, ptr, len);
2914 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2915 return str;
2916 }
2917
/*
 * Append `len` bytes at `ptr`, encoded in `ptr_enc`, to `str` by delegating
 * to rb_enc_cr_str_buf_cat().
 *
 * NOTE(review): listing line 2922 (the remaining arguments of the call) is
 * missing from this extract -- confirm against the original string.c.
 */
2918 VALUE
2919 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2920 {
2921 return rb_enc_cr_str_buf_cat(str, ptr, len,
2923 }
2924
/*
 * Append a NUL-terminated ASCII C string to `str`, whatever encoding `str`
 * has.
 *
 * NOTE(review): listing line 2926 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_buf_cat_ascii(VALUE
 * str, const char *ptr) -- confirm against the original string.c.
 */
2925 VALUE
2927 {
2928 /* ptr must reference NUL terminated ASCII string. */
2929 int encindex = ENCODING_GET(str);
2930 rb_encoding *enc = rb_enc_from_index(encindex);
2931 if (rb_enc_asciicompat(enc)) {
     /* ASCII-compatible target: append the bytes as they are, declared 7bit. */
2932 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2933 encindex, ENC_CODERANGE_7BIT, 0);
2934 }
2935 else {
     /* Otherwise each byte is treated as a code point, encoded into str's
      * encoding in a scratch buffer, and appended one character at a time. */
2936 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2937 while (*ptr) {
2938 unsigned int c = (unsigned char)*ptr;
2939 int len = rb_enc_codelen(c, enc);
2940 rb_enc_mbcput(c, buf, enc);
2941 rb_enc_cr_str_buf_cat(str, buf, len,
2942 encindex, ENC_CODERANGE_VALID, 0);
2943 ptr++;
2944 }
2945 return str;
2946 }
2947 }
2948
/*
 * Append the contents of the string `str2` to `str` and return `str`.
 *
 * NOTE(review): listing line 2950 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_buf_append(VALUE str,
 * VALUE str2) -- confirm against the original string.c.
 */
2949 VALUE
2951 {
2952 int str2_cr;
2953
2954 str2_cr = ENC_CODERANGE(str2);
2955
2956 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2957 ENCODING_GET(str2), str2_cr, &str2_cr);
2958
     /* Store back on str2 whatever code range the callee reported for it. */
2959 ENC_CODERANGE_SET(str2, str2_cr);
2960
2961 return str;
2962 }
2963
/*
 * Run StringValue() on `str2`, then append it to `str` with
 * rb_str_buf_append().
 *
 * NOTE(review): listing line 2965 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_append(VALUE str,
 * VALUE str2) -- confirm against the original string.c.
 */
2964 VALUE
2966 {
2967 StringValue(str2);
2968 return rb_str_buf_append(str, str2);
2969 }
2970
/* Total length (including the initial 1 in `len` below) under which
 * rb_str_concat_literals() starts from a copy of the first string. */
2971 #define MIN_PRE_ALLOC_SIZE 48
2972
/*
 * Concatenate the `num` strings in `strary` into a new string.
 *
 * Returns an empty string for num == 0 and a resurrected copy of the single
 * element for num == 1.
 *
 * NOTE(review): listing lines 2973 (return type), 2989 (presumably the
 * allocation assigned to `str` in the else branch) and 3001 are missing
 * from this extract -- confirm against the original string.c.
 */
2974 rb_str_concat_literals(size_t num, const VALUE *strary)
2975 {
2976 VALUE str;
2977 size_t i, s;
2978 long len = 1;
2979
2980 if (UNLIKELY(!num)) return rb_str_new(0, 0);
2981 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
2982
2983 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
2984 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
     /* Short result: start from a copy of the first piece, append from 1. */
2985 str = rb_str_resurrect(strary[0]);
2986 s = 1;
2987 }
2988 else {
     /* Long result: `str` is set up on the missing line 2989, takes the
      * first piece's encoding, and every piece is appended from index 0. */
2990 rb_enc_copy(str, strary[0]);
2991 s = 0;
2992 }
2993
2994 for (i = s; i < num; ++i) {
2995 const VALUE v = strary[i];
2996 int encidx = ENCODING_GET(v);
2997
2998 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
2999 encidx, ENC_CODERANGE(v), NULL);
3000 if (encidx != ENCINDEX_US_ASCII) {
     /* NOTE(review): line 3001 is missing here; it presumably holds a
      * further condition guarding the call below. */
3002 rb_enc_set_index(str, encidx);
3003 }
3004 }
3005 return str;
3006 }
3007
3008/*
3009 * call-seq:
3010 * str.concat(obj1, obj2, ...) -> str
3011 *
3012 * Concatenates the given object(s) to <i>str</i>. If an object is an
3013 * Integer, it is considered a codepoint and converted to a character
3014 * before concatenation.
3015 *
3016 * +concat+ can take multiple arguments, and all the arguments are
3017 * concatenated in order.
3018 *
3019 * a = "hello "
3020 * a.concat("world", 33) #=> "hello world!"
3021 * a #=> "hello world!"
3022 *
3023 * b = "sn"
3024 * b.concat("_", b, "_", b) #=> "sn_sn_sn"
3025 *
3026 * See also String#<<, which takes a single argument.
3027 */
/*
 * Implementation of String#concat with any number of arguments.
 * Checks that `str` is modifiable, appends the arguments and returns `str`;
 * with argc == 0 nothing is appended.
 */
3028 static VALUE
3029 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3030 {
3031 str_modifiable(str);
3032
3033 if (argc == 1) {
3034 return rb_str_concat(str, argv[0]);
3035 }
3036 else if (argc > 1) {
     /* Collect all arguments in a temporary string first, so an argument
      * that is `str` itself contributes its value from before this call. */
3037 int i;
3038 VALUE arg_str = rb_str_tmp_new(0);
3039 rb_enc_copy(arg_str, str);
3040 for (i = 0; i < argc; i++) {
3041 rb_str_concat(arg_str, argv[i]);
3042 }
3043 rb_str_buf_append(str, arg_str);
3044 }
3045
3046 return str;
3047 }
3048
3049/*
3050 * call-seq:
3051 * str << obj -> str
3052 * str << integer -> str
3053 *
3054 * Appends the given object to <i>str</i>. If the object is an
3055 * Integer, it is considered a codepoint and converted to a character
3056 * before being appended.
3057 *
3058 * a = "hello "
3059 * a << "world" #=> "hello world"
3060 * a << 33 #=> "hello world!"
3061 *
3062 * See also String#concat, which takes multiple arguments.
3063 */
/*
 * Append `str2` to `str1` and return `str1`. An Integer `str2` is treated as
 * a code point in str1's encoding; anything else goes to rb_str_append().
 * Raises RangeError for integers that do not fit or are not valid code
 * points.
 *
 * NOTE(review): several listing lines are missing from this extract: 3065
 * (function name and parameter list, presumably rb_str_concat(VALUE str1,
 * VALUE str2)), 3095-3096, 3106, 3109 and 3122 -- confirm against the
 * original string.c.
 */
3064 VALUE
3066 {
3067 unsigned int code;
3068 rb_encoding *enc = STR_ENC_GET(str1);
3069 int encidx;
3070
3071 if (RB_INTEGER_TYPE_P(str2)) {
     /* A zero return from rb_num_to_uint is the accepted case; `code` is
      * then used below. */
3072 if (rb_num_to_uint(str2, &code) == 0) {
3073 }
3074 else if (FIXNUM_P(str2)) {
3075 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3076 }
3077 else {
3078 rb_raise(rb_eRangeError, "bignum out of char range");
3079 }
3080 }
3081 else {
3082 return rb_str_append(str1, str2);
3083 }
3084
3085 encidx = rb_enc_to_index(enc);
3086 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3087 /* US-ASCII automatically extended to ASCII-8BIT */
3088 char buf[1];
3089 buf[0] = (char)code;
3090 if (code > 0xFF) {
3091 rb_raise(rb_eRangeError, "%u out of char range", code);
3092 }
3093 rb_str_cat(str1, buf, 1);
3094 if (encidx == ENCINDEX_US_ASCII && code > 127) {
     /* NOTE(review): body missing (lines 3095-3096); per the comment at
      * line 3087 it presumably switches str1 to ASCII-8BIT. */
3097 }
3098 }
3099 else {
3100 long pos = RSTRING_LEN(str1);
3101 int cr = ENC_CODERANGE(str1);
3102 int len;
3103 char *buf;
3104
3105 switch (len = rb_enc_codelen(code, enc)) {
     /* NOTE(review): the `case` label for this first branch (line 3106) and
      * the line before `case 0:` (3109) are missing from the listing. */
3107 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3108 break;
3110 case 0:
3111 rb_raise(rb_eRangeError, "%u out of char range", code);
3112 break;
3113 }
     /* Encode into a scratch buffer and verify that the bytes form exactly
      * one character of the expected length before touching str1. */
3114 buf = ALLOCA_N(char, len + 1);
3115 rb_enc_mbcput(code, buf, enc);
3116 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3117 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3118 }
3119 rb_str_resize(str1, pos+len);
3120 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3121 if (cr == ENC_CODERANGE_7BIT && code > 127)
     /* NOTE(review): line 3122, the body of the `if` above, is missing; as
      * shown the next statement looks conditional, which is presumably not
      * the original structure. */
3123 ENC_CODERANGE_SET(str1, cr);
3124 }
3125 return str1;
3126 }
3127
3128/*
3129 * call-seq:
3130 * str.prepend(other_str1, other_str2, ...) -> str
3131 *
3132 * Prepend---Prepend the given strings to <i>str</i>.
3133 *
3134 * a = "!"
3135 * a.prepend("hello ", "world") #=> "hello world!"
3136 * a #=> "hello world!"
3137 *
3138 * See also String#concat.
3139 */
3140
/*
 * Implementation of String#prepend with any number of arguments.
 * Checks that `str` is modifiable, inserts the arguments at offset 0 via
 * rb_str_update() and returns `str`; with argc == 0 nothing is inserted.
 */
3141 static VALUE
3142 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3143 {
3144 str_modifiable(str);
3145
3146 if (argc == 1) {
3147 rb_str_update(str, 0L, 0L, argv[0]);
3148 }
3149 else if (argc > 1) {
     /* Join all arguments, in order, in a temporary string and insert the
      * result in a single update. */
3150 int i;
3151 VALUE arg_str = rb_str_tmp_new(0);
3152 rb_enc_copy(arg_str, str);
3153 for (i = 0; i < argc; i++) {
3154 rb_str_append(arg_str, argv[i]);
3155 }
3156 rb_str_update(str, 0L, 0L, arg_str);
3157 }
3158
3159 return str;
3160 }
3161
3164{
3165 int e = ENCODING_GET(str);
3167 e = 0;
3168 }
3169 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3170}
3171
/*
 * Inequality test for two strings: returns non-zero when the byte lengths
 * differ, when the strings are not comparable, or when the bytes differ;
 * returns 0 when they match.
 *
 * NOTE(review): listing line 3173 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_hash_cmp(VALUE str1,
 * VALUE str2) -- confirm against the original string.c.
 */
3172 int
3174 {
3175 long len1, len2;
3176 const char *ptr1, *ptr2;
3177 RSTRING_GETMEM(str1, ptr1, len1);
3178 RSTRING_GETMEM(str2, ptr2, len2);
     /* The checks short-circuit in order; memcmp runs only for equal lengths. */
3179 return (len1 != len2 ||
3180 !rb_str_comparable(str1, str2) ||
3181 memcmp(ptr1, ptr2, len1) != 0);
3182 }
3183
3184/*
3185 * call-seq:
3186 * str.hash -> integer
3187 *
3188 * Returns a hash based on the string's length, content and encoding.
3189 *
3190 * See also Object#hash.
3191 */
3192
/* Implementation of String#hash: computes rb_str_hash(str) and converts the
 * st_index_t result to a VALUE with ST2FIX. */
3193 static VALUE
3194 rb_str_hash_m(VALUE str)
3195 {
3196 st_index_t hval = rb_str_hash(str);
3197 return ST2FIX(hval);
3198 }
3199
/* Minimum of a and b. Each argument may be evaluated twice, so pass
 * side-effect-free expressions only. */
3200 #define lesser(a,b) (((a)>(b))?(b):(a))
3201
/*
 * Decide whether two strings may be compared byte-wise. Returns TRUE when
 * either string is empty, when both have the same encoding index, or when
 * the code ranges allow it (see below); FALSE otherwise.
 *
 * NOTE(review): listing lines 3203 (function name and parameter list,
 * presumably rb_str_comparable(VALUE str1, VALUE str2)), 3217 and 3221 are
 * missing from this extract -- confirm against the original string.c.
 */
3202 int
3204 {
3205 int idx1, idx2;
3206 int rc1, rc2;
3207
3208 if (RSTRING_LEN(str1) == 0) return TRUE;
3209 if (RSTRING_LEN(str2) == 0) return TRUE;
3210 idx1 = ENCODING_GET(str1);
3211 idx2 = ENCODING_GET(str2);
3212 if (idx1 == idx2) return TRUE;
3213 rc1 = rb_enc_str_coderange(str1);
3214 rc2 = rb_enc_str_coderange(str2);
3215 if (rc1 == ENC_CODERANGE_7BIT) {
3216 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
     /* NOTE(review): line 3217 is missing; it presumably guards the return
      * below with a further condition, so the return is not unconditional. */
3218 return TRUE;
3219 }
3220 if (rc2 == ENC_CODERANGE_7BIT) {
     /* NOTE(review): line 3221 is missing; same remark as above. */
3222 return TRUE;
3223 }
3224 return FALSE;
3225 }
3226
/*
 * Three-way comparison of two strings; returns -1, 0 or 1.
 *
 * The common prefix is compared with memcmp. If it is equal, the longer
 * string is greater; for equal lengths, strings that are not comparable are
 * ordered by encoding index, and comparable ones are equal.
 *
 * NOTE(review): listing line 3228 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_cmp(VALUE str1,
 * VALUE str2) -- confirm against the original string.c.
 */
3227 int
3229 {
3230 long len1, len2;
3231 const char *ptr1, *ptr2;
3232 int retval;
3233
3234 if (str1 == str2) return 0;
3235 RSTRING_GETMEM(str1, ptr1, len1);
3236 RSTRING_GETMEM(str2, ptr2, len2);
     /* When ptr1 == ptr2 the memcmp is skipped and `retval` stays unset; it
      * is not read on that path because every branch inside returns. */
3237 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3238 if (len1 == len2) {
3239 if (!rb_str_comparable(str1, str2)) {
3240 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3241 return 1;
3242 return -1;
3243 }
3244 return 0;
3245 }
3246 if (len1 > len2) return 1;
3247 return -1;
3248 }
     /* Normalize memcmp's result to +1 / -1. */
3249 if (retval > 0) return 1;
3250 return -1;
3251 }
3252
3253/*
3254 * call-seq:
3255 * str == obj -> true or false
3256 * str === obj -> true or false
3257 *
3258 * Equality---Returns whether +str+ == +obj+, similar to Object#==.
3259 *
3260 * If +obj+ is not an instance of String but responds to +to_str+, then the
3261 * two strings are compared using <code>obj.==</code>.
3262 *
3263 * Otherwise, returns similarly to String#eql?, comparing length and content.
3264 */
3265
/*
 * NOTE(review): listing line 3267 (function name and parameter list) is
 * missing from this extract; presumably this is rb_str_equal(VALUE str1,
 * VALUE str2) -- confirm against the original string.c.
 */
3266 VALUE
3268 {
3269 if (str1 == str2) return Qtrue;
3270 if (!RB_TYPE_P(str2, T_STRING)) {
     /* Non-String operand: false unless it responds to to_str, in which
      * case the comparison is delegated with the operands swapped. */
3271 if (!rb_respond_to(str2, idTo_str)) {
3272 return Qfalse;
3273 }
3274 return rb_equal(str2, str1);
3275 }
3276 return rb_str_eql_internal(str1, str2);
3277 }
3278
3279/*
3280 * call-seq:
3281 * str.eql?(other) -> true or false
3282 *
3283 * Two strings are equal if they have the same length and content.
3284 */
3285
3288{
3289 if (str1 == str2) return Qtrue;
3290 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3291 return rb_str_eql_internal(str1, str2);
3292}
3293
3294/*
3295 * call-seq:
3296 * string <=> other_string -> -1, 0, +1, or nil
3297 *
3298 * Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is
3299 * less than, equal to, or greater than +other_string+.
3300 *
3301 * +nil+ is returned if the two values are incomparable.
3302 *
3303 * If the strings are of different lengths, and the strings are equal when
3304 * compared up to the shortest length, then the longer string is considered
3305 * greater than the shorter one.
3306 *
3307 * <code><=></code> is the basis for the methods <code><</code>,
3308 * <code><=</code>, <code>></code>, <code>>=</code>, and
3309 * <code>between?</code>, included from module Comparable. The method
3310 * String#== does not use Comparable#==.
3311 *
3312 * "abcdef" <=> "abcde" #=> 1
3313 * "abcdef" <=> "abcdef" #=> 0
3314 * "abcdef" <=> "abcdefg" #=> -1
3315 * "abcdef" <=> "ABCDEF" #=> 1
3316 * "abcdef" <=> 1 #=> nil
3317 */
3318
/*
 * Implementation of String#<=>. If `str2` cannot be converted by
 * rb_check_string_type(), the result of rb_invcmp(str1, str2) is returned;
 * otherwise the int from rb_str_cmp() is returned as a Fixnum.
 */
3319 static VALUE
3320 rb_str_cmp_m(VALUE str1, VALUE str2)
3321 {
3322 int result;
3323 VALUE s = rb_check_string_type(str2);
3324 if (NIL_P(s)) {
3325 return rb_invcmp(str1, str2);
3326 }
3327 result = rb_str_cmp(str1, s);
3328 return INT2FIX(result);
3329 }
3330
/* Forward declarations: both helpers are defined after the public wrappers
 * (rb_str_casecmp / rb_str_casecmp_p) that call them. */
3331 static VALUE str_casecmp(VALUE str1, VALUE str2);
3332 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3333
3334/*
3335 * call-seq:
3336 * str.casecmp(other_str) -> -1, 0, +1, or nil
3337 *
3338 * Case-insensitive version of String#<=>.
3339 * Currently, case-insensitivity only works on characters A-Z/a-z,
3340 * not all of Unicode. This is different from String#casecmp?.
3341 *
3342 * "aBcDeF".casecmp("abcde") #=> 1
3343 * "aBcDeF".casecmp("abcdef") #=> 0
3344 * "aBcDeF".casecmp("abcdefg") #=> -1
3345 * "abcdef".casecmp("ABCDEF") #=> 0
3346 *
3347 * +nil+ is returned if the two strings have incompatible encodings,
3348 * or if +other_str+ is not a string.
3349 *
3350 * "foo".casecmp(2) #=> nil
3351 * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}") #=> nil
3352 */
3353
/* Implementation of String#casecmp: returns nil when `str2` is not
 * convertible by rb_check_string_type(), else the result of str_casecmp(). */
3354 static VALUE
3355 rb_str_casecmp(VALUE str1, VALUE str2)
3356 {
3357 VALUE s = rb_check_string_type(str2);
3358 if (NIL_P(s)) {
3359 return Qnil;
3360 }
3361 return str_casecmp(str1, s);
3362 }
3363
/*
 * Case-insensitive three-way comparison of two strings, folding case with
 * TOLOWER only. Returns Fixnum -1, 0 or 1, or nil when rb_enc_compatible()
 * finds no common encoding.
 */
3364 static VALUE
3365 str_casecmp(VALUE str1, VALUE str2)
3366 {
3367 long len;
3368 rb_encoding *enc;
3369 char *p1, *p1end, *p2, *p2end;
3370
3371 enc = rb_enc_compatible(str1, str2);
3372 if (!enc) {
3373 return Qnil;
3374 }
3375
3376 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3377 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3378 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
     /* Fast path: compare byte by byte, lower-casing only on mismatch. */
3379 while (p1 < p1end && p2 < p2end) {
3380 if (*p1 != *p2) {
3381 unsigned int c1 = TOLOWER(*p1 & 0xff);
3382 unsigned int c2 = TOLOWER(*p2 & 0xff);
3383 if (c1 != c2)
3384 return INT2FIX(c1 < c2 ? -1 : 1);
3385 }
3386 p1++;
3387 p2++;
3388 }
3389 }
3390 else {
     /* General path: walk both strings one character at a time. */
3391 while (p1 < p1end && p2 < p2end) {
3392 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3393 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3394
3395 if (0 <= c1 && 0 <= c2) {
     /* Both rb_enc_ascget results are non-negative: compare lower-cased. */
3396 c1 = TOLOWER(c1);
3397 c2 = TOLOWER(c2);
3398 if (c1 != c2)
3399 return INT2FIX(c1 < c2 ? -1 : 1);
3400 }
3401 else {
     /* Otherwise compare the raw bytes of the two characters, then their
      * byte lengths. */
3402 int r;
3403 l1 = rb_enc_mbclen(p1, p1end, enc);
3404 l2 = rb_enc_mbclen(p2, p2end, enc);
3405 len = l1 < l2 ? l1 : l2;
3406 r = memcmp(p1, p2, len);
3407 if (r != 0)
3408 return INT2FIX(r < 0 ? -1 : 1);
3409 if (l1 != l2)
3410 return INT2FIX(l1 < l2 ? -1 : 1);
3411 }
3412 p1 += l1;
3413 p2 += l2;
3414 }
3415 }
     /* One string is a case-insensitive prefix of the other (or they are
      * equal): decide by byte length. */
3416 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3417 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3418 return INT2FIX(-1);
3419 }
3420
3421/*
3422 * call-seq:
3423 * str.casecmp?(other_str) -> true, false, or nil
3424 *
3425 * Returns +true+ if +str+ and +other_str+ are equal after
3426 * Unicode case folding, +false+ if they are not equal.
3427 *
3428 * "aBcDeF".casecmp?("abcde") #=> false
3429 * "aBcDeF".casecmp?("abcdef") #=> true
3430 * "aBcDeF".casecmp?("abcdefg") #=> false
3431 * "abcdef".casecmp?("ABCDEF") #=> true
3432 * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true
3433 *
3434 * +nil+ is returned if the two strings have incompatible encodings,
3435 * or if +other_str+ is not a string.
3436 *
3437 * "foo".casecmp?(2) #=> nil
3438 * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}") #=> nil
3439 */
3440
/* Implementation of String#casecmp?: returns nil when `str2` is not
 * convertible by rb_check_string_type(), else the result of str_casecmp_p(). */
3441 static VALUE
3442 rb_str_casecmp_p(VALUE str1, VALUE str2)
3443 {
3444 VALUE s = rb_check_string_type(str2);
3445 if (NIL_P(s)) {
3446 return Qnil;
3447 }
3448 return str_casecmp_p(str1, s);
3449 }
3450
/*
 * Compare two strings after case folding. Returns nil when
 * rb_enc_compatible() finds no common encoding; otherwise both strings are
 * passed through rb_str_downcase() with the `sym_fold` option and the
 * results are compared with rb_str_eql().
 */
3451 static VALUE
3452 str_casecmp_p(VALUE str1, VALUE str2)
3453 {
3454 rb_encoding *enc;
3455 VALUE folded_str1, folded_str2;
3456 VALUE fold_opt = sym_fold;
3457
3458 enc = rb_enc_compatible(str1, str2);
3459 if (!enc) {
3460 return Qnil;
3461 }
3462
3463 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3464 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3465
3466 return rb_str_eql(folded_str1, folded_str2);
3467 }
3468
/*
 * Search for `sub_ptr[0, sub_len)` starting at `str_ptr`, looking at
 * `str_len - offset` bytes. A raw hit from rb_memsearch() is accepted only
 * if rb_enc_right_char_head() returns the hit position itself; otherwise
 * the search resumes at the pointer it returned.
 *
 * Returns the hit position plus `offset`, or a negative value when nothing
 * is found.
 */
3469 static long
3470 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3471 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3472 {
3473 const char *search_start = str_ptr;
3474 long pos, search_len = str_len - offset;
3475
3476 for (;;) {
3477 const char *t;
3478 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3479 if (pos < 0) return pos;
3480 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3481 if (t == search_start + pos) break;
     /* Hit rejected: shrink the window and accumulate the skipped bytes in
      * `offset` so the final result stays relative to the caller's origin. */
3482 search_len -= t - search_start;
3483 if (search_len <= 0) return -1;
3484 offset += t - search_start;
3485 search_start = t;
3486 }
3487 return pos + offset;
3488 }
3489
/* rb_str_index: rb_strseq_index() with in_byte == 0. */
3490 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3491
/*
 * Find `sub` in `str`, starting at `offset`.
 *
 *   offset   start position; negative values count from the end
 *   in_byte  non-zero: `offset` and the length checks are in bytes;
 *            zero: lengths come from str_strlen() and `offset` is passed
 *            through str_offset() before the search
 *
 * Returns -1 when `sub` is a broken string, is longer than `str`, or is not
 * found; returns the current `offset` for an empty `sub`; otherwise the
 * value computed by strseq_core(). rb_enc_check() is called first
 * (presumably raising on incompatible encodings -- verify).
 */
3492 static long
3493 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3494 {
3495 const char *str_ptr, *str_ptr_end, *sub_ptr;
3496 long str_len, sub_len;
3497 int single_byte = single_byte_optimizable(str);
3498 rb_encoding *enc;
3499
3500 enc = rb_enc_check(str, sub);
3501 if (is_broken_string(sub)) return -1;
3502
3503 str_ptr = RSTRING_PTR(str);
3504 str_ptr_end = RSTRING_END(str);
3505 str_len = RSTRING_LEN(str);
3506 sub_ptr = RSTRING_PTR(sub);
3507 sub_len = RSTRING_LEN(sub);
3508
3509 if (str_len < sub_len) return -1;
3510
3511 if (offset != 0) {
3512 long str_len_char, sub_len_char;
3513 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3514 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3515 if (offset < 0) {
3516 offset += str_len_char;
3517 if (offset < 0) return -1;
3518 }
     /* Not enough room left after `offset` for `sub` to fit. */
3519 if (str_len_char - offset < sub_len_char) return -1;
3520 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3521 str_ptr += offset;
3522 }
3523 if (sub_len == 0) return offset;
3524
3525 /* need to proceed one character at a time */
3526 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3527 }
3528
3529
3530/*
3531 * call-seq:
3532 * str.index(substring [, offset]) -> integer or nil
3533 * str.index(regexp [, offset]) -> integer or nil
3534 *
3535 * Returns the index of the first occurrence of the given <i>substring</i> or
3536 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3537 * found. If the second parameter is present, it specifies the position in the
3538 * string to begin the search.
3539 *
3540 * "hello".index('e') #=> 1
3541 * "hello".index('lo') #=> 3
3542 * "hello".index('a') #=> nil
3543 * "hello".index(?e) #=> 1
3544 * "hello".index(/[aeiou]/, -3) #=> 4
3545 */
3546
/*
 * Implementation of String#index(sub [, pos]). `sub` may be a Regexp, a
 * String, or an object handled by the generic branch; anything that branch
 * cannot convert raises TypeError. Returns the position as an Integer, or
 * nil when not found.
 *
 * NOTE(review): listing lines 3564, 3586 and 3589 are missing from this
 * extract (the body of the Regexp check for an out-of-range negative pos,
 * the assignment to `tmp`, and the final rb_raise argument) -- confirm
 * against the original string.c.
 */
3547 static VALUE
3548 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3549 {
3550 VALUE sub;
3551 VALUE initpos;
3552 long pos;
3553
     /* "11": one required argument (sub), one optional (initpos). */
3554 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3555 pos = NUM2LONG(initpos);
3556 }
3557 else {
3558 pos = 0;
3559 }
3560 if (pos < 0) {
     /* A negative start counts from the end; still negative means nil. */
3561 pos += str_strlen(str, NULL);
3562 if (pos < 0) {
3563 if (RB_TYPE_P(sub, T_REGEXP)) {
3565 }
3566 return Qnil;
3567 }
3568 }
3569
3570 if (SPECIAL_CONST_P(sub)) goto generic;
3571 switch (BUILTIN_TYPE(sub)) {
3572 case T_REGEXP:
3573 if (pos > str_strlen(str, NULL))
3574 return Qnil;
     /* pos goes through str_offset() before the regexp search and through
      * rb_str_sublen() afterwards -- presumably character index to byte
      * offset and back; verify against those helpers. */
3575 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3576 rb_enc_check(str, sub), single_byte_optimizable(str));
3577
3578 pos = rb_reg_search(sub, str, pos, 0);
3579 pos = rb_str_sublen(str, pos);
3580 break;
3581
3582 generic:
3583 default: {
3584 VALUE tmp;
3585
3587 if (NIL_P(tmp)) {
3588 rb_raise(rb_eTypeError, "type mismatch: %s given",
3590 }
3591 sub = tmp;
3592 }
3593 /* fall through */
3594 case T_STRING:
3595 pos = rb_str_index(str, sub, pos);
3596 pos = rb_str_sublen(str, pos);
3597 break;
3598 }
3599
3600 if (pos == -1) return Qnil;
3601 return LONG2NUM(pos);
3602 }
3603
/*
 * str_rindex: backward search for `sub` in `str`, starting at pointer `s`
 * (whose position index is `pos`). Two implementations follow, chosen by
 * HAVE_MEMRCHR; both return -1 when nothing is found.
 */
3604 #ifdef HAVE_MEMRCHR
/* memrchr variant: look backwards for the first byte of `sub`, accept a hit
 * only where rb_enc_left_char_head() returns the hit itself, then compare
 * all `slen` bytes. A match is returned as rb_str_sublen(str, hit - sbeg);
 * an empty `sub` returns `pos` immediately. */
3605 static long
3606 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3607 {
3608 char *hit, *adjusted;
3609 int c;
3610 long slen, searchlen;
3611 char *sbeg, *e, *t;
3612
3613 slen = RSTRING_LEN(sub);
3614 if (slen == 0) return pos;
3615 sbeg = RSTRING_PTR(str);
3616 e = RSTRING_END(str);
3617 t = RSTRING_PTR(sub);
3618 c = *t & 0xff;
     /* The search window is [sbeg, s], including the byte at s. */
3619 searchlen = s - sbeg + 1;
3620
3621 do {
3622 hit = memrchr(sbeg, c, searchlen);
3623 if (!hit) break;
3624 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3625 if (hit != adjusted) {
     /* Hit rejected: continue the search before the adjusted position. */
3626 searchlen = adjusted - sbeg;
3627 continue;
3628 }
3629 if (memcmp(hit, t, slen) == 0)
3630 return rb_str_sublen(str, hit - sbeg);
3631 searchlen = adjusted - sbeg;
3632 } while (searchlen > 0);
3633
3634 return -1;
3635 }
3636 #else
/* Portable variant: compare at `s`, then step back one character at a time
 * with rb_enc_prev_char(), decrementing `pos`, until position 0 has been
 * tried. A match returns the current `pos`. */
3637 static long
3638 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3639 {
3640 long slen;
3641 char *sbeg, *e, *t;
3642
3643 sbeg = RSTRING_PTR(str);
3644 e = RSTRING_END(str);
3645 t = RSTRING_PTR(sub);
3646 slen = RSTRING_LEN(sub);
3647
3648 while (s) {
3649 if (memcmp(s, t, slen) == 0) {
3650 return pos;
3651 }
3652 if (pos == 0) break;
3653 pos--;
3654 s = rb_enc_prev_char(sbeg, s, e, enc);
3655 }
3656
3657 return -1;
3658 }
3659 #endif
3660
/*
 * Find the last occurrence of `sub` in `str` that starts at or before
 * position `pos`. Returns the value produced by str_rindex(), 0 for a match
 * at the very start, or -1 when `sub` is broken, longer than `str`, or not
 * found.
 */
3661 static long
3662 rb_str_rindex(VALUE str, VALUE sub, long pos)
3663 {
3664 long len, slen;
3665 char *sbeg, *s;
3666 rb_encoding *enc;
3667 int singlebyte;
3668
3669 enc = rb_enc_check(str, sub);
3670 if (is_broken_string(sub)) return -1;
3671 singlebyte = single_byte_optimizable(str);
3672 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3673 slen = str_strlen(sub, enc); /* rb_enc_check */
3674
3675 /* substring longer than string */
3676 if (len < slen) return -1;
     /* Clamp the start so that `sub` still fits between pos and the end. */
3677 if (len - pos < slen) pos = len - slen;
3678 if (len == 0) return pos;
3679
3680 sbeg = RSTRING_PTR(str);
3681
3682 if (pos == 0) {
     /* Only one candidate position is left: the start of the string. */
3683 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3684 return 0;
3685 else
3686 return -1;
3687 }
3688
3689 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3690 return str_rindex(str, sub, s, pos, enc);
3691 }
3692
3693
3694/*
3695 * call-seq:
3696 * str.rindex(substring [, integer]) -> integer or nil
3697 * str.rindex(regexp [, integer]) -> integer or nil
3698 *
3699 * Returns the index of the last occurrence of the given <i>substring</i> or
3700 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3701 * found. If the second parameter is present, it specifies the position in the
3702 * string to end the search---characters beyond this point will not be
3703 * considered.
3704 *
3705 * "hello".rindex('e') #=> 1
3706 * "hello".rindex('l') #=> 3
3707 * "hello".rindex('a') #=> nil
3708 * "hello".rindex(?e) #=> 1
3709 * "hello".rindex(/[aeiou]/, -2) #=> 1
3710 */
3711
/*
 * Implementation of String#rindex(sub [, pos]). `sub` may be a Regexp, a
 * String, or an object handled by the generic branch; anything that branch
 * cannot convert raises TypeError. Returns the position as an Integer, or
 * nil when not found.
 *
 * NOTE(review): listing lines 3726, 3753 and 3756 are missing from this
 * extract (the body of the Regexp check for an out-of-range negative pos,
 * the assignment to `tmp`, and the final rb_raise argument) -- confirm
 * against the original string.c.
 */
3712 static VALUE
3713 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3714 {
3715 VALUE sub;
3716 VALUE vpos;
3717 rb_encoding *enc = STR_ENC_GET(str);
3718 long pos, len = str_strlen(str, enc); /* str's enc */
3719
     /* "11": one required argument (sub), one optional (vpos). */
3720 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3721 pos = NUM2LONG(vpos);
3722 if (pos < 0) {
     /* A negative position counts from the end; still negative means nil. */
3723 pos += len;
3724 if (pos < 0) {
3725 if (RB_TYPE_P(sub, T_REGEXP)) {
3727 }
3728 return Qnil;
3729 }
3730 }
3731 if (pos > len) pos = len;
3732 }
3733 else {
     /* Without a position the search starts from the end of the string. */
3734 pos = len;
3735 }
3736
3737 if (SPECIAL_CONST_P(sub)) goto generic;
3738 switch (BUILTIN_TYPE(sub)) {
3739 case T_REGEXP:
3740 /* enc = rb_get_check(str, sub); */
3741 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3742 enc, single_byte_optimizable(str));
3743
     /* Last argument 1 (String#index passes 0) -- presumably selects a
      * reverse search; verify against rb_reg_search. */
3744 pos = rb_reg_search(sub, str, pos, 1);
3745 pos = rb_str_sublen(str, pos);
3746 if (pos >= 0) return LONG2NUM(pos);
3747 break;
3748
3749 generic:
3750 default: {
3751 VALUE tmp;
3752
3754 if (NIL_P(tmp)) {
3755 rb_raise(rb_eTypeError, "type mismatch: %s given",
3757 }
3758 sub = tmp;
3759 }
3760 /* fall through */
3761 case T_STRING:
3762 pos = rb_str_rindex(str, sub, pos);
3763 if (pos >= 0) return LONG2NUM(pos);
3764 break;
3765 }
3766 return Qnil;
3767 }
3768
3769/*
3770 * call-seq:
3771 * str =~ obj -> integer or nil
3772 *
3773 * Match---If <i>obj</i> is a Regexp, uses it as a pattern to match
3774 * against <i>str</i>, and returns the position where the match starts, or
3775 * <code>nil</code> if there is no match. Otherwise, invokes
3776 * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3777 * <code>=~</code> in Object returns <code>nil</code>.
3778 *
3779 * Note: <code>str =~ regexp</code> is not the same as
3780 * <code>regexp =~ str</code>. Strings captured from named capture groups
3781 * are assigned to local variables only in the second case.
3782 *
3783 * "cat o' 9 tails" =~ /\d/ #=> 7
3784 * "cat o' 9 tails" =~ 9 #=> nil
3785 */
3786
/*
 * Implementation of String#=~ (x =~ y).
 *   - y is a String: raises TypeError.
 *   - y is a Regexp: returns rb_reg_match(y, x).
 *   - anything else, including special constants: calls y.=~(x).
 */
3787 static VALUE
3788 rb_str_match(VALUE x, VALUE y)
3789 {
3790 if (SPECIAL_CONST_P(y)) goto generic;
3791 switch (BUILTIN_TYPE(y)) {
3792 case T_STRING:
3793 rb_raise(rb_eTypeError, "type mismatch: String given");
3794
3795 case T_REGEXP:
3796 return rb_reg_match(y, x);
3797
3798 generic:
3799 default:
3800 return rb_funcall(y, idEqTilde, 1, x);
3801 }
3802 }
3803
3804
/* Forward declaration; the definition is not part of this excerpt. It is
 * used below by rb_str_match_m and rb_str_match_m_p on their pattern
 * argument. */
3805 static VALUE get_pat(VALUE);
3806
3807
3808/*
3809 * call-seq:
3810 * str.match(pattern) -> matchdata or nil
3811 * str.match(pattern, pos) -> matchdata or nil
3812 *
3813 * Converts <i>pattern</i> to a Regexp (if it isn't already one),
3814 * then invokes its <code>match</code> method on <i>str</i>. If the second
3815 * parameter is present, it specifies the position in the string to begin the
3816 * search.
3817 *
3818 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
3819 * 'hello'.match('(.)\1')[0] #=> "ll"
3820 * 'hello'.match(/(.)\1/)[0] #=> "ll"
3821 * 'hello'.match(/(.)\1/, 3) #=> nil
3822 * 'hello'.match('xx') #=> nil
3823 *
3824 * If a block is given, invokes the block with the MatchData if the match
3825 * succeeds, so that you can write
3826 *
3827 * str.match(pat) {|m| ...}
3828 *
3829 * instead of
3830 *
3831 * if m = str.match(pat)
3832 * ...
3833 * end
3834 *
3835 * The return value is a value from block execution in this case.
3836 */
3837
/*
 * Implementation of String#match(pattern [, pos]). Calls `match` on
 * get_pat(pattern) and, when the result is not nil and a block is given,
 * returns the block's value instead.
 */
3838 static VALUE
3839 rb_str_match_m(int argc, VALUE *argv, VALUE str)
3840 {
3841 VALUE re, result;
     /* Only the "no arguments" case is checked here (rb_check_arity raises);
      * extra arguments are forwarded to `match` as they are. */
3842 if (argc < 1)
3843 rb_check_arity(argc, 1, 2);
3844 re = argv[0];
     /* Reuse argv in place: the pattern slot is overwritten with `str`, so
      * the call below is pattern.match(str, *rest). */
3845 argv[0] = str;
3846 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3847 if (!NIL_P(result) && rb_block_given_p()) {
3848 return rb_yield(result);
3849 }
3850 return result;
3851 }
3852
3853/*
3854 * call-seq:
3855 * str.match?(pattern) -> true or false
3856 * str.match?(pattern, pos) -> true or false
3857 *
3858 * Converts _pattern_ to a +Regexp+ (if it isn't already one), then
3859 * returns +true+ or +false+ to indicate whether the regexp matches
3860 * _str_, without updating <code>$~</code> and other
3861 * related variables. If the second parameter is present, it
3862 * specifies the position in the string to begin the search.
3863 *
3864 * "Ruby".match?(/R.../) #=> true
3865 * "Ruby".match?(/R.../, 1) #=> false
3866 * "Ruby".match?(/P.../) #=> false
3867 * $& #=> nil
3868 */
3869
/*
 * Implementation of String#match?(pattern [, pos]). Accepts one or two
 * arguments, converts the pattern with get_pat() and returns the result of
 * rb_reg_match_p(); the start position defaults to 0.
 */
3870 static VALUE
3871 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
3872 {
3873 VALUE re;
3874 rb_check_arity(argc, 1, 2);
3875 re = get_pat(argv[0]);
3876 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3877 }
3878
3884
/*
 * Replace, in place, the `len`-byte character at `p` by the next character
 * of the same byte length in `enc`.
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no successor of
 * that length exists, and NEIGHBOR_NOT_CHAR (wide-character branch only)
 * when the input or the incremented code point is not a valid character.
 */
3885 static enum neighbor_char
3886 enc_succ_char(char *p, long len, rb_encoding *enc)
3887 {
3888 long i;
3889 int l;
3890
3891 if (rb_enc_mbminlen(enc) > 1) {
3892 /* wchar, trivial case */
     /* Decode, add one to the code point, and re-encode if the length is
      * unchanged. */
3893 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3894 if (!MBCLEN_CHARFOUND_P(r)) {
3895 return NEIGHBOR_NOT_CHAR;
3896 }
3897 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
3898 l = rb_enc_code_to_mbclen(c, enc);
3899 if (!l) return NEIGHBOR_NOT_CHAR;
3900 if (l != len) return NEIGHBOR_WRAPPED;
3901 rb_enc_mbcput(c, p, enc);
3902 r = rb_enc_precise_mbclen(p, p + len, enc);
3903 if (!MBCLEN_CHARFOUND_P(r)) {
3904 return NEIGHBOR_NOT_CHAR;
3905 }
3906 return NEIGHBOR_FOUND;
3907 }
     /* Otherwise treat the bytes as a big-endian counter and keep
      * incrementing until they form a valid character of exactly `len`
      * bytes. */
3908 while (1) {
     /* Trailing 0xff bytes roll over to 0; the carry goes to byte i. */
3909 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
3910 p[i] = '\0';
3911 if (i < 0)
3912 return NEIGHBOR_WRAPPED;
3913 ++((unsigned char*)p)[i];
3914 l = rb_enc_precise_mbclen(p, p+len, enc);
3915 if (MBCLEN_CHARFOUND_P(l)) {
3916 l = MBCLEN_CHARFOUND_LEN(l);
3917 if (l == len) {
3918 return NEIGHBOR_FOUND;
3919 }
3920 else {
     /* A shorter character was found: fill the tail with 0xff so the next
      * pass carries straight into the bytes before it. */
3921 memset(p+l, 0xff, len-l);
3922 }
3923 }
3924 if (MBCLEN_INVALID_P(l) && i < len-1) {
     /* Find the longest prefix that is not invalid and fill everything
      * after it with 0xff, so the next pass carries into that prefix. */
3925 long len2;
3926 int l2;
3927 for (len2 = len-1; 0 < len2; len2--) {
3928 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3929 if (!MBCLEN_INVALID_P(l2))
3930 break;
3931 }
3932 memset(p+len2+1, 0xff, len-(len2+1));
3933 }
3934 }
3935 }
3936
/*
 * Replace, in place, the `len`-byte character at `p` by the previous
 * character of the same byte length in `enc`; the mirror image of
 * enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no predecessor of
 * that length exists, and NEIGHBOR_NOT_CHAR (wide-character branch only)
 * when the input is invalid, the code point is 0, or the decremented code
 * point is not a valid character.
 */
3937 static enum neighbor_char
3938 enc_pred_char(char *p, long len, rb_encoding *enc)
3939 {
3940 long i;
3941 int l;
3942 if (rb_enc_mbminlen(enc) > 1) {
3943 /* wchar, trivial case */
3944 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3945 if (!MBCLEN_CHARFOUND_P(r)) {
3946 return NEIGHBOR_NOT_CHAR;
3947 }
3948 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
     /* Code point 0 has no predecessor. */
3949 if (!c) return NEIGHBOR_NOT_CHAR;
3950 --c;
3951 l = rb_enc_code_to_mbclen(c, enc);
3952 if (!l) return NEIGHBOR_NOT_CHAR;
3953 if (l != len) return NEIGHBOR_WRAPPED;
3954 rb_enc_mbcput(c, p, enc);
3955 r = rb_enc_precise_mbclen(p, p + len, enc);
3956 if (!MBCLEN_CHARFOUND_P(r)) {
3957 return NEIGHBOR_NOT_CHAR;
3958 }
3959 return NEIGHBOR_FOUND;
3960 }
     /* Otherwise treat the bytes as a big-endian counter and keep
      * decrementing until they form a valid character of exactly `len`
      * bytes. */
3961 while (1) {
     /* Trailing 0x00 bytes roll under to 0xff; the borrow goes to byte i. */
3962 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
3963 p[i] = '\xff';
3964 if (i < 0)
3965 return NEIGHBOR_WRAPPED;
3966 --((unsigned char*)p)[i];
3967 l = rb_enc_precise_mbclen(p, p+len, enc);
3968 if (MBCLEN_CHARFOUND_P(l)) {
3969 l = MBCLEN_CHARFOUND_LEN(l);
3970 if (l == len) {
3971 return NEIGHBOR_FOUND;
3972 }
3973 else {
     /* A shorter character was found: zero the tail so the next pass
      * borrows straight from the bytes before it. */
3974 memset(p+l, 0, len-l);
3975 }
3976 }
3977 if (MBCLEN_INVALID_P(l) && i < len-1) {
     /* Find the longest prefix that is not invalid and zero everything
      * after it, so the next pass borrows from that prefix. */
3978 long len2;
3979 int l2;
3980 for (len2 = len-1; 0 < len2; len2--) {
3981 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3982 if (!MBCLEN_INVALID_P(l2))
3983 break;
3984 }
3985 memset(p+len2+1, 0, len-(len2+1));
3986 }
3987 }
3988 }
3989
3990/*
3991 overwrite +p+ by succeeding letter in +enc+ and returns
3992 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
3993 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
3994 assuming each ranges are successive, and mbclen
3995 never change in each ranges.
3996 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
3997 character.
3998 */
3999static enum neighbor_char
4000enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4001{
4002 enum neighbor_char ret;
4003 unsigned int c;
4004 int ctype;
4005 int range;
4006 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4007
4008 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4009 int try;
4010 const int max_gaps = 1;
4011
4012 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4014 ctype = ONIGENC_CTYPE_DIGIT;
4015 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4016 ctype = ONIGENC_CTYPE_ALPHA;
4017 else
4018 return NEIGHBOR_NOT_CHAR;
4019
4020 MEMCPY(save, p, char, len);
4021 for (try = 0; try <= max_gaps; ++try) {
4022 ret = enc_succ_char(p, len, enc);
4023 if (ret == NEIGHBOR_FOUND) {
4024 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4025 if (rb_enc_isctype(c, ctype, enc))
4026 return NEIGHBOR_FOUND;
4027 }
4028 }
4029 MEMCPY(p, save, char, len);
4030 range = 1;
4031 while (1) {
4032 MEMCPY(save, p, char, len);
4033 ret = enc_pred_char(p, len, enc);
4034 if (ret == NEIGHBOR_FOUND) {
4035 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4036 if (!rb_enc_isctype(c, ctype, enc)) {
4037 MEMCPY(p, save, char, len);
4038 break;
4039 }
4040 }
4041 else {
4042 MEMCPY(p, save, char, len);
4043 break;
4044 }
4045 range++;
4046 }
4047 if (range == 1) {
4048 return NEIGHBOR_NOT_CHAR;
4049 }
4050
4051 if (ctype != ONIGENC_CTYPE_DIGIT) {
4052 MEMCPY(carry, p, char, len);
4053 return NEIGHBOR_WRAPPED;
4054 }
4055
4056 MEMCPY(carry, p, char, len);
4057 enc_succ_char(carry, len, enc);
4058 return NEIGHBOR_WRAPPED;
4059}
4060
4061
/* Forward declaration; str_succ is defined further down in this file. */
4062 static VALUE str_succ(VALUE str);
4063
4064/*
4065 * call-seq:
4066 * str.succ -> new_str
4067 * str.next -> new_str
4068 *
4069 * Returns the successor to <i>str</i>. The successor is calculated by
4070 * incrementing characters starting from the rightmost alphanumeric (or
4071 * the rightmost character if there are no alphanumerics) in the
4072 * string. Incrementing a digit always results in another digit, and
4073 * incrementing a letter results in another letter of the same case.
4074 * Incrementing nonalphanumerics uses the underlying character set's
4075 * collating sequence.
4076 *
4077 * If the increment generates a ``carry,'' the character to the left of
4078 * it is incremented. This process repeats until there is no carry,
4079 * adding an additional character if necessary.
4080 *
4081 * "abcd".succ #=> "abce"
4082 * "THX1138".succ #=> "THX1139"
4083 * "<<koala>>".succ #=> "<<koalb>>"
4084 * "1999zzz".succ #=> "2000aaa"
4085 * "ZZZ9999".succ #=> "AAAA0000"
4086 * "***".succ #=> "**+"
4087 */
4088
4089VALUE
4091{
4092 VALUE str;
4093 str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
4094 rb_enc_cr_str_copy_for_substr(str, orig);
4095 return str_succ(str);
4096}
4097
4098static VALUE
4099str_succ(VALUE str)
4100{
4101 rb_encoding *enc;
4102 char *sbeg, *s, *e, *last_alnum = 0;
4103 int found_alnum = 0;
4104 long l, slen;
4105 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4106 long carry_pos = 0, carry_len = 1;
4107 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4108
4109 slen = RSTRING_LEN(str);
4110 if (slen == 0) return str;
4111
4112 enc = STR_ENC_GET(str);
4113 sbeg = RSTRING_PTR(str);
4114 s = e = sbeg + slen;
4115
4116 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4117 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4118 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4119 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4120 break;
4121 }
4122 }
4123 l = rb_enc_precise_mbclen(s, e, enc);
4124 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4126 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4127 switch (neighbor) {
4128 case NEIGHBOR_NOT_CHAR:
4129 continue;
4130 case NEIGHBOR_FOUND:
4131 return str;
4132 case NEIGHBOR_WRAPPED:
4133 last_alnum = s;
4134 break;
4135 }
4136 found_alnum = 1;
4137 carry_pos = s - sbeg;
4138 carry_len = l;
4139 }
4140 if (!found_alnum) { /* str contains no alnum */
4141 s = e;
4142 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4143 enum neighbor_char neighbor;
4145 l = rb_enc_precise_mbclen(s, e, enc);
4146 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4148 MEMCPY(tmp, s, char, l);
4149 neighbor = enc_succ_char(tmp, l, enc);
4150 switch (neighbor) {
4151 case NEIGHBOR_FOUND:
4152 MEMCPY(s, tmp, char, l);
4153 return str;
4154 break;
4155 case NEIGHBOR_WRAPPED:
4156 MEMCPY(s, tmp, char, l);
4157 break;
4158 case NEIGHBOR_NOT_CHAR:
4159 break;
4160 }
4161 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4162 /* wrapped to \0...\0. search next valid char. */
4163 enc_succ_char(s, l, enc);
4164 }
4165 if (!rb_enc_asciicompat(enc)) {
4166 MEMCPY(carry, s, char, l);
4167 carry_len = l;
4168 }
4169 carry_pos = s - sbeg;
4170 }
4172 }
4173 RESIZE_CAPA(str, slen + carry_len);
4174 sbeg = RSTRING_PTR(str);
4175 s = sbeg + carry_pos;
4176 memmove(s + carry_len, s, slen - carry_pos);
4177 memmove(s, carry, carry_len);
4178 slen += carry_len;
4179 STR_SET_LEN(str, slen);
4180 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4182 return str;
4183}
4184
4185
4186/*
4187 * call-seq:
4188 * str.succ! -> str
4189 * str.next! -> str
4190 *
4191 * Equivalent to String#succ, but modifies the receiver in place.
4192 */
4193
4194static VALUE
4195rb_str_succ_bang(VALUE str)
4196{
4198 str_succ(str);
4199 return str;
4200}
4201
/*
 * Returns 1 if each of the first len bytes of s satisfies ISDIGIT
 * (vacuously so when len <= 0), and 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4211
4212static int
4213str_upto_i(VALUE str, VALUE arg)
4214{
4215 rb_yield(str);
4216 return 0;
4217}
4218
4219/*
4220 * call-seq:
4221 * str.upto(other_str, exclusive=false) {|s| block } -> str
4222 * str.upto(other_str, exclusive=false) -> an_enumerator
4223 *
4224 * Iterates through successive values, starting at <i>str</i> and
4225 * ending at <i>other_str</i> inclusive, passing each value in turn
4226 * to the block. The String#succ method is used to generate each
4227 * value. If optional second argument exclusive is omitted or is
4228 * false, the last value will be included; otherwise it will be
4229 * excluded.
4230 *
4231 * If no block is given, an enumerator is returned instead.
4232 *
4233 * "a8".upto("b6") {|s| print s, ' ' }
4234 * for s in "a8".."b6"
4235 * print s, ' '
4236 * end
4237 *
4238 * <em>produces:</em>
4239 *
4240 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4241 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4242 *
4243 * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
4244 * both are recognized as decimal numbers. In addition, the width of
4245 * string (e.g. leading zeros) is handled appropriately.
4246 *
4247 * "9".upto("11").to_a #=> ["9", "10", "11"]
4248 * "25".upto("5").to_a #=> []
4249 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
4250 */
4251
4252static VALUE
4253rb_str_upto(int argc, VALUE *argv, VALUE beg)
4254{
4255 VALUE end, exclusive;
4256
4257 rb_scan_args(argc, argv, "11", &end, &exclusive);
4259 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4260}
4261
4262VALUE
4263rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4264{
4265 VALUE current, after_end;
4266 ID succ;
4267 int n, ascii;
4268 rb_encoding *enc;
4269
4270 CONST_ID(succ, "succ");
4271 StringValue(end);
4272 enc = rb_enc_check(beg, end);
4273 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4274 /* single character */
4275 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4276 char c = RSTRING_PTR(beg)[0];
4277 char e = RSTRING_PTR(end)[0];
4278
4279 if (c > e || (excl && c == e)) return beg;
4280 for (;;) {
4281 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4282 if (!excl && c == e) break;
4283 c++;
4284 if (excl && c == e) break;
4285 }
4286 return beg;
4287 }
4288 /* both edges are all digits */
4289 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4290 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4291 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4292 VALUE b, e;
4293 int width;
4294
4295 width = RSTRING_LENINT(beg);
4296 b = rb_str_to_inum(beg, 10, FALSE);
4297 e = rb_str_to_inum(end, 10, FALSE);
4298 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4299 long bi = FIX2LONG(b);
4300 long ei = FIX2LONG(e);
4301 rb_encoding *usascii = rb_usascii_encoding();
4302
4303 while (bi <= ei) {
4304 if (excl && bi == ei) break;
4305 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4306 bi++;
4307 }
4308 }
4309 else {
4310 ID op = excl ? '<' : idLE;
4311 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4312
4313 args[0] = INT2FIX(width);
4314 while (rb_funcall(b, op, 1, e)) {
4315 args[1] = b;
4316 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4317 b = rb_funcallv(b, succ, 0, 0);
4318 }
4319 }
4320 return beg;
4321 }
4322 /* normal case */
4323 n = rb_str_cmp(beg, end);
4324 if (n > 0 || (excl && n == 0)) return beg;
4325
4326 after_end = rb_funcallv(end, succ, 0, 0);
4327 current = rb_str_dup(beg);
4328 while (!rb_str_equal(current, after_end)) {
4329 VALUE next = Qnil;
4330 if (excl || !rb_str_equal(current, end))
4331 next = rb_funcallv(current, succ, 0, 0);
4332 if ((*each)(current, arg)) break;
4333 if (NIL_P(next)) break;
4334 current = next;
4335 StringValue(current);
4336 if (excl && rb_str_equal(current, end)) break;
4337 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4338 break;
4339 }
4340
4341 return beg;
4342}
4343
4344VALUE
4346{
4347 VALUE current;
4348 ID succ;
4349
4350 CONST_ID(succ, "succ");
4351 /* both edges are all digits */
4352 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4353 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4354 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4355 int width = RSTRING_LENINT(beg);
4356 b = rb_str_to_inum(beg, 10, FALSE);
4357 if (FIXNUM_P(b)) {
4358 long bi = FIX2LONG(b);
4359 rb_encoding *usascii = rb_usascii_encoding();
4360
4361 while (FIXABLE(bi)) {
4362 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4363 bi++;
4364 }
4365 b = LONG2NUM(bi);
4366 }
4367 args[0] = INT2FIX(width);
4368 while (1) {
4369 args[1] = b;
4370 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4371 b = rb_funcallv(b, succ, 0, 0);
4372 }
4373 }
4374 /* normal case */
4375 current = rb_str_dup(beg);
4376 while (1) {
4377 VALUE next = rb_funcallv(current, succ, 0, 0);
4378 if ((*each)(current, arg)) break;
4379 current = next;
4380 StringValue(current);
4381 if (RSTRING_LEN(current) == 0)
4382 break;
4383 }
4384
4385 return beg;
4386}
4387
4388static int
4389include_range_i(VALUE str, VALUE arg)
4390{
4391 VALUE *argp = (VALUE *)arg;
4392 if (!rb_equal(str, *argp)) return 0;
4393 *argp = Qnil;
4394 return 1;
4395}
4396
4397VALUE
4399{
4400 beg = rb_str_new_frozen(beg);
4401 StringValue(end);
4402 end = rb_str_new_frozen(end);
4403 if (NIL_P(val)) return Qfalse;
4404 val = rb_check_string_type(val);
4405 if (NIL_P(val)) return Qfalse;
4406 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4409 const char *bp = RSTRING_PTR(beg);
4410 const char *ep = RSTRING_PTR(end);
4411 const char *vp = RSTRING_PTR(val);
4412 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4413 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4414 return Qfalse;
4415 else {
4416 char b = *bp;
4417 char e = *ep;
4418 char v = *vp;
4419
4420 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4421 if (b <= v && v < e) return Qtrue;
4422 if (!RTEST(exclusive) && v == e) return Qtrue;
4423 return Qfalse;
4424 }
4425 }
4426 }
4427#if 0
4428 /* both edges are all digits */
4429 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4430 all_digits_p(bp, RSTRING_LEN(beg)) &&
4431 all_digits_p(ep, RSTRING_LEN(end))) {
4432 /* TODO */
4433 }
4434#endif
4435 }
4436 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4437
4438 return NIL_P(val) ? Qtrue : Qfalse;
4439}
4440
4441static VALUE
4442rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4443{
4444 if (rb_reg_search(re, str, 0, 0) >= 0) {
4445 VALUE match = rb_backref_get();
4446 int nth = rb_reg_backref_number(match, backref);
4447 return rb_reg_nth_match(nth, match);
4448 }
4449 return Qnil;
4450}
4451
4452static VALUE
4453rb_str_aref(VALUE str, VALUE indx)
4454{
4455 long idx;
4456
4457 if (FIXNUM_P(indx)) {
4458 idx = FIX2LONG(indx);
4459 }
4460 else if (RB_TYPE_P(indx, T_REGEXP)) {
4461 return rb_str_subpat(str, indx, INT2FIX(0));
4462 }
4463 else if (RB_TYPE_P(indx, T_STRING)) {
4464 if (rb_str_index(str, indx, 0) != -1)
4465 return rb_str_dup(indx);
4466 return Qnil;
4467 }
4468 else {
4469 /* check if indx is Range */
4470 long beg, len = str_strlen(str, NULL);
4471 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4472 case Qfalse:
4473 break;
4474 case Qnil:
4475 return Qnil;
4476 default:
4477 return rb_str_substr(str, beg, len);
4478 }
4479 idx = NUM2LONG(indx);
4480 }
4481
4482 return str_substr(str, idx, 1, FALSE);
4483}
4484
4485
4486/*
4487 * call-seq:
4488 * str[index] -> new_str or nil
4489 * str[start, length] -> new_str or nil
4490 * str[range] -> new_str or nil
4491 * str[regexp] -> new_str or nil
4492 * str[regexp, capture] -> new_str or nil
4493 * str[match_str] -> new_str or nil
4494 * str.slice(index) -> new_str or nil
4495 * str.slice(start, length) -> new_str or nil
4496 * str.slice(range) -> new_str or nil
4497 * str.slice(regexp) -> new_str or nil
4498 * str.slice(regexp, capture) -> new_str or nil
4499 * str.slice(match_str) -> new_str or nil
4500 *
4501 * Element Reference --- If passed a single +index+, returns a substring of
4502 * one character at that index. If passed a +start+ index and a +length+,
4503 * returns a substring containing +length+ characters starting at the
4504 * +start+ index. If passed a +range+, its beginning and end are interpreted as
4505 * offsets delimiting the substring to be returned.
4506 *
4507 * In these three cases, if an index is negative, it is counted from the end
4508 * of the string. For the +start+ and +range+ cases the starting index
4509 * is just before a character and an index matching the string's size.
4510 * Additionally, an empty string is returned when the starting index for a
4511 * character range is at the end of the string.
4512 *
4513 * Returns +nil+ if the initial index falls outside the string or the length
4514 * is negative.
4515 *
4516 * If a +Regexp+ is supplied, the matching portion of the string is
4517 * returned. If a +capture+, which may be a capture group index or name,
4518 * follows the regular expression, that component of the MatchData is
4519 * returned instead.
4520 *
4521 * If a +match_str+ is given, that string is returned if it occurs in
4522 * the string.
4523 *
4524 * Returns +nil+ if the regular expression does not match or the match string
4525 * cannot be found.
4526 *
4527 * a = "hello there"
4528 *
4529 * a[1] #=> "e"
4530 * a[2, 3] #=> "llo"
4531 * a[2..3] #=> "ll"
4532 *
4533 * a[-3, 2] #=> "er"
4534 * a[7..-2] #=> "her"
4535 * a[-4..-2] #=> "her"
4536 * a[-2..-4] #=> ""
4537 *
4538 * a[11, 0] #=> ""
4539 * a[11] #=> nil
4540 * a[12, 0] #=> nil
4541 * a[12..-1] #=> nil
4542 *
4543 * a[/[aeiou](.)\1/] #=> "ell"
4544 * a[/[aeiou](.)\1/, 0] #=> "ell"
4545 * a[/[aeiou](.)\1/, 1] #=> "l"
4546 * a[/[aeiou](.)\1/, 2] #=> nil
4547 *
4548 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4549 * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
4550 *
4551 * a["lo"] #=> "lo"
4552 * a["bye"] #=> nil
4553 */
4554
4555static VALUE
4556rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4557{
4558 if (argc == 2) {
4559 if (RB_TYPE_P(argv[0], T_REGEXP)) {
4560 return rb_str_subpat(str, argv[0], argv[1]);
4561 }
4562 else {
4563 long beg = NUM2LONG(argv[0]);
4564 long len = NUM2LONG(argv[1]);
4565 return rb_str_substr(str, beg, len);
4566 }
4567 }
4568 rb_check_arity(argc, 1, 2);
4569 return rb_str_aref(str, argv[0]);
4570}
4571
4572VALUE
4574{
4575 char *ptr = RSTRING_PTR(str);
4576 long olen = RSTRING_LEN(str), nlen;
4577
4578 str_modifiable(str);
4579 if (len > olen) len = olen;
4580 nlen = olen - len;
4581 if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
4582 char *oldptr = ptr;
4583 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4585 STR_SET_EMBED_LEN(str, nlen);
4586 ptr = RSTRING(str)->as.ary;
4587 memmove(ptr, oldptr + len, nlen);
4588 if (fl == STR_NOEMBED) xfree(oldptr);
4589 }
4590 else {
4592 ptr = RSTRING(str)->as.heap.ptr += len;
4593 RSTRING(str)->as.heap.len = nlen;
4594 }
4595 ptr[nlen] = 0;
4597 return str;
4598}
4599
4600static void
4601rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4602{
4603 char *sptr;
4604 long slen, vlen = RSTRING_LEN(val);
4605 int cr;
4606
4607 if (beg == 0 && vlen == 0) {
4609 return;
4610 }
4611
4612 str_modify_keep_cr(str);
4613 RSTRING_GETMEM(str, sptr, slen);
4614 if (len < vlen) {
4615 /* expand string */
4616 RESIZE_CAPA(str, slen + vlen - len);
4617 sptr = RSTRING_PTR(str);
4618 }
4619
4621 cr = rb_enc_str_coderange(val);
4622 else
4624
4625 if (vlen != len) {
4626 memmove(sptr + beg + vlen,
4627 sptr + beg + len,
4628 slen - (beg + len));
4629 }
4630 if (vlen < beg && len < 0) {
4631 MEMZERO(sptr + slen, char, -len);
4632 }
4633 if (vlen > 0) {
4634 memmove(sptr + beg, RSTRING_PTR(val), vlen);
4635 }
4636 slen += vlen - len;
4637 STR_SET_LEN(str, slen);
4638 TERM_FILL(&sptr[slen], TERM_LEN(str));
4640}
4641
4642void
4643rb_str_update(VALUE str, long beg, long len, VALUE val)
4644{
4645 long slen;
4646 char *p, *e;
4647 rb_encoding *enc;
4648 int singlebyte = single_byte_optimizable(str);
4649 int cr;
4650
4651 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4652
4653 StringValue(val);
4654 enc = rb_enc_check(str, val);
4655 slen = str_strlen(str, enc); /* rb_enc_check */
4656
4657 if (slen < beg) {
4658 out_of_range:
4659 rb_raise(rb_eIndexError, "index %ld out of string", beg);
4660 }
4661 if (beg < 0) {
4662 if (beg + slen < 0) {
4663 goto out_of_range;
4664 }
4665 beg += slen;
4666 }
4667 assert(beg >= 0);
4668 assert(beg <= slen);
4669 if (len > slen - beg) {
4670 len = slen - beg;
4671 }
4672 str_modify_keep_cr(str);
4673 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4674 if (!p) p = RSTRING_END(str);
4675 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4676 if (!e) e = RSTRING_END(str);
4677 /* error check */
4678 beg = p - RSTRING_PTR(str); /* physical position */
4679 len = e - p; /* physical length */
4680 rb_str_splice_0(str, beg, len, val);
4681 rb_enc_associate(str, enc);
4683 if (cr != ENC_CODERANGE_BROKEN)
4685}
4686
4687#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4688
4689static void
4690rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4691{
4692 int nth;
4693 VALUE match;
4694 long start, end, len;
4695 rb_encoding *enc;
4696 struct re_registers *regs;
4697
4698 if (rb_reg_search(re, str, 0, 0) < 0) {
4699 rb_raise(rb_eIndexError, "regexp not matched");
4700 }
4701 match = rb_backref_get();
4702 nth = rb_reg_backref_number(match, backref);
4703 regs = RMATCH_REGS(match);
4704 if (nth >= regs->num_regs) {
4705 out_of_range:
4706 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4707 }
4708 if (nth < 0) {
4709 if (-nth >= regs->num_regs) {
4710 goto out_of_range;
4711 }
4712 nth += regs->num_regs;
4713 }
4714
4715 start = BEG(nth);
4716 if (start == -1) {
4717 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4718 }
4719 end = END(nth);
4720 len = end - start;
4721 StringValue(val);
4722 enc = rb_enc_check_str(str, val);
4723 rb_str_splice_0(str, start, len, val);
4724 rb_enc_associate(str, enc);
4725}
4726
4727static VALUE
4728rb_str_aset(VALUE str, VALUE indx, VALUE val)
4729{
4730 long idx, beg;
4731
4732 if (FIXNUM_P(indx)) {
4733 idx = FIX2LONG(indx);
4734 num_index:
4735 rb_str_splice(str, idx, 1, val);
4736 return val;
4737 }
4738
4739 if (SPECIAL_CONST_P(indx)) goto generic;
4740 switch (BUILTIN_TYPE(indx)) {
4741 case T_REGEXP:
4742 rb_str_subpat_set(str, indx, INT2FIX(0), val);
4743 return val;
4744
4745 case T_STRING:
4746 beg = rb_str_index(str, indx, 0);
4747 if (beg < 0) {
4748 rb_raise(rb_eIndexError, "string not matched");
4749 }
4750 beg = rb_str_sublen(str, beg);
4751 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4752 return val;
4753
4754 generic:
4755 default:
4756 /* check if indx is Range */
4757 {
4758 long beg, len;
4759 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4760 rb_str_splice(str, beg, len, val);
4761 return val;
4762 }
4763 }
4764 idx = NUM2LONG(indx);
4765 goto num_index;
4766 }
4767}
4768
4769/*
4770 * call-seq:
4771 * str[integer] = new_str
4772 * str[integer, integer] = new_str
4773 * str[range] = aString
4774 * str[regexp] = new_str
4775 * str[regexp, integer] = new_str
4776 * str[regexp, name] = new_str
4777 * str[other_str] = new_str
4778 *
4779 * Element Assignment---Replaces some or all of the content of
4780 * <i>str</i>. The portion of the string affected is determined using
4781 * the same criteria as String#[]. If the replacement string is not
4782 * the same length as the text it is replacing, the string will be
4783 * adjusted accordingly. If the regular expression or string used
4784 * as the index doesn't match a position in the string, IndexError is
4785 * raised. If the regular expression form is used, the optional
4786 * second Integer allows you to specify which portion of the match to
4787 * replace (effectively using the MatchData indexing rules). The forms
4788 * that take an Integer will raise an IndexError if the value is out
4789 * of range; the Range form will raise a RangeError, and the Regexp
4790 * and String will raise an IndexError on negative match.
4791 */
4792
4793static VALUE
4794rb_str_aset_m(int argc, VALUE *argv, VALUE str)
4795{
4796 if (argc == 3) {
4797 if (RB_TYPE_P(argv[0], T_REGEXP)) {
4798 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4799 }
4800 else {
4802 }
4803 return argv[2];
4804 }
4805 rb_check_arity(argc, 2, 3);
4806 return rb_str_aset(str, argv[0], argv[1]);
4807}
4808
4809/*
4810 * call-seq:
4811 * str.insert(index, other_str) -> str
4812 *
4813 * Inserts <i>other_str</i> before the character at the given
4814 * <i>index</i>, modifying <i>str</i>. Negative indices count from the
4815 * end of the string, and insert <em>after</em> the given character.
4816 * The intent is to insert <i>other_str</i> so that it starts at the given
4817 * <i>index</i>.
4818 *
4819 * "abcd".insert(0, 'X') #=> "Xabcd"
4820 * "abcd".insert(3, 'X') #=> "abcXd"
4821 * "abcd".insert(4, 'X') #=> "abcdX"
4822 * "abcd".insert(-3, 'X') #=> "abXcd"
4823 * "abcd".insert(-1, 'X') #=> "abcdX"
4824 */
4825
4826static VALUE
4827rb_str_insert(VALUE str, VALUE idx, VALUE str2)
4828{
4829 long pos = NUM2LONG(idx);
4830
4831 if (pos == -1) {
4832 return rb_str_append(str, str2);
4833 }
4834 else if (pos < 0) {
4835 pos++;
4836 }
4837 rb_str_splice(str, pos, 0, str2);
4838 return str;
4839}
4840
4841
4842/*
4843 * call-seq:
4844 * str.slice!(integer) -> new_str or nil
4845 * str.slice!(integer, integer) -> new_str or nil
4846 * str.slice!(range) -> new_str or nil
4847 * str.slice!(regexp) -> new_str or nil
4848 * str.slice!(other_str) -> new_str or nil
4849 *
4850 * Deletes the specified portion from <i>str</i>, and returns the portion
4851 * deleted.
4852 *
4853 * string = "this is a string"
4854 * string.slice!(2) #=> "i"
4855 * string.slice!(3..6) #=> " is "
4856 * string.slice!(/s.*t/) #=> "sa st"
4857 * string.slice!("r") #=> "r"
4858 * string #=> "thing"
4859 */
4860
4861static VALUE
4862rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
4863{
4864 VALUE result;
4865 VALUE buf[3];
4866 int i;
4867
4868 rb_check_arity(argc, 1, 2);
4869 for (i=0; i<argc; i++) {
4870 buf[i] = argv[i];
4871 }
4872 str_modify_keep_cr(str);
4873 result = rb_str_aref_m(argc, buf, str);
4874 if (!NIL_P(result)) {
4875 buf[i] = rb_str_new(0,0);
4876 rb_str_aset_m(argc+1, buf, str);
4877 }
4878 return result;
4879}
4880
4881static VALUE
4882get_pat(VALUE pat)
4883{
4884 VALUE val;
4885
4886 if (SPECIAL_CONST_P(pat)) goto to_string;
4887 switch (BUILTIN_TYPE(pat)) {
4888 case T_REGEXP:
4889 return pat;
4890
4891 case T_STRING:
4892 break;
4893
4894 default:
4895 to_string:
4896 val = rb_check_string_type(pat);
4897 if (NIL_P(val)) {
4898 Check_Type(pat, T_REGEXP);
4899 }
4900 pat = val;
4901 }
4902
4903 return rb_reg_regcomp(pat);
4904}
4905
4906static VALUE
4907get_pat_quoted(VALUE pat, int check)
4908{
4909 VALUE val;
4910
4911 if (SPECIAL_CONST_P(pat)) goto to_string;
4912 switch (BUILTIN_TYPE(pat)) {
4913 case T_REGEXP:
4914 return pat;
4915
4916 case T_STRING:
4917 break;
4918
4919 default:
4920 to_string:
4921 val = rb_check_string_type(pat);
4922 if (NIL_P(val)) {
4923 Check_Type(pat, T_REGEXP);
4924 }
4925 pat = val;
4926 }
4927 if (check && is_broken_string(pat)) {
4929 }
4930 return pat;
4931}
4932
4933static long
4934rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4935{
4936 if (BUILTIN_TYPE(pat) == T_STRING) {
4937 pos = rb_strseq_index(str, pat, pos, 1);
4938 if (set_backref_str) {
4939 if (pos >= 0) {
4942 }
4943 else {
4945 }
4946 }
4947 return pos;
4948 }
4949 else {
4950 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4951 }
4952}
4953
4954
4955/*
4956 * call-seq:
4957 * str.sub!(pattern, replacement) -> str or nil
4958 * str.sub!(pattern) {|match| block } -> str or nil
4959 *
4960 * Performs the same substitution as String#sub in-place.
4961 *
4962 * Returns +str+ if a substitution was performed or +nil+ if no substitution
4963 * was performed.
4964 */
4965
4966static VALUE
4967rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
4968{
4969 VALUE pat, repl, hash = Qnil;
4970 int iter = 0;
4971 long plen;
4972 int min_arity = rb_block_given_p() ? 1 : 2;
4973 long beg;
4974
4975 rb_check_arity(argc, min_arity, 2);
4976 if (argc == 1) {
4977 iter = 1;
4978 }
4979 else {
4980 repl = argv[1];
4981 hash = rb_check_hash_type(argv[1]);
4982 if (NIL_P(hash)) {
4983 StringValue(repl);
4984 }
4985 }
4986
4987 pat = get_pat_quoted(argv[0], 1);
4988
4989 str_modifiable(str);
4990 beg = rb_pat_search(pat, str, 0, 1);
4991 if (beg >= 0) {
4992 rb_encoding *enc;
4993 int cr = ENC_CODERANGE(str);
4994 long beg0, end0;
4995 VALUE match, match0 = Qnil;
4996 struct re_registers *regs;
4997 char *p, *rp;
4998 long len, rlen;
4999
5000 match = rb_backref_get();
5001 regs = RMATCH_REGS(match);
5002 if (RB_TYPE_P(pat, T_STRING)) {
5003 beg0 = beg;
5004 end0 = beg0 + RSTRING_LEN(pat);
5005 match0 = pat;
5006 }
5007 else {
5008 beg0 = BEG(0);
5009 end0 = END(0);
5010 if (iter) match0 = rb_reg_nth_match(0, match);
5011 }
5012
5013 if (iter || !NIL_P(hash)) {
5015
5016 if (iter) {
5017 repl = rb_obj_as_string(rb_yield(match0));
5018 }
5019 else {
5020 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5021 repl = rb_obj_as_string(repl);
5022 }
5023 str_mod_check(str, p, len);
5025 }
5026 else {
5027 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5028 }
5029
5030 enc = rb_enc_compatible(str, repl);
5031 if (!enc) {
5032 rb_encoding *str_enc = STR_ENC_GET(str);
5034 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5035 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5036 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5037 rb_enc_name(str_enc),
5038 rb_enc_name(STR_ENC_GET(repl)));
5039 }
5040 enc = STR_ENC_GET(repl);
5041 }
5043 rb_enc_associate(str, enc);
5045 int cr2 = ENC_CODERANGE(repl);
5046 if (cr2 == ENC_CODERANGE_BROKEN ||
5047 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5049 else
5050 cr = cr2;
5051 }
5052 plen = end0 - beg0;
5053 rlen = RSTRING_LEN(repl);
5054 len = RSTRING_LEN(str);
5055 if (rlen > plen) {
5056 RESIZE_CAPA(str, len + rlen - plen);
5057 }
5058 p = RSTRING_PTR(str);
5059 if (rlen != plen) {
5060 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5061 }
5062 rp = RSTRING_PTR(repl);
5063 memmove(p + beg0, rp, rlen);
5064 len += rlen - plen;
5068
5069 return str;
5070 }
5071 return Qnil;
5072}
5073
5074
5075/*
5076 * call-seq:
5077 * str.sub(pattern, replacement) -> new_str
5078 * str.sub(pattern, hash) -> new_str
5079 * str.sub(pattern) {|match| block } -> new_str
5080 *
5081 * Returns a copy of +str+ with the _first_ occurrence of +pattern+
5082 * replaced by the second argument. The +pattern+ is typically a Regexp; if
5083 * given as a String, any regular expression metacharacters it contains will
5084 * be interpreted literally, e.g. <code>\d</code> will match a backslash
5085 * followed by 'd', instead of a digit.
5086 *
5087 * If +replacement+ is a String it will be substituted for the matched text.
5088 * It may contain back-references to the pattern's capture groups of the form
5089 * <code>\d</code>, where <i>d</i> is a group number, or
5090 * <code>\k<n></code>, where <i>n</i> is a group name.
5091 * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5092 * <code>\+</code> correspond to special variables, <code>$&</code>,
5093 * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5094 * (See rdoc-ref:regexp.rdoc for details.)
5095 * <code>\0</code> is the same as <code>\&</code>.
5096 * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5097 * Note that, within +replacement+ the special match variables, such as
5098 * <code>$&</code>, will not refer to the current match.
5099 *
5100 * If the second argument is a Hash, and the matched text is one of its keys,
5101 * the corresponding value is the replacement string.
5102 *
5103 * In the block form, the current match string is passed in as a parameter,
5104 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5105 * <code>$&</code>, and <code>$'</code> will be set appropriately.
5106 * (See rdoc-ref:regexp.rdoc for details.)
5107 * The value returned by the block will be substituted for the match on each
5108 * call.
5109 *
5110 * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
5111 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
5112 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
5113 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
5114 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5115 * #=> "Is /bin/bash your preferred shell?"
5116 *
5117 * Note that a string literal consumes backslashes.
5118 * (See rdoc-ref:syntax/literals.rdoc for details about string literals.)
5119 * Back-references are typically preceded by an additional backslash.
5120 * For example, if you want to write a back-reference <code>\&</code> in
5121 * +replacement+ with a double-quoted string literal, you need to write:
5122 * <code>"..\\\\&.."</code>.
5123 * If you want to write a non-back-reference string <code>\&</code> in
5124 * +replacement+, you need first to escape the backslash to prevent
5125 * this method from interpreting it as a back-reference, and then you
5126 * need to escape the backslashes again to prevent a string literal from
5127 * consuming them: <code>"..\\\\\\\\&.."</code>.
5128 * You may want to use the block form to avoid a lot of backslashes.
5129 */
5130
5131static VALUE
5132rb_str_sub(int argc, VALUE *argv, VALUE str)
5133{
5134 str = rb_str_dup(str);
5135 rb_str_sub_bang(argc, argv, str);
5136 return str;
5137}
5138
5139static VALUE
5140str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5141{
5142 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5143 struct re_registers *regs;
5144 long beg, beg0, end0;
5145 long offset, blen, slen, len, last;
5146 enum {STR, ITER, MAP} mode = STR;
5147 char *sp, *cp;
5148 int need_backref = -1;
5149 rb_encoding *str_enc;
5150
5151 switch (argc) {
5152 case 1:
5154 mode = ITER;
5155 break;
5156 case 2:
5157 repl = argv[1];
5158 hash = rb_check_hash_type(argv[1]);
5159 if (NIL_P(hash)) {
5160 StringValue(repl);
5161 }
5162 else {
5163 mode = MAP;
5164 }
5165 break;
5166 default:
5167 rb_error_arity(argc, 1, 2);
5168 }
5169
5170 pat = get_pat_quoted(argv[0], 1);
5171 beg = rb_pat_search(pat, str, 0, need_backref);
5172 if (beg < 0) {
5173 if (bang) return Qnil; /* no match, no substitution */
5174 return rb_str_dup(str);
5175 }
5176
5177 offset = 0;
5178 blen = RSTRING_LEN(str) + 30; /* len + margin */
5179 dest = rb_str_buf_new(blen);
5180 sp = RSTRING_PTR(str);
5181 slen = RSTRING_LEN(str);
5182 cp = sp;
5183 str_enc = STR_ENC_GET(str);
5184 rb_enc_associate(dest, str_enc);
5186
5187 do {
5188 match = rb_backref_get();
5189 regs = RMATCH_REGS(match);
5190 if (RB_TYPE_P(pat, T_STRING)) {
5191 beg0 = beg;
5192 end0 = beg0 + RSTRING_LEN(pat);
5193 match0 = pat;
5194 }
5195 else {
5196 beg0 = BEG(0);
5197 end0 = END(0);
5198 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5199 }
5200
5201 if (mode) {
5202 if (mode == ITER) {
5203 val = rb_obj_as_string(rb_yield(match0));
5204 }
5205 else {
5206 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5207 val = rb_obj_as_string(val);
5208 }
5209 str_mod_check(str, sp, slen);
5210 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5211 rb_raise(rb_eRuntimeError, "block should not cheat");
5212 }
5213 }
5214 else if (need_backref) {
5215 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5216 if (need_backref < 0) {
5217 need_backref = val != repl;
5218 }
5219 }
5220 else {
5221 val = repl;
5222 }
5223
5224 len = beg0 - offset; /* copy pre-match substr */
5225 if (len) {
5226 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5227 }
5228
5229 rb_str_buf_append(dest, val);
5230
5231 last = offset;
5232 offset = end0;
5233 if (beg0 == end0) {
5234 /*
5235 * Always consume at least one character of the input string
5236 * in order to prevent infinite loops.
5237 */
5238 if (RSTRING_LEN(str) <= end0) break;
5240 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5241 offset = end0 + len;
5242 }
5243 cp = RSTRING_PTR(str) + offset;
5244 if (offset > RSTRING_LEN(str)) break;
5245 beg = rb_pat_search(pat, str, offset, need_backref);
5246 } while (beg >= 0);
5247 if (RSTRING_LEN(str) > offset) {
5248 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5249 }
5250 rb_pat_search(pat, str, last, 1);
5251 if (bang) {
5252 str_shared_replace(str, dest);
5253 }
5254 else {
5256 str = dest;
5257 }
5258
5259 return str;
5260}
5261
5262
5263/*
5264 * call-seq:
5265 * str.gsub!(pattern, replacement) -> str or nil
5266 * str.gsub!(pattern, hash) -> str or nil
5267 * str.gsub!(pattern) {|match| block } -> str or nil
5268 * str.gsub!(pattern) -> an_enumerator
5269 *
5270 * Performs the substitutions of String#gsub in place, returning
5271 * <i>str</i>, or <code>nil</code> if no substitutions were
5272 * performed. If no block and no <i>replacement</i> is given, an
5273 * enumerator is returned instead.
5274 */
5275
5276static VALUE
5277rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5278{
5279 str_modify_keep_cr(str);
5280 return str_gsub(argc, argv, str, 1);
5281}
5282
5283
5284/*
5285 * call-seq:
5286 * str.gsub(pattern, replacement) -> new_str
5287 * str.gsub(pattern, hash) -> new_str
5288 * str.gsub(pattern) {|match| block } -> new_str
5289 * str.gsub(pattern) -> enumerator
5290 *
5291 * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5292 * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5293 * typically a Regexp; if given as a String, any
5294 * regular expression metacharacters it contains will be interpreted
5295 * literally, e.g. <code>\d</code> will match a backslash followed by 'd',
5296 * instead of a digit.
5297 *
5298 * If +replacement+ is a String it will be substituted for the matched text.
5299 * It may contain back-references to the pattern's capture groups of the form
5300 * <code>\d</code>, where <i>d</i> is a group number, or
5301 * <code>\k<n></code>, where <i>n</i> is a group name.
5302 * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5303 * <code>\+</code> correspond to special variables, <code>$&</code>,
5304 * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5305 * (See rdoc-ref:regexp.rdoc for details.)
5306 * <code>\0</code> is the same as <code>\&</code>.
5307 * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5308 * Note that, within +replacement+ the special match variables, such as
5309 * <code>$&</code>, will not refer to the current match.
5310 *
5311 * If the second argument is a Hash, and the matched text is one
5312 * of its keys, the corresponding value is the replacement string.
5313 *
5314 * In the block form, the current match string is passed in as a parameter,
5315 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5316 * <code>$&</code>, and <code>$'</code> will be set appropriately.
5317 * (See rdoc-ref:regexp.rdoc for details.)
5318 * The value returned by the block will be substituted for the match on each
5319 * call.
5320 *
5321 * When neither a block nor a second argument is supplied, an
5322 * Enumerator is returned.
5323 *
5324 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5325 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5326 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5327 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5328 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5329 *
5330 * Note that a string literal consumes backslashes.
5331 * (See rdoc-ref:syntax/literals.rdoc for details on string literals.)
5332 * Back-references are typically preceded by an additional backslash.
5333 * For example, if you want to write a back-reference <code>\&</code> in
5334 * +replacement+ with a double-quoted string literal, you need to write:
5335 * <code>"..\\\\&.."</code>.
5336 * If you want to write a non-back-reference string <code>\&</code> in
5337 * +replacement+, you need first to escape the backslash to prevent
5338 * this method from interpreting it as a back-reference, and then you
5339 * need to escape the backslashes again to prevent a string literal from
5340 * consuming them: <code>"..\\\\\\\\&.."</code>.
5341 * You may want to use the block form to avoid a lot of backslashes.
5342 */
5343
5344static VALUE
5345rb_str_gsub(int argc, VALUE *argv, VALUE str)
5346{
5347 return str_gsub(argc, argv, str, 0);
5348}
5349
5350
5351/*
5352 * call-seq:
5353 * str.replace(other_str) -> str
5354 *
5355 * Replaces the contents of <i>str</i> with the corresponding
5356 * values in <i>other_str</i>.
5357 *
5358 * s = "hello" #=> "hello"
5359 * s.replace "world" #=> "world"
5360 */
5361
5362VALUE
5364{
5365 str_modifiable(str);
5366 if (str == str2) return str;
5367
5368 StringValue(str2);
5369 str_discard(str);
5370 return str_replace(str, str2);
5371}
5372
5373/*
5374 * call-seq:
5375 * string.clear -> string
5376 *
5377 * Makes string empty.
5378 *
5379 * a = "abcde"
5380 * a.clear #=> ""
5381 */
5382
5383static VALUE
5384rb_str_clear(VALUE str)
5385{
5386 str_discard(str);
5389 RSTRING_PTR(str)[0] = 0;
5392 else
5394 return str;
5395}
5396
5397/*
5398 * call-seq:
5399 * string.chr -> string
5400 *
5401 * Returns a one-character string at the beginning of the string.
5402 *
5403 * a = "abcde"
5404 * a.chr #=> "a"
5405 */
5406
5407static VALUE
5408rb_str_chr(VALUE str)
5409{
5410 return rb_str_substr(str, 0, 1);
5411}
5412
5413/*
5414 * call-seq:
5415 * str.getbyte(index) -> 0 .. 255
5416 *
5417 * returns the <i>index</i>th byte as an integer.
5418 */
5419static VALUE
5420rb_str_getbyte(VALUE str, VALUE index)
5421{
5422 long pos = NUM2LONG(index);
5423
5424 if (pos < 0)
5425 pos += RSTRING_LEN(str);
5426 if (pos < 0 || RSTRING_LEN(str) <= pos)
5427 return Qnil;
5428
5429 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5430}
5431
5432/*
5433 * call-seq:
5434 * str.setbyte(index, integer) -> integer
5435 *
5436 * modifies the <i>index</i>th byte as <i>integer</i>.
5437 */
5438static VALUE
5439rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5440{
5441 long pos = NUM2LONG(index);
5442 long len = RSTRING_LEN(str);
5443 char *head, *left = 0;
5444 unsigned char *ptr;
5445 rb_encoding *enc;
5446 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5447
5448 if (pos < -len || len <= pos)
5449 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5450 if (pos < 0)
5451 pos += len;
5452
5453 VALUE v = rb_to_int(value);
5454 VALUE w = rb_int_and(v, INT2FIX(0xff));
5455 unsigned char byte = NUM2INT(w) & 0xFF;
5456
5457 if (!str_independent(str))
5458 str_make_independent(str);
5459 enc = STR_ENC_GET(str);
5460 head = RSTRING_PTR(str);
5461 ptr = (unsigned char *)&head[pos];
5462 if (!STR_EMBED_P(str)) {
5463 cr = ENC_CODERANGE(str);
5464 switch (cr) {
5465 case ENC_CODERANGE_7BIT:
5466 left = (char *)ptr;
5467 *ptr = byte;
5468 if (ISASCII(byte)) goto end;
5469 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5470 if (!MBCLEN_CHARFOUND_P(nlen))
5472 else
5474 goto end;
5476 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5477 width = rb_enc_precise_mbclen(left, head+len, enc);
5478 *ptr = byte;
5479 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5480 if (!MBCLEN_CHARFOUND_P(nlen))
5482 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5484 goto end;
5485 }
5486 }
5488 *ptr = byte;
5489
5490 end:
5491 return value;
5492}
5493
5494static VALUE
5495str_byte_substr(VALUE str, long beg, long len, int empty)
5496{
5497 char *p, *s = RSTRING_PTR(str);
5498 long n = RSTRING_LEN(str);
5499 VALUE str2;
5500
5501 if (beg > n || len < 0) return Qnil;
5502 if (beg < 0) {
5503 beg += n;
5504 if (beg < 0) return Qnil;
5505 }
5506 if (len > n - beg)
5507 len = n - beg;
5508 if (len <= 0) {
5509 if (!empty) return Qnil;
5510 len = 0;
5511 p = 0;
5512 }
5513 else
5514 p = s + beg;
5515
5517 str2 = rb_str_new_frozen(str);
5518 str2 = str_new_shared(rb_obj_class(str2), str2);
5519 RSTRING(str2)->as.heap.ptr += beg;
5520 RSTRING(str2)->as.heap.len = len;
5521 }
5522 else {
5523 str2 = rb_str_new_with_class(str, p, len);
5524 }
5525
5526 str_enc_copy(str2, str);
5527
5528 if (RSTRING_LEN(str2) == 0) {
5531 else
5533 }
5534 else {
5535 switch (ENC_CODERANGE(str)) {
5536 case ENC_CODERANGE_7BIT:
5538 break;
5539 default:
5541 break;
5542 }
5543 }
5544
5545 return str2;
5546}
5547
5548static VALUE
5549str_byte_aref(VALUE str, VALUE indx)
5550{
5551 long idx;
5552 if (FIXNUM_P(indx)) {
5553 idx = FIX2LONG(indx);
5554 }
5555 else {
5556 /* check if indx is Range */
5557 long beg, len = RSTRING_LEN(str);
5558
5559 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5560 case Qfalse:
5561 break;
5562 case Qnil:
5563 return Qnil;
5564 default:
5565 return str_byte_substr(str, beg, len, TRUE);
5566 }
5567
5568 idx = NUM2LONG(indx);
5569 }
5570 return str_byte_substr(str, idx, 1, FALSE);
5571}
5572
5573/*
5574 * call-seq:
5575 * str.byteslice(integer) -> new_str or nil
5576 * str.byteslice(integer, integer) -> new_str or nil
5577 * str.byteslice(range) -> new_str or nil
5578 *
5579 * Byte Reference---If passed a single Integer, returns a
5580 * substring of one byte at that position. If passed two Integer
5581 * objects, returns a substring starting at the offset given by the first, and
5582 * a length given by the second. If given a Range, a substring containing
5583 * bytes at offsets given by the range is returned. In all three cases, if
5584 * an offset is negative, it is counted from the end of <i>str</i>. Returns
5585 * <code>nil</code> if the initial offset falls outside the string, the length
5586 * is negative, or the beginning of the range is greater than the end.
5587 * The encoding of the resulted string keeps original encoding.
5588 *
5589 * "hello".byteslice(1) #=> "e"
5590 * "hello".byteslice(-1) #=> "o"
5591 * "hello".byteslice(1, 2) #=> "el"
5592 * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5593 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5594 */
5595
5596static VALUE
5597rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5598{
5599 if (argc == 2) {
5600 long beg = NUM2LONG(argv[0]);
5601 long end = NUM2LONG(argv[1]);
5602 return str_byte_substr(str, beg, end, TRUE);
5603 }
5604 rb_check_arity(argc, 1, 2);
5605 return str_byte_aref(str, argv[0]);
5606}
5607
5608/*
5609 * call-seq:
5610 * str.reverse -> new_str
5611 *
5612 * Returns a new string with the characters from <i>str</i> in reverse order.
5613 *
5614 * "stressed".reverse #=> "desserts"
5615 */
5616
5617static VALUE
5618rb_str_reverse(VALUE str)
5619{
5620 rb_encoding *enc;
5621 VALUE rev;
5622 char *s, *e, *p;
5623 int cr;
5624
5625 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5626 enc = STR_ENC_GET(str);
5628 s = RSTRING_PTR(str); e = RSTRING_END(str);
5629 p = RSTRING_END(rev);
5630 cr = ENC_CODERANGE(str);
5631
5632 if (RSTRING_LEN(str) > 1) {
5633 if (single_byte_optimizable(str)) {
5634 while (s < e) {
5635 *--p = *s++;
5636 }
5637 }
5638 else if (cr == ENC_CODERANGE_VALID) {
5639 while (s < e) {
5640 int clen = rb_enc_fast_mbclen(s, e, enc);
5641
5642 p -= clen;
5643 memcpy(p, s, clen);
5644 s += clen;
5645 }
5646 }
5647 else {
5648 cr = rb_enc_asciicompat(enc) ?
5650 while (s < e) {
5651 int clen = rb_enc_mbclen(s, e, enc);
5652
5653 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5654 p -= clen;
5655 memcpy(p, s, clen);
5656 s += clen;
5657 }
5658 }
5659 }
5661 str_enc_copy(rev, str);
5662 ENC_CODERANGE_SET(rev, cr);
5663
5664 return rev;
5665}
5666
5667
5668/*
5669 * call-seq:
5670 * str.reverse! -> str
5671 *
5672 * Reverses <i>str</i> in place.
5673 */
5674
5675static VALUE
5676rb_str_reverse_bang(VALUE str)
5677{
5678 if (RSTRING_LEN(str) > 1) {
5679 if (single_byte_optimizable(str)) {
5680 char *s, *e, c;
5681
5682 str_modify_keep_cr(str);
5683 s = RSTRING_PTR(str);
5684 e = RSTRING_END(str) - 1;
5685 while (s < e) {
5686 c = *s;
5687 *s++ = *e;
5688 *e-- = c;
5689 }
5690 }
5691 else {
5692 str_shared_replace(str, rb_str_reverse(str));
5693 }
5694 }
5695 else {
5696 str_modify_keep_cr(str);
5697 }
5698 return str;
5699}
5700
5701
5702/*
5703 * call-seq:
5704 * str.include? other_str -> true or false
5705 *
5706 * Returns <code>true</code> if <i>str</i> contains the given string or
5707 * character.
5708 *
5709 * "hello".include? "lo" #=> true
5710 * "hello".include? "ol" #=> false
5711 * "hello".include? ?h #=> true
5712 */
5713
5714static VALUE
5715rb_str_include(VALUE str, VALUE arg)
5716{
5717 long i;
5718
5720 i = rb_str_index(str, arg, 0);
5721
5722 if (i == -1) return Qfalse;
5723 return Qtrue;
5724}
5725
5726
5727/*
5728 * call-seq:
5729 * str.to_i(base=10) -> integer
5730 *
5731 * Returns the result of interpreting leading characters in <i>str</i> as an
5732 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5733 * end of a valid number are ignored. If there is not a valid number at the
5734 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
5735 * exception when <i>base</i> is valid.
5736 *
5737 * "12345".to_i #=> 12345
5738 * "99 red balloons".to_i #=> 99
5739 * "0a".to_i #=> 0
5740 * "0a".to_i(16) #=> 10
5741 * "hello".to_i #=> 0
5742 * "1100101".to_i(2) #=> 101
5743 * "1100101".to_i(8) #=> 294977
5744 * "1100101".to_i(10) #=> 1100101
5745 * "1100101".to_i(16) #=> 17826049
5746 */
5747
5748static VALUE
5749rb_str_to_i(int argc, VALUE *argv, VALUE str)
5750{
5751 int base = 10;
5752
5753 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
5754 rb_raise(rb_eArgError, "invalid radix %d", base);
5755 }
5756 return rb_str_to_inum(str, base, FALSE);
5757}
5758
5759
5760/*
5761 * call-seq:
5762 * str.to_f -> float
5763 *
5764 * Returns the result of interpreting leading characters in <i>str</i> as a
5765 * floating point number. Extraneous characters past the end of a valid number
5766 * are ignored. If there is not a valid number at the start of <i>str</i>,
5767 * <code>0.0</code> is returned. This method never raises an exception.
5768 *
5769 * "123.45e1".to_f #=> 1234.5
5770 * "45.67 degrees".to_f #=> 45.67
5771 * "thx1138".to_f #=> 0.0
5772 */
5773
5774static VALUE
5775rb_str_to_f(VALUE str)
5776{
5777 return DBL2NUM(rb_str_to_dbl(str, FALSE));
5778}
5779
5780
5781/*
5782 * call-seq:
5783 * str.to_s -> str
5784 * str.to_str -> str
5785 *
5786 * Returns +self+.
5787 *
5788 * If called on a subclass of String, converts the receiver to a String object.
5789 */
5790
5791static VALUE
5792rb_str_to_s(VALUE str)
5793{
5794 if (rb_obj_class(str) != rb_cString) {
5795 return str_duplicate(rb_cString, str);
5796 }
5797 return str;
5798}
5799
#if 0
/* Compiled out.  Encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
5811
5812#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5813
5814int
5815rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5816{
5817 char buf[CHAR_ESC_LEN + 1];
5818 int l;
5819
5820#if SIZEOF_INT > 4
5821 c &= 0xffffffff;
5822#endif
5823 if (unicode_p) {
5824 if (c < 0x7F && ISPRINT(c)) {
5825 snprintf(buf, CHAR_ESC_LEN, "%c", c);
5826 }
5827 else if (c < 0x10000) {
5828 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5829 }
5830 else {
5831 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5832 }
5833 }
5834 else {
5835 if (c < 0x100) {
5836 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5837 }
5838 else {
5839 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5840 }
5841 }
5842 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
5843 rb_str_buf_cat(result, buf, l);
5844 return l;
5845}
5846
/*
 * Returns the backslash escape for the control character c as a static
 * string ("\\n", "\\t", ...), or NULL when c has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
5864
5865VALUE
5867{
5868 int encidx = ENCODING_GET(str);
5869 rb_encoding *enc = rb_enc_from_index(encidx);
5870 const char *p = RSTRING_PTR(str);
5871 const char *pend = RSTRING_END(str);
5872 const char *prev = p;
5873 char buf[CHAR_ESC_LEN + 1];
5874 VALUE result = rb_str_buf_new(0);
5875 int unicode_p = rb_enc_unicode_p(enc);
5876 int asciicompat = rb_enc_asciicompat(enc);
5877
5878 while (p < pend) {
5879 unsigned int c;
5880 const char *cc;
5881 int n = rb_enc_precise_mbclen(p, pend, enc);
5882 if (!MBCLEN_CHARFOUND_P(n)) {
5883 if (p > prev) str_buf_cat(result, prev, p - prev);
5884 n = rb_enc_mbminlen(enc);
5885 if (pend < p + n)
5886 n = (int)(pend - p);
5887 while (n--) {
5888 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5889 str_buf_cat(result, buf, strlen(buf));
5890 prev = ++p;
5891 }
5892 continue;
5893 }
5895 c = rb_enc_mbc_to_codepoint(p, pend, enc);
5896 p += n;
5897 cc = ruby_escaped_char(c);
5898 if (cc) {
5899 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5900 str_buf_cat(result, cc, strlen(cc));
5901 prev = p;
5902 }
5903 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5904 }
5905 else {
5906 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5907 rb_str_buf_cat_escaped_char(result, c, unicode_p);
5908 prev = p;
5909 }
5910 }
5911 if (p > prev) str_buf_cat(result, prev, p - prev);
5913
5914 return result;
5915}
5916
5917/*
5918 * call-seq:
5919 * str.inspect -> string
5920 *
5921 * Returns a printable version of _str_, surrounded by quote marks,
5922 * with special characters escaped.
5923 *
5924 * str = "hello"
5925 * str[3] = "\b"
5926 * str.inspect #=> "\"hel\\bo\""
5927 */
5928
5929VALUE
5931{
5932 int encidx = ENCODING_GET(str);
5933 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
5934 const char *p, *pend, *prev;
5935 char buf[CHAR_ESC_LEN + 1];
5936 VALUE result = rb_str_buf_new(0);
5938 int unicode_p = rb_enc_unicode_p(enc);
5939 int asciicompat = rb_enc_asciicompat(enc);
5940
5941 if (resenc == NULL) resenc = rb_default_external_encoding();
5942 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
5943 rb_enc_associate(result, resenc);
5944 str_buf_cat2(result, "\"");
5945
5946 p = RSTRING_PTR(str); pend = RSTRING_END(str);
5947 prev = p;
5948 actenc = get_actual_encoding(encidx, str);
5949 if (actenc != enc) {
5950 enc = actenc;
5951 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
5952 }
5953 while (p < pend) {
5954 unsigned int c, cc;
5955 int n;
5956
5957 n = rb_enc_precise_mbclen(p, pend, enc);
5958 if (!MBCLEN_CHARFOUND_P(n)) {
5959 if (p > prev) str_buf_cat(result, prev, p - prev);
5960 n = rb_enc_mbminlen(enc);
5961 if (pend < p + n)
5962 n = (int)(pend - p);
5963 while (n--) {
5964 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5965 str_buf_cat(result, buf, strlen(buf));
5966 prev = ++p;
5967 }
5968 continue;
5969 }
5971 c = rb_enc_mbc_to_codepoint(p, pend, enc);
5972 p += n;
5973 if ((asciicompat || unicode_p) &&
5974 (c == '"'|| c == '\\' ||
5975 (c == '#' &&
5976 p < pend &&
5978 (cc = rb_enc_codepoint(p,pend,enc),
5979 (cc == '$' || cc == '@' || cc == '{'))))) {
5980 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5981 str_buf_cat2(result, "\\");
5982 if (asciicompat || enc == resenc) {
5983 prev = p - n;
5984 continue;
5985 }
5986 }
5987 switch (c) {
5988 case '\n': cc = 'n'; break;
5989 case '\r': cc = 'r'; break;
5990 case '\t': cc = 't'; break;
5991 case '\f': cc = 'f'; break;
5992 case '\013': cc = 'v'; break;
5993 case '\010': cc = 'b'; break;
5994 case '\007': cc = 'a'; break;
5995 case 033: cc = 'e'; break;
5996 default: cc = 0; break;
5997 }
5998 if (cc) {
5999 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6000 buf[0] = '\\';
6001 buf[1] = (char)cc;
6002 str_buf_cat(result, buf, 2);
6003 prev = p;
6004 continue;
6005 }
6006 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6007 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6008 continue;
6009 }
6010 else {
6011 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6012 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6013 prev = p;
6014 continue;
6015 }
6016 }
6017 if (p > prev) str_buf_cat(result, prev, p - prev);
6018 str_buf_cat2(result, "\"");
6019
6020 return result;
6021}
6022
6023#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6024
6025/*
6026 * call-seq:
6027 * str.dump -> new_str
6028 *
6029 * Returns a quoted version of the string with all non-printing characters
6030 * replaced by <code>\xHH</code> notation and all special characters escaped.
6031 *
6032 * This method can be used for round-trip: if the resulting +new_str+ is
6033 * eval'ed, it will produce the original string.
6034 *
6035 * "hello \n ''".dump #=> "\"hello \\n ''\""
6036 * "\f\x00\xff\\\"".dump #=> "\"\\f\\x00\\xFF\\\\\\\"\""
6037 *
6038 * See also String#undump.
6039 */
6040
6041VALUE
6043{
6044 int encidx = rb_enc_get_index(str);
6045 rb_encoding *enc = rb_enc_from_index(encidx);
6046 long len;
6047 const char *p, *pend;
6048 char *q, *qend;
6049 VALUE result;
6050 int u8 = (encidx == rb_utf8_encindex());
6051 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6052
6053 len = 2; /* "" */
6054 if (!rb_enc_asciicompat(enc)) {
6055 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6056 len += strlen(enc->name);
6057 }
6058
6059 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6060 while (p < pend) {
6061 int clen;
6062 unsigned char c = *p++;
6063
6064 switch (c) {
6065 case '"': case '\\':
6066 case '\n': case '\r':
6067 case '\t': case '\f':
6068 case '\013': case '\010': case '\007': case '\033':
6069 clen = 2;
6070 break;
6071
6072 case '#':
6073 clen = IS_EVSTR(p, pend) ? 2 : 1;
6074 break;
6075
6076 default:
6077 if (ISPRINT(c)) {
6078 clen = 1;
6079 }
6080 else {
6081 if (u8 && c > 0x7F) { /* \u notation */
6082 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6083 if (MBCLEN_CHARFOUND_P(n)) {
6084 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6085 if (cc <= 0xFFFF)
6086 clen = 6; /* \uXXXX */
6087 else if (cc <= 0xFFFFF)
6088 clen = 9; /* \u{XXXXX} */
6089 else
6090 clen = 10; /* \u{XXXXXX} */
6091 p += MBCLEN_CHARFOUND_LEN(n)-1;
6092 break;
6093 }
6094 }
6095 clen = 4; /* \xNN */
6096 }
6097 break;
6098 }
6099
6100 if (clen > LONG_MAX - len) {
6101 rb_raise(rb_eRuntimeError, "string size too big");
6102 }
6103 len += clen;
6104 }
6105
6106 result = rb_str_new_with_class(str, 0, len);
6107 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6108 q = RSTRING_PTR(result); qend = q + len + 1;
6109
6110 *q++ = '"';
6111 while (p < pend) {
6112 unsigned char c = *p++;
6113
6114 if (c == '"' || c == '\\') {
6115 *q++ = '\\';
6116 *q++ = c;
6117 }
6118 else if (c == '#') {
6119 if (IS_EVSTR(p, pend)) *q++ = '\\';
6120 *q++ = '#';
6121 }
6122 else if (c == '\n') {
6123 *q++ = '\\';
6124 *q++ = 'n';
6125 }
6126 else if (c == '\r') {
6127 *q++ = '\\';
6128 *q++ = 'r';
6129 }
6130 else if (c == '\t') {
6131 *q++ = '\\';
6132 *q++ = 't';
6133 }
6134 else if (c == '\f') {
6135 *q++ = '\\';
6136 *q++ = 'f';
6137 }
6138 else if (c == '\013') {
6139 *q++ = '\\';
6140 *q++ = 'v';
6141 }
6142 else if (c == '\010') {
6143 *q++ = '\\';
6144 *q++ = 'b';
6145 }
6146 else if (c == '\007') {
6147 *q++ = '\\';
6148 *q++ = 'a';
6149 }
6150 else if (c == '\033') {
6151 *q++ = '\\';
6152 *q++ = 'e';
6153 }
6154 else if (ISPRINT(c)) {
6155 *q++ = c;
6156 }
6157 else {
6158 *q++ = '\\';
6159 if (u8) {
6160 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6161 if (MBCLEN_CHARFOUND_P(n)) {
6162 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6163 p += n;
6164 if (cc <= 0xFFFF)
6165 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6166 else
6167 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6168 q += strlen(q);
6169 continue;
6170 }
6171 }
6172 snprintf(q, qend-q, "x%02X", c);
6173 q += 3;
6174 }
6175 }
6176 *q++ = '"';
6177 *q = '\0';
6178 if (!rb_enc_asciicompat(enc)) {
6179 snprintf(q, qend-q, nonascii_suffix, enc->name);
6180 encidx = rb_ascii8bit_encindex();
6181 }
6182 /* result from dump is ASCII */
6183 rb_enc_associate_index(result, encidx);
6185 return result;
6186}
6187
/*
 * Maps the letter that follows a backslash in a dumped string ('n', 'r',
 * 't', 'f', 'v', 'b', 'a', 'e') to the control character it denotes.
 * Returns -1 for any other input; the visible caller only passes the
 * eight letters listed here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* Return explicitly so control never falls off the end of a
         * non-void function. */
        return -1;
    }
}
6212
6213static void
6214undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6215{
6216 const char *s = *ss;
6217 unsigned int c;
6218 int codelen;
6219 size_t hexlen;
6220 unsigned char buf[6];
6221 static rb_encoding *enc_utf8 = NULL;
6222
6223 switch (*s) {
6224 case '\\':
6225 case '"':
6226 case '#':
6227 rb_str_cat(undumped, s, 1); /* cat itself */
6228 s++;
6229 break;
6230 case 'n':
6231 case 'r':
6232 case 't':
6233 case 'f':
6234 case 'v':
6235 case 'b':
6236 case 'a':
6237 case 'e':
6238 *buf = unescape_ascii(*s);
6239 rb_str_cat(undumped, (char *)buf, 1);
6240 s++;
6241 break;
6242 case 'u':
6243 if (*binary) {
6244 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6245 }
6246 *utf8 = true;
6247 if (++s >= s_end) {
6248 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6249 }
6250 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6251 if (*penc != enc_utf8) {
6252 *penc = enc_utf8;
6253 rb_enc_associate(undumped, enc_utf8);
6254 }
6255 if (*s == '{') { /* handle \u{...} form */
6256 s++;
6257 for (;;) {
6258 if (s >= s_end) {
6259 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6260 }
6261 if (*s == '}') {
6262 s++;
6263 break;
6264 }
6265 if (ISSPACE(*s)) {
6266 s++;
6267 continue;
6268 }
6269 c = scan_hex(s, s_end-s, &hexlen);
6270 if (hexlen == 0 || hexlen > 6) {
6271 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6272 }
6273 if (c > 0x10ffff) {
6274 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6275 }
6276 if (0xd800 <= c && c <= 0xdfff) {
6277 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6278 }
6279 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6280 rb_str_cat(undumped, (char *)buf, codelen);
6281 s += hexlen;
6282 }
6283 }
6284 else { /* handle \uXXXX form */
6285 c = scan_hex(s, 4, &hexlen);
6286 if (hexlen != 4) {
6287 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6288 }
6289 if (0xd800 <= c && c <= 0xdfff) {
6290 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6291 }
6292 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6293 rb_str_cat(undumped, (char *)buf, codelen);
6294 s += hexlen;
6295 }
6296 break;
6297 case 'x':
6298 if (*utf8) {
6299 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6300 }
6301 *binary = true;
6302 if (++s >= s_end) {
6303 rb_raise(rb_eRuntimeError, "invalid hex escape");
6304 }
6305 *buf = scan_hex(s, 2, &hexlen);
6306 if (hexlen != 2) {
6307 rb_raise(rb_eRuntimeError, "invalid hex escape");
6308 }
6309 rb_str_cat(undumped, (char *)buf, 1);
6310 s += hexlen;
6311 break;
6312 default:
6313 rb_str_cat(undumped, s-1, 2);
6314 s++;
6315 }
6316
6317 *ss = s;
6318}
6319
6320static VALUE rb_str_is_ascii_only_p(VALUE str);
6321
6322/*
6323 * call-seq:
6324 * str.undump -> new_str
6325 *
6326 * Returns an unescaped version of the string.
6327 * This does the inverse of String#dump.
6328 *
6329 * "\"hello \\n ''\"".undump #=> "hello \n ''"
6330 */
6331
6332static VALUE
6333str_undump(VALUE str)
6334{
6335 const char *s = RSTRING_PTR(str);
6336 const char *s_end = RSTRING_END(str);
6337 rb_encoding *enc = rb_enc_get(str);
6338 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6339 bool utf8 = false;
6340 bool binary = false;
6341 int w;
6342
6344 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6345 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6346 }
6347 if (!str_null_check(str, &w)) {
6348 rb_raise(rb_eRuntimeError, "string contains null byte");
6349 }
6350 if (RSTRING_LEN(str) < 2) goto invalid_format;
6351 if (*s != '"') goto invalid_format;
6352
6353 /* strip '"' at the start */
6354 s++;
6355
6356 for (;;) {
6357 if (s >= s_end) {
6358 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6359 }
6360
6361 if (*s == '"') {
6362 /* epilogue */
6363 s++;
6364 if (s == s_end) {
6365 /* ascii compatible dumped string */
6366 break;
6367 }
6368 else {
6369 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6370 static const char dup_suffix[] = ".dup";
6371 const char *encname;
6372 int encidx;
6374
6375 /* check separately for strings dumped by older versions */
6376 size = sizeof(dup_suffix) - 1;
6377 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6378
6379 size = sizeof(force_encoding_suffix) - 1;
6380 if (s_end - s <= size) goto invalid_format;
6381 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6382 s += size;
6383
6384 if (utf8) {
6385 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6386 }
6387
6388 encname = s;
6389 s = memchr(s, '"', s_end-s);
6390 size = s - encname;
6391 if (!s) goto invalid_format;
6392 if (s_end - s != 2) goto invalid_format;
6393 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6394
6395 encidx = rb_enc_find_index2(encname, (long)size);
6396 if (encidx < 0) {
6397 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6398 }
6399 rb_enc_associate_index(undumped, encidx);
6400 }
6401 break;
6402 }
6403
6404 if (*s == '\\') {
6405 s++;
6406 if (s >= s_end) {
6407 rb_raise(rb_eRuntimeError, "invalid escape");
6408 }
6409 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6410 }
6411 else {
6412 rb_str_cat(undumped, s++, 1);
6413 }
6414 }
6415
6416 return undumped;
6417invalid_format:
6418 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6419}
6420
6421static void
6422rb_str_check_dummy_enc(rb_encoding *enc)
6423{
6424 if (rb_enc_dummy_p(enc)) {
6425 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6426 rb_enc_name(enc));
6427 }
6428}
6429
6430static rb_encoding *
6431str_true_enc(VALUE str)
6432{
6433 rb_encoding *enc = STR_ENC_GET(str);
6434 rb_str_check_dummy_enc(enc);
6435 return enc;
6436}
6437
6438static OnigCaseFoldType
6439check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6440{
6441 if (argc==0)
6442 return flags;
6443 if (argc>2)
6444 rb_raise(rb_eArgError, "too many options");
6445 if (argv[0]==sym_turkic) {
6447 if (argc==2) {
6448 if (argv[1]==sym_lithuanian)
6450 else
6451 rb_raise(rb_eArgError, "invalid second option");
6452 }
6453 }
6454 else if (argv[0]==sym_lithuanian) {
6456 if (argc==2) {
6457 if (argv[1]==sym_turkic)
6459 else
6460 rb_raise(rb_eArgError, "invalid second option");
6461 }
6462 }
6463 else if (argc>1)
6464 rb_raise(rb_eArgError, "too many options");
6465 else if (argv[0]==sym_ascii)
6466 flags |= ONIGENC_CASE_ASCII_ONLY;
6467 else if (argv[0]==sym_fold) {
6470 else
6471 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6472 }
6473 else
6474 rb_raise(rb_eArgError, "invalid option");
6475 return flags;
6476}
6477
6478static inline bool
6479case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6480{
6481 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6482 return true;
6484}
6485
/*
 * Extra bytes added to every case-mapping output buffer so that the length
 * increase of a single character can always be absorbed.
 * NOTE(review): the original comment said 16 should be enough, while the
 * value is 20 -- confirm which figure is intended.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Set CASEMAP_DEBUG to non-zero to print buffer tuning information to stderr. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
6491
6492struct mapping_buffer;
6493typedef struct mapping_buffer {
6494 size_t capa;
6495 size_t used;
6499
6500static void
6501mapping_buffer_free(void *p)
6502{
6503 mapping_buffer *previous_buffer;
6504 mapping_buffer *current_buffer = p;
6505 while (current_buffer) {
6506 previous_buffer = current_buffer;
6507 current_buffer = current_buffer->next;
6508 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
6509 }
6510}
6511
6512static const rb_data_type_t mapping_buffer_type = {
6513 "mapping_buffer",
6514 {0, mapping_buffer_free,}
6515};
6516
6517static VALUE
6518rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6519{
6520 VALUE target;
6521
6522 const OnigUChar *source_current, *source_end;
6523 int target_length = 0;
6524 VALUE buffer_anchor;
6525 mapping_buffer *current_buffer = 0;
6526 mapping_buffer **pre_buffer;
6527 size_t buffer_count = 0;
6528 int buffer_length_or_invalid;
6529
6530 if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
6531
6532 source_current = (OnigUChar*)RSTRING_PTR(source);
6533 source_end = (OnigUChar*)RSTRING_END(source);
6534
6535 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
6536 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
6537 while (source_current < source_end) {
6538 /* increase multiplier using buffer count to converge quickly */
6539 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6540 if (CASEMAP_DEBUG) {
6541 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6542 }
6543 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
6544 *pre_buffer = current_buffer;
6545 pre_buffer = &current_buffer->next;
6546 current_buffer->next = NULL;
6547 current_buffer->capa = capa;
6548 buffer_length_or_invalid = enc->case_map(flags,
6549 (const OnigUChar**)&source_current, source_end,
6550 current_buffer->space,
6551 current_buffer->space+current_buffer->capa,
6552 enc);
6553 if (buffer_length_or_invalid < 0) {
6554 current_buffer = DATA_PTR(buffer_anchor);
6555 DATA_PTR(buffer_anchor) = 0;
6556 mapping_buffer_free(current_buffer);
6557 rb_raise(rb_eArgError, "input string invalid");
6558 }
6559 target_length += current_buffer->used = buffer_length_or_invalid;
6560 }
6561 if (CASEMAP_DEBUG) {
6562 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6563 }
6564
6565 if (buffer_count==1) {
6566 target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
6567 }
6568 else {
6569 char *target_current;
6570
6571 target = rb_str_new_with_class(source, 0, target_length);
6572 target_current = RSTRING_PTR(target);
6573 current_buffer = DATA_PTR(buffer_anchor);
6574 while (current_buffer) {
6575 memcpy(target_current, current_buffer->space, current_buffer->used);
6576 target_current += current_buffer->used;
6577 current_buffer = current_buffer->next;
6578 }
6579 }
6580 current_buffer = DATA_PTR(buffer_anchor);
6581 DATA_PTR(buffer_anchor) = 0;
6582 mapping_buffer_free(current_buffer);
6583
6584 /* TODO: check about string terminator character */
6585 str_enc_copy(target, source);
6586 /*ENC_CODERANGE_SET(mapped, cr);*/
6587
6588 return target;
6589}
6590
6591static VALUE
6592rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
6593{
6594 const OnigUChar *source_current, *source_end;
6595 OnigUChar *target_current, *target_end;
6596 long old_length = RSTRING_LEN(source);
6597 int length_or_invalid;
6598
6599 if (old_length == 0) return Qnil;
6600
6601 source_current = (OnigUChar*)RSTRING_PTR(source);
6602 source_end = (OnigUChar*)RSTRING_END(source);
6603 if (source == target) {
6604 target_current = (OnigUChar*)source_current;
6605 target_end = (OnigUChar*)source_end;
6606 }
6607 else {
6608 target_current = (OnigUChar*)RSTRING_PTR(target);
6609 target_end = (OnigUChar*)RSTRING_END(target);
6610 }
6611
6612 length_or_invalid = onigenc_ascii_only_case_map(flags,
6613 &source_current, source_end,
6614 target_current, target_end, enc);
6615 if (length_or_invalid < 0)
6616 rb_raise(rb_eArgError, "input string invalid");
6617 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6618 fprintf(stderr, "problem with rb_str_ascii_casemap"
6619 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6620 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6621 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6622 }
6623
6624 str_enc_copy(target, source);
6625
6626 return target;
6627}
6628
6629static bool
6630upcase_single(VALUE str)
6631{
6632 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6633 bool modified = false;
6634
6635 while (s < send) {
6636 unsigned int c = *(unsigned char*)s;
6637
6638 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6639 *s = 'A' + (c - 'a');
6640 modified = true;
6641 }
6642 s++;
6643 }
6644 return modified;
6645}
6646
6647/*
6648 * call-seq:
6649 * str.upcase! -> str or nil
6650 * str.upcase!([options]) -> str or nil
6651 *
6652 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6653 * were made.
6654 *
6655 * See String#downcase for meaning of +options+ and use with different encodings.
6656 */
6657
6658static VALUE
6659rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6660{
6661 rb_encoding *enc;
6663
6664 flags = check_case_options(argc, argv, flags);
6665 str_modify_keep_cr(str);
6666 enc = str_true_enc(str);
6667 if (case_option_single_p(flags, enc, str)) {
6668 if (upcase_single(str))
6669 flags |= ONIGENC_CASE_MODIFIED;
6670 }
6671 else if (flags&ONIGENC_CASE_ASCII_ONLY)
6672 rb_str_ascii_casemap(str, str, &flags, enc);
6673 else
6674 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6675
6676 if (ONIGENC_CASE_MODIFIED&flags) return str;
6677 return Qnil;
6678}
6679
6680
6681/*
6682 * call-seq:
6683 * str.upcase -> new_str
6684 * str.upcase([options]) -> new_str
6685 *
6686 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6687 * uppercase counterparts.
6688 *
6689 * See String#downcase for meaning of +options+ and use with different encodings.
6690 *
6691 * "hEllO".upcase #=> "HELLO"
6692 */
6693
6694static VALUE
6695rb_str_upcase(int argc, VALUE *argv, VALUE str)
6696{
6697 rb_encoding *enc;
6699 VALUE ret;
6700
6701 flags = check_case_options(argc, argv, flags);
6702 enc = str_true_enc(str);
6703 if (case_option_single_p(flags, enc, str)) {
6705 str_enc_copy(ret, str);
6706 upcase_single(ret);
6707 }
6708 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
6710 rb_str_ascii_casemap(str, ret, &flags, enc);
6711 }
6712 else {
6713 ret = rb_str_casemap(str, &flags, enc);
6714 }
6715
6716 return ret;
6717}
6718
6719static bool
6720downcase_single(VALUE str)
6721{
6722 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6723 bool modified = false;
6724
6725 while (s < send) {
6726 unsigned int c = *(unsigned char*)s;
6727
6728 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6729 *s = 'a' + (c - 'A');
6730 modified = true;
6731 }
6732 s++;
6733 }
6734
6735 return modified;
6736}
6737
6738/*
6739 * call-seq:
6740 * str.downcase! -> str or nil
6741 * str.downcase!([options]) -> str or nil
6742 *
6743 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6744 * changes were made.
6745 *
6746 * See String#downcase for meaning of +options+ and use with different encodings.
6747 */
6748
6749static VALUE
6750rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
6751{
6752 rb_encoding *enc;
6754
6755 flags = check_case_options(argc, argv, flags);
6756 str_modify_keep_cr(str);
6757 enc = str_true_enc(str);
6758 if (case_option_single_p(flags, enc, str)) {
6759 if (downcase_single(str))
6760 flags |= ONIGENC_CASE_MODIFIED;
6761 }
6762 else if (flags&ONIGENC_CASE_ASCII_ONLY)
6763 rb_str_ascii_casemap(str, str, &flags, enc);
6764 else
6765 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6766
6767 if (ONIGENC_CASE_MODIFIED&flags) return str;
6768 return Qnil;
6769}
6770
6771
6772/*
6773 * call-seq:
6774 * str.downcase -> new_str
6775 * str.downcase([options]) -> new_str
6776 *
6777 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
6778 * lowercase counterparts. Which letters exactly are replaced, and by which
6779 * other letters, depends on the presence or absence of options, and on the
6780 * +encoding+ of the string.
6781 *
6782 * The meaning of the +options+ is as follows:
6783 *
6784 * No option ::
6785 * Full Unicode case mapping, suitable for most languages
6786 * (see :turkic and :lithuanian options below for exceptions).
6787 * Context-dependent case mapping as described in Table 3-14 of the
6788 * Unicode standard is currently not supported.
6789 * :ascii ::
6790 * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6791 * ``a'' to ``z'', are affected.
6792 * This option cannot be combined with any other option.
6793 * :turkic ::
6794 * Full Unicode case mapping, adapted for Turkic languages
6795 * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
6796 * lower case dotless i, and so on.
6797 * :lithuanian ::
6798 * Currently, just full Unicode case mapping. In the future, full Unicode
6799 * case mapping adapted for Lithuanian (keeping the dot on the lower case
6800 * i even if there is an accent on top).
6801 * :fold ::
6802 * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6803 * which is more far-reaching than Unicode case mapping.
6804 * This option currently cannot be combined with any other option
6805 * (i.e. there is currently no variant for turkic languages).
6806 *
6807 * Please note that several assumptions that are valid for ASCII-only case
6808 * conversions do not hold for more general case conversions. For example,
6809 * the length of the result may not be the same as the length of the input
6810 * (neither in characters nor in bytes), some roundtrip assumptions
6811 * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6812 * normalization (i.e. String#unicode_normalize) is not necessarily maintained
6813 * by case mapping operations.
6814 *
6815 * Non-ASCII case mapping/folding is currently supported for UTF-8,
6816 * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6817 * This support will be extended to other encodings.
6818 *
6819 * "hEllO".downcase #=> "hello"
6820 */
6821
6822static VALUE
6823rb_str_downcase(int argc, VALUE *argv, VALUE str)
6824{
6825 rb_encoding *enc;
6827 VALUE ret;
6828
6829 flags = check_case_options(argc, argv, flags);
6830 enc = str_true_enc(str);
6831 if (case_option_single_p(flags, enc, str)) {
6833 str_enc_copy(ret, str);
6834 downcase_single(ret);
6835 }
6836 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
6838 rb_str_ascii_casemap(str, ret, &flags, enc);
6839 }
6840 else {
6841 ret = rb_str_casemap(str, &flags, enc);
6842 }
6843
6844 return ret;
6845}
6846
6847
6848/*
6849 * call-seq:
6850 * str.capitalize! -> str or nil
6851 * str.capitalize!([options]) -> str or nil
6852 *
6853 * Modifies <i>str</i> by converting the first character to uppercase and the
6854 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
6855 * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
6856 * the result is the same as for String#downcase, to avoid mixed case.
6857 *
6858 * See String#downcase for meaning of +options+ and use with different encodings.
6859 *
6860 * a = "hello"
6861 * a.capitalize! #=> "Hello"
6862 * a #=> "Hello"
6863 * a.capitalize! #=> nil
6864 */
6865
6866static VALUE
6867rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
6868{
6869 rb_encoding *enc;
6871
6872 flags = check_case_options(argc, argv, flags);
6873 str_modify_keep_cr(str);
6874 enc = str_true_enc(str);
6875 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6876 if (flags&ONIGENC_CASE_ASCII_ONLY)
6877 rb_str_ascii_casemap(str, str, &flags, enc);
6878 else
6879 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6880
6881 if (ONIGENC_CASE_MODIFIED&flags) return str;
6882 return Qnil;
6883}
6884
6885
6886/*
6887 * call-seq:
6888 * str.capitalize -> new_str
6889 * str.capitalize([options]) -> new_str
6890 *
6891 * Returns a copy of <i>str</i> with the first character converted to uppercase
6892 * and the remainder to lowercase.
6893 *
6894 * See String#downcase for meaning of +options+ and use with different encodings.
6895 *
6896 * "hello".capitalize #=> "Hello"
6897 * "HELLO".capitalize #=> "Hello"
6898 * "123ABC".capitalize #=> "123abc"
6899 */
6900
6901static VALUE
6902rb_str_capitalize(int argc, VALUE *argv, VALUE str)
6903{
6904 rb_encoding *enc;
6906 VALUE ret;
6907
6908 flags = check_case_options(argc, argv, flags);
6909 enc = str_true_enc(str);
6910 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
6911 if (flags&ONIGENC_CASE_ASCII_ONLY) {
6913 rb_str_ascii_casemap(str, ret, &flags, enc);
6914 }
6915 else {
6916 ret = rb_str_casemap(str, &flags, enc);
6917 }
6918 return ret;
6919}
6920
6921
6922/*
6923 * call-seq:
6924 * str.swapcase! -> str or nil
6925 * str.swapcase!([options]) -> str or nil
6926 *
6927 * Equivalent to String#swapcase, but modifies the receiver in place,
6928 * returning <i>str</i>, or <code>nil</code> if no changes were made.
6929 *
6930 * See String#downcase for meaning of +options+ and use with
6931 * different encodings.
6932 */
6933
6934static VALUE
6935rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
6936{
6937 rb_encoding *enc;
6939
6940 flags = check_case_options(argc, argv, flags);
6941 str_modify_keep_cr(str);
6942 enc = str_true_enc(str);
6943 if (flags&ONIGENC_CASE_ASCII_ONLY)
6944 rb_str_ascii_casemap(str, str, &flags, enc);
6945 else
6946 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6947
6948 if (ONIGENC_CASE_MODIFIED&flags) return str;
6949 return Qnil;
6950}
6951
6952
6953/*
6954 * call-seq:
6955 * str.swapcase -> new_str
6956 * str.swapcase([options]) -> new_str
6957 *
6958 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6959 * to lowercase and lowercase characters converted to uppercase.
6960 *
6961 * See String#downcase for meaning of +options+ and use with different encodings.
6962 *
6963 * "Hello".swapcase #=> "hELLO"
6964 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
6965 */
6966
6967static VALUE
6968rb_str_swapcase(int argc, VALUE *argv, VALUE str)
6969{
6970 rb_encoding *enc;
6972 VALUE ret;
6973
6974 flags = check_case_options(argc, argv, flags);
6975 enc = str_true_enc(str);
6976 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
6977 if (flags&ONIGENC_CASE_ASCII_ONLY) {
6979 rb_str_ascii_casemap(str, ret, &flags, enc);
6980 }
6981 else {
6982 ret = rb_str_casemap(str, &flags, enc);
6983 }
6984 return ret;
6985}
6986
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;                /* non-zero while the code points of a range are being generated */
    unsigned int now, max;  /* current code point, and last code point of the active range */
    char *p, *pend;         /* read position in the specification, and its end */
};
6994
6995static unsigned int
6996trnext(struct tr *t, rb_encoding *enc)
6997{
6998 int n;
6999
7000 for (;;) {
7001 if (!t->gen) {
7002nextpart:
7003 if (t->p == t->pend) return -1;
7004 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7005 t->p += n;
7006 }
7007 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7008 t->p += n;
7009 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7010 t->p += n;
7011 if (t->p < t->pend) {
7012 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7013 t->p += n;
7014 if (t->now > c) {
7015 if (t->now < 0x80 && c < 0x80) {
7017 "invalid range \"%c-%c\" in string transliteration",
7018 t->now, c);
7019 }
7020 else {
7021 rb_raise(rb_eArgError, "invalid range in string transliteration");
7022 }
7023 continue; /* not reached */
7024 }
7025 t->gen = 1;
7026 t->max = c;
7027 }
7028 }
7029 return t->now;
7030 }
7031 else {
7032 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7033 if (t->now == t->max) {
7034 t->gen = 0;
7035 goto nextpart;
7036 }
7037 }
7038 if (t->now < t->max) {
7039 return t->now;
7040 }
7041 else {
7042 t->gen = 0;
7043 return t->max;
7044 }
7045 }
7046 }
7047}
7048
7049static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7050
7051static VALUE
7052tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7053{
7054 const unsigned int errc = -1;
7055 unsigned int trans[256];
7056 rb_encoding *enc, *e1, *e2;
7057 struct tr trsrc, trrepl;
7058 int cflag = 0;
7059 unsigned int c, c0, last = 0;
7060 int modify = 0, i, l;
7061 unsigned char *s, *send;
7062 VALUE hash = 0;
7063 int singlebyte = single_byte_optimizable(str);
7064 int termlen;
7065 int cr;
7066
7067#define CHECK_IF_ASCII(c) \
7068 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7069 (cr = ENC_CODERANGE_VALID) : 0)
7070
7072 StringValue(repl);
7073 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7074 if (RSTRING_LEN(repl) == 0) {
7075 return rb_str_delete_bang(1, &src, str);
7076 }
7077
7078 cr = ENC_CODERANGE(str);
7079 e1 = rb_enc_check(str, src);
7080 e2 = rb_enc_check(str, repl);
7081 if (e1 == e2) {
7082 enc = e1;
7083 }
7084 else {
7085 enc = rb_enc_check(src, repl);
7086 }
7087 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7088 if (RSTRING_LEN(src) > 1 &&
7089 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7090 trsrc.p + l < trsrc.pend) {
7091 cflag = 1;
7092 trsrc.p += l;
7093 }
7094 trrepl.p = RSTRING_PTR(repl);
7095 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7096 trsrc.gen = trrepl.gen = 0;
7097 trsrc.now = trrepl.now = 0;
7098 trsrc.max = trrepl.max = 0;
7099
7100 if (cflag) {
7101 for (i=0; i<256; i++) {
7102 trans[i] = 1;
7103 }
7104 while ((c = trnext(&trsrc, enc)) != errc) {
7105 if (c < 256) {
7106 trans[c] = errc;
7107 }
7108 else {
7109 if (!hash) hash = rb_hash_new();
7110 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7111 }
7112 }
7113 while ((c = trnext(&trrepl, enc)) != errc)
7114 /* retrieve last replacer */;
7115 last = trrepl.now;
7116 for (i=0; i<256; i++) {
7117 if (trans[i] != errc) {
7118 trans[i] = last;
7119 }
7120 }
7121 }
7122 else {
7123 unsigned int r;
7124
7125 for (i=0; i<256; i++) {
7126 trans[i] = errc;
7127 }
7128 while ((c = trnext(&trsrc, enc)) != errc) {
7129 r = trnext(&trrepl, enc);
7130 if (r == errc) r = trrepl.now;
7131 if (c < 256) {
7132 trans[c] = r;
7133 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7134 }
7135 else {
7136 if (!hash) hash = rb_hash_new();
7137 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7138 }
7139 }
7140 }
7141
7142 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7143 cr = ENC_CODERANGE_7BIT;
7144 str_modify_keep_cr(str);
7145 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7146 termlen = rb_enc_mbminlen(enc);
7147 if (sflag) {
7148 int clen, tlen;
7149 long offset, max = RSTRING_LEN(str);
7150 unsigned int save = -1;
7151 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7152
7153 while (s < send) {
7154 int may_modify = 0;
7155
7156 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7157 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7158
7159 s += clen;
7160 if (c < 256) {
7161 c = trans[c];
7162 }
7163 else if (hash) {
7164 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7165 if (NIL_P(tmp)) {
7166 if (cflag) c = last;
7167 else c = errc;
7168 }
7169 else if (cflag) c = errc;
7170 else c = NUM2INT(tmp);
7171 }
7172 else {
7173 c = errc;
7174 }
7175 if (c != (unsigned int)-1) {
7176 if (save == c) {
7177 CHECK_IF_ASCII(c);
7178 continue;
7179 }
7180 save = c;
7181 tlen = rb_enc_codelen(c, enc);
7182 modify = 1;
7183 }
7184 else {
7185 save = -1;
7186 c = c0;
7187 if (enc != e1) may_modify = 1;
7188 }
7189 if ((offset = t - buf) + tlen > max) {
7190 size_t MAYBE_UNUSED(old) = max + termlen;
7191 max = offset + tlen + (send - s);
7192 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7193 t = buf + offset;
7194 }
7195 rb_enc_mbcput(c, t, enc);
7196 if (may_modify && memcmp(s, t, tlen) != 0) {
7197 modify = 1;
7198 }
7199 CHECK_IF_ASCII(c);
7200 t += tlen;
7201 }
7202 if (!STR_EMBED_P(str)) {
7204 }
7205 TERM_FILL((char *)t, termlen);
7206 RSTRING(str)->as.heap.ptr = (char *)buf;
7207 RSTRING(str)->as.heap.len = t - buf;
7209 RSTRING(str)->as.heap.aux.capa = max;
7210 }
7211 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7212 while (s < send) {
7213 c = (unsigned char)*s;
7214 if (trans[c] != errc) {
7215 if (!cflag) {
7216 c = trans[c];
7217 *s = c;
7218 modify = 1;
7219 }
7220 else {
7221 *s = last;
7222 modify = 1;
7223 }
7224 }
7225 CHECK_IF_ASCII(c);
7226 s++;
7227 }
7228 }
7229 else {
7230 int clen, tlen;
7231 long offset, max = (long)((send - s) * 1.2);
7232 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7233
7234 while (s < send) {
7235 int may_modify = 0;
7236 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7237 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7238
7239 if (c < 256) {
7240 c = trans[c];
7241 }
7242 else if (hash) {
7243 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7244 if (NIL_P(tmp)) {
7245 if (cflag) c = last;
7246 else c = errc;
7247 }
7248 else if (cflag) c = errc;
7249 else c = NUM2INT(tmp);
7250 }
7251 else {
7252 c = cflag ? last : errc;
7253 }
7254 if (c != errc) {
7255 tlen = rb_enc_codelen(c, enc);
7256 modify = 1;
7257 }
7258 else {
7259 c = c0;
7260 if (enc != e1) may_modify = 1;
7261 }
7262 if ((offset = t - buf) + tlen > max) {
7263 size_t MAYBE_UNUSED(old) = max + termlen;
7264 max = offset + tlen + (long)((send - s) * 1.2);
7265 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7266 t = buf + offset;
7267 }
7268 if (s != t) {
7269 rb_enc_mbcput(c, t, enc);
7270 if (may_modify && memcmp(s, t, tlen) != 0) {
7271 modify = 1;
7272 }
7273 }
7274 CHECK_IF_ASCII(c);
7275 s += clen;
7276 t += tlen;
7277 }
7278 if (!STR_EMBED_P(str)) {
7280 }
7281 TERM_FILL((char *)t, termlen);
7282 RSTRING(str)->as.heap.ptr = (char *)buf;
7283 RSTRING(str)->as.heap.len = t - buf;
7285 RSTRING(str)->as.heap.aux.capa = max;
7286 }
7287
7288 if (modify) {
7289 if (cr != ENC_CODERANGE_BROKEN)
7291 rb_enc_associate(str, enc);
7292 return str;
7293 }
7294 return Qnil;
7295}
7296
7297
7298/*
7299 * call-seq:
7300 * str.tr!(from_str, to_str) -> str or nil
7301 *
7302 * Translates <i>str</i> in place, using the same rules as
7303 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7304 * were made.
7305 */
7306
7307static VALUE
7308rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7309{
7310 return tr_trans(str, src, repl, 0);
7311}
7312
7313
7314/*
7315 * call-seq:
7316 * str.tr(from_str, to_str) => new_str
7317 *
7318 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7319 * corresponding characters in +to_str+. If +to_str+ is shorter than
7320 * +from_str+, it is padded with its last character in order to maintain the
7321 * correspondence.
7322 *
7323 * "hello".tr('el', 'ip') #=> "hippo"
7324 * "hello".tr('aeiou', '*') #=> "h*ll*"
7325 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7326 *
7327 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7328 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7329 * all characters except those listed.
7330 *
7331 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7332 * "hello".tr('^aeiou', '*') #=> "*e**o"
7333 *
7334 * The backslash character <code>\</code> can be used to escape
7335 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7336 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7337 *
7338 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7339 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7340 *
7341 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7342 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7343 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7344 *
7345 * "X['\\b']".tr("X\\", "") #=> "['b']"
7346 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7347 */
7348
7349static VALUE
7350rb_str_tr(VALUE str, VALUE src, VALUE repl)
7351{
7352 str = rb_str_dup(str);
7353 tr_trans(str, src, repl, 0);
7354 return str;
7355}
7356
7357#define TR_TABLE_SIZE 257
7358static void
7359tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7360 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7361{
7362 const unsigned int errc = -1;
7363 char buf[256];
7364 struct tr tr;
7365 unsigned int c;
7366 VALUE table = 0, ptable = 0;
7367 int i, l, cflag = 0;
7368
7370 tr.gen = tr.now = tr.max = 0;
7371
7372 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7373 cflag = 1;
7374 tr.p += l;
7375 }
7376 if (first) {
7377 for (i=0; i<256; i++) {
7378 stable[i] = 1;
7379 }
7380 stable[256] = cflag;
7381 }
7382 else if (stable[256] && !cflag) {
7383 stable[256] = 0;
7384 }
7385 for (i=0; i<256; i++) {
7386 buf[i] = cflag;
7387 }
7388
7389 while ((c = trnext(&tr, enc)) != errc) {
7390 if (c < 256) {
7391 buf[c & 0xff] = !cflag;
7392 }
7393 else {
7394 VALUE key = UINT2NUM(c);
7395
7396 if (!table && (first || *tablep || stable[256])) {
7397 if (cflag) {
7398 ptable = *ctablep;
7399 table = ptable ? ptable : rb_hash_new();
7400 *ctablep = table;
7401 }
7402 else {
7403 table = rb_hash_new();
7404 ptable = *tablep;
7405 *tablep = table;
7406 }
7407 }
7408 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7409 rb_hash_aset(table, key, Qtrue);
7410 }
7411 }
7412 }
7413 for (i=0; i<256; i++) {
7414 stable[i] = stable[i] && buf[i];
7415 }
7416 if (!table && !cflag) {
7417 *tablep = 0;
7418 }
7419}
7420
7421
7422static int
7423tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7424{
7425 if (c < 256) {
7426 return table[c] != 0;
7427 }
7428 else {
7429 VALUE v = UINT2NUM(c);
7430
7431 if (del) {
7432 if (!NIL_P(rb_hash_lookup(del, v)) &&
7433 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7434 return TRUE;
7435 }
7436 }
7437 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7438 return FALSE;
7439 }
7440 return table[256] ? TRUE : FALSE;
7441 }
7442}
7443
7444/*
7445 * call-seq:
7446 * str.delete!([other_str]+) -> str or nil
7447 *
7448 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7449 * <code>nil</code> if <i>str</i> was not modified.
7450 */
7451
7452static VALUE
7453rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7454{
7455 char squeez[TR_TABLE_SIZE];
7456 rb_encoding *enc = 0;
7457 char *s, *send, *t;
7458 VALUE del = 0, nodel = 0;
7459 int modify = 0;
7460 int i, ascompat, cr;
7461
7462 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7464 for (i=0; i<argc; i++) {
7465 VALUE s = argv[i];
7466
7467 StringValue(s);
7468 enc = rb_enc_check(str, s);
7469 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7470 }
7471
7472 str_modify_keep_cr(str);
7473 ascompat = rb_enc_asciicompat(enc);
7474 s = t = RSTRING_PTR(str);
7475 send = RSTRING_END(str);
7476 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7477 while (s < send) {
7478 unsigned int c;
7479 int clen;
7480
7481 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7482 if (squeez[c]) {
7483 modify = 1;
7484 }
7485 else {
7486 if (t != s) *t = c;
7487 t++;
7488 }
7489 s++;
7490 }
7491 else {
7492 c = rb_enc_codepoint_len(s, send, &clen, enc);
7493
7494 if (tr_find(c, squeez, del, nodel)) {
7495 modify = 1;
7496 }
7497 else {
7498 if (t != s) rb_enc_mbcput(c, t, enc);
7499 t += clen;
7501 }
7502 s += clen;
7503 }
7504 }
7505 TERM_FILL(t, TERM_LEN(str));
7508
7509 if (modify) return str;
7510 return Qnil;
7511}
7512
7513
7514/*
7515 * call-seq:
7516 * str.delete([other_str]+) -> new_str
7517 *
7518 * Returns a copy of <i>str</i> with all characters in the intersection of its
7519 * arguments deleted. Uses the same rules for building the set of characters as
7520 * String#count.
7521 *
7522 * "hello".delete "l","lo" #=> "heo"
7523 * "hello".delete "lo" #=> "he"
7524 * "hello".delete "aeiou", "^e" #=> "hell"
7525 * "hello".delete "ej-m" #=> "ho"
7526 */
7527
7528static VALUE
7529rb_str_delete(int argc, VALUE *argv, VALUE str)
7530{
7531 str = rb_str_dup(str);
7532 rb_str_delete_bang(argc, argv, str);
7533 return str;
7534}
7535
7536
7537/*
7538 * call-seq:
7539 * str.squeeze!([other_str]*) -> str or nil
7540 *
7541 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
7542 * <code>nil</code> if no changes were made.
7543 */
7544
7545static VALUE
7546rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7547{
7548 char squeez[TR_TABLE_SIZE];
7549 rb_encoding *enc = 0;
7550 VALUE del = 0, nodel = 0;
7551 unsigned char *s, *send, *t;
7552 int i, modify = 0;
7553 int ascompat, singlebyte = single_byte_optimizable(str);
7554 unsigned int save;
7555
7556 if (argc == 0) {
7557 enc = STR_ENC_GET(str);
7558 }
7559 else {
7560 for (i=0; i<argc; i++) {
7561 VALUE s = argv[i];
7562
7563 StringValue(s);
7564 enc = rb_enc_check(str, s);
7565 if (singlebyte && !single_byte_optimizable(s))
7566 singlebyte = 0;
7567 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7568 }
7569 }
7570
7571 str_modify_keep_cr(str);
7572 s = t = (unsigned char *)RSTRING_PTR(str);
7573 if (!s || RSTRING_LEN(str) == 0) return Qnil;
7574 send = (unsigned char *)RSTRING_END(str);
7575 save = -1;
7576 ascompat = rb_enc_asciicompat(enc);
7577
7578 if (singlebyte) {
7579 while (s < send) {
7580 unsigned int c = *s++;
7581 if (c != save || (argc > 0 && !squeez[c])) {
7582 *t++ = save = c;
7583 }
7584 }
7585 }
7586 else {
7587 while (s < send) {
7588 unsigned int c;
7589 int clen;
7590
7591 if (ascompat && (c = *s) < 0x80) {
7592 if (c != save || (argc > 0 && !squeez[c])) {
7593 *t++ = save = c;
7594 }
7595 s++;
7596 }
7597 else {
7598 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
7599
7600 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
7601 if (t != s) rb_enc_mbcput(c, t, enc);
7602 save = c;
7603 t += clen;
7604 }
7605 s += clen;
7606 }
7607 }
7608 }
7609
7610 TERM_FILL((char *)t, TERM_LEN(str));
7611 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7612 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
7613 modify = 1;
7614 }
7615
7616 if (modify) return str;
7617 return Qnil;
7618}
7619
7620
7621/*
7622 * call-seq:
7623 * str.squeeze([other_str]*) -> new_str
7624 *
7625 * Builds a set of characters from the <i>other_str</i> parameter(s)
7626 * using the procedure described for String#count. Returns a new
7627 * string where runs of the same character that occur in this set are
7628 * replaced by a single character. If no arguments are given, all
7629 * runs of identical characters are replaced by a single character.
7630 *
7631 * "yellow moon".squeeze #=> "yelow mon"
7632 * " now is the".squeeze(" ") #=> " now is the"
7633 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7634 */
7635
7636static VALUE
7637rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7638{
7639 str = rb_str_dup(str);
7640 rb_str_squeeze_bang(argc, argv, str);
7641 return str;
7642}
7643
7644
7645/*
7646 * call-seq:
7647 * str.tr_s!(from_str, to_str) -> str or nil
7648 *
7649 * Performs String#tr_s processing on <i>str</i> in place,
7650 * returning <i>str</i>, or <code>nil</code> if no changes were made.
7651 */
7652
7653static VALUE
7654rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7655{
7656 return tr_trans(str, src, repl, 1);
7657}
7658
7659
7660/*
7661 * call-seq:
7662 * str.tr_s(from_str, to_str) -> new_str
7663 *
7664 * Processes a copy of <i>str</i> as described under String#tr, then
7665 * removes duplicate characters in regions that were affected by the
7666 * translation.
7667 *
7668 * "hello".tr_s('l', 'r') #=> "hero"
7669 * "hello".tr_s('el', '*') #=> "h*o"
7670 * "hello".tr_s('el', 'hx') #=> "hhxo"
7671 */
7672
7673static VALUE
7674rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7675{
7676 str = rb_str_dup(str);
7677 tr_trans(str, src, repl, 1);
7678 return str;
7679}
7680
7681
7682/*
7683 * call-seq:
7684 * str.count([other_str]+) -> integer
7685 *
7686 * Each +other_str+ parameter defines a set of characters to count. The
7687 * intersection of these sets defines the characters to count in +str+. Any
7688 * +other_str+ that starts with a caret <code>^</code> is negated. The
7689 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7690 * backslash character <code>\</code> can be used to escape <code>^</code> or
7691 * <code>-</code> and is otherwise ignored unless it appears at the end of a
7692 * sequence or the end of an +other_str+.
7693 *
7694 * a = "hello world"
7695 * a.count "lo" #=> 5
7696 * a.count "lo", "o" #=> 2
7697 * a.count "hello", "^l" #=> 4
7698 * a.count "ej-m" #=> 4
7699 *
7700 * "hello^world".count "\\^aeiou" #=> 4
7701 * "hello-world".count "a\\-eo" #=> 4
7702 *
7703 * c = "hello world\\r\\n"
7704 * c.count "\\" #=> 2
7705 * c.count "\\A" #=> 0
7706 * c.count "X-\\w" #=> 3
7707 */
7708
7709static VALUE
7710rb_str_count(int argc, VALUE *argv, VALUE str)
7711{
7712 char table[TR_TABLE_SIZE];
7713 rb_encoding *enc = 0;
7714 VALUE del = 0, nodel = 0, tstr;
7715 char *s, *send;
7716 int i;
7717 int ascompat;
7718
7720
7721 tstr = argv[0];
7722 StringValue(tstr);
7723 enc = rb_enc_check(str, tstr);
7724 if (argc == 1) {
7725 const char *ptstr;
7726 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7727 (ptstr = RSTRING_PTR(tstr),
7728 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7730 int n = 0;
7731 int clen;
7732 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7733
7734 s = RSTRING_PTR(str);
7735 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7736 send = RSTRING_END(str);
7737 while (s < send) {
7738 if (*(unsigned char*)s++ == c) n++;
7739 }
7740 return INT2NUM(n);
7741 }
7742 }
7743
7744 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
7745 for (i=1; i<argc; i++) {
7746 tstr = argv[i];
7747 StringValue(tstr);
7748 enc = rb_enc_check(str, tstr);
7749 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7750 }
7751
7752 s = RSTRING_PTR(str);
7753 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7754 send = RSTRING_END(str);
7755 ascompat = rb_enc_asciicompat(enc);
7756 i = 0;
7757 while (s < send) {
7758 unsigned int c;
7759
7760 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7761 if (table[c]) {
7762 i++;
7763 }
7764 s++;
7765 }
7766 else {
7767 int clen;
7768 c = rb_enc_codepoint_len(s, send, &clen, enc);
7769 if (tr_find(c, table, del, nodel)) {
7770 i++;
7771 }
7772 s += clen;
7773 }
7774 }
7775
7776 return INT2NUM(i);
7777}
7778
7779static VALUE
7780rb_fs_check(VALUE val)
7781{
7782 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7783 val = rb_check_string_type(val);
7784 if (NIL_P(val)) return 0;
7785 }
7786 return val;
7787}
7788
/*
 * Byte-indexed lookup table: 1 for the six whitespace bytes 0x09-0x0d and
 * 0x20, 0 for every other byte value (unlisted entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7809
7810static long
7811split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
7812{
7813 if (empty_count >= 0 && len == 0) {
7814 return empty_count + 1;
7815 }
7816 if (empty_count > 0) {
7817 /* make different substrings */
7818 if (result) {
7819 do {
7820 rb_ary_push(result, str_new_empty(str));
7821 } while (--empty_count > 0);
7822 }
7823 else {
7824 do {
7825 rb_yield(str_new_empty(str));
7826 } while (--empty_count > 0);
7827 }
7828 }
7829 str = rb_str_subseq(str, beg, len);
7830 if (result) {
7831 rb_ary_push(result, str);
7832 }
7833 else {
7834 rb_yield(str);
7835 }
7836 return empty_count;
7837}
7838
7839/*
7840 * call-seq:
7841 * str.split(pattern=nil, [limit]) -> an_array
7842 * str.split(pattern=nil, [limit]) {|sub| block } -> str
7843 *
7844 * Divides <i>str</i> into substrings based on a delimiter, returning an array
7845 * of these substrings.
7846 *
7847 * If <i>pattern</i> is a String, then its contents are used as
7848 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7849 * space, <i>str</i> is split on whitespace, with leading and trailing
7850 * whitespace and runs of contiguous whitespace characters ignored.
7851 *
7852 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
7853 * pattern matches. Whenever the pattern matches a zero-length string,
7854 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
7855 * groups, the respective matches will be returned in the array as well.
7856 *
7857 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7858 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7859 * split on whitespace as if ' ' were specified.
7860 *
7861 * If the <i>limit</i> parameter is omitted, trailing null fields are
7862 * suppressed. If <i>limit</i> is a positive number, at most that number
7863 * of split substrings will be returned (captured groups will be returned
7864 * as well, but are not counted towards the limit).
7865 * If <i>limit</i> is <code>1</code>, the entire
7866 * string is returned as the only entry in an array. If negative, there is no
7867 * limit to the number of fields returned, and trailing null fields are not
7868 * suppressed.
7869 *
7870 * When the input +str+ is empty an empty Array is returned as the string is
7871 * considered to have no fields to split.
7872 *
7873 * " now's the time ".split #=> ["now's", "the", "time"]
7874 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
7875 * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
7876 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7877 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
7878 * "hello".split(//, 3) #=> ["h", "e", "llo"]
7879 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
7880 *
7881 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
7882 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
7883 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
7884 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
7885 *
7886 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
7887 *
7888 * "".split(',', -1) #=> []
7889 *
7890 * If a block is given, invoke the block with each split substring.
7891 *
7892 */
7893
7894static VALUE
7895rb_str_split_m(int argc, VALUE *argv, VALUE str)
7896{
 /*
  * Implementation of String#split.  Takes an optional pattern and an
  * optional limit.  Fields are either collected into an array (which is
  * returned) or yielded to the given block (in which case str is
  * returned).
  */
7897 rb_encoding *enc;
7898 VALUE spat;
7899 VALUE limit;
7900 enum {awk, string, regexp, chars} split_type;
 /* i counts fields against lim; empty_count is the held-back empty-field
  * counter threaded through split_string (-1 = do not hold back). */
7901 long beg, end, i = 0, empty_count = -1;
7902 int lim = 0;
7903 VALUE result, tmp;
7904
 /* Qfalse when a block is given, Qnil otherwise.  The later plain C
  * tests on result presumably rely on Qnil being nonzero -- its value is
  * defined outside this view. */
7905 result = rb_block_given_p() ? Qfalse : Qnil;
7906 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
7907 lim = NUM2INT(limit);
 /* A non-positive limit means "no cap": limit is reset to nil while
  * lim keeps its sign so that a negative limit can be recognised later. */
7908 if (lim <= 0) limit = Qnil;
7909 else if (lim == 1) {
 /* Limit 1: the whole string is the only field (or there are no
  * fields at all when the string is empty). */
7910 if (RSTRING_LEN(str) == 0)
7911 return result ? rb_ary_new2(0) : str;
7912 tmp = rb_str_dup(str);
7913 if (!result) {
7914 rb_yield(tmp);
7915 return str;
7916 }
7917 return rb_ary_new3(1, tmp);
7918 }
7919 i = 1;
7920 }
 /* No limit, or limit 0: hold back empty fields so that trailing ones
  * are never emitted (see split_string). */
7921 if (NIL_P(limit) && !lim) empty_count = 0;
7922
7923 enc = STR_ENC_GET(str);
 /* Decide how to split.  regexp is the default; an explicit pattern goes
  * through get_pat_quoted, otherwise $; (rb_fs) is consulted and a nil
  * $; selects awk-style whitespace splitting. */
7924 split_type = regexp;
7925 if (!NIL_P(spat)) {
7926 spat = get_pat_quoted(spat, 0);
7927 }
7928 else if (NIL_P(spat = rb_fs)) {
7929 split_type = awk;
7930 }
7931 else if (!(spat = rb_fs_check(spat))) {
7932 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
7933 }
7934 else {
7935 rb_warn("$; is set to non-nil value");
7936 }
7937 if (split_type != awk) {
7938 if (BUILTIN_TYPE(spat) == T_STRING) {
7939 rb_encoding *enc2 = STR_ENC_GET(spat);
7940
7941 mustnot_broken(spat);
7942 split_type = string;
7943 if (RSTRING_LEN(spat) == 0) {
7944 /* Special case - split into chars */
7945 split_type = chars;
7946 }
7947 else if (rb_enc_asciicompat(enc2) == 1) {
 /* A pattern that is exactly one space selects awk mode. */
7948 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
7949 split_type = awk;
7950 }
7951 }
7952 else {
 /* Same single-space test for encodings that are not
  * ASCII-compatible: the pattern must be one character that
  * rb_enc_ascget reads as ' '. */
7953 int l;
7954 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
7955 RSTRING_LEN(spat) == l) {
7956 split_type = awk;
7957 }
7958 }
7959 }
7960 }
7961
 /* Emit the field [beg, beg+len) and update the held-back empty count. */
7962#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
7963
7964 if (result) result = rb_ary_new();
7965 beg = 0;
7966 char *ptr = RSTRING_PTR(str);
7967 char *eptr = RSTRING_END(str);
7968 if (split_type == awk) {
 /* Whitespace splitting.  skip is 1 while scanning whitespace before
  * a field; beg/end are byte offsets of the current field. */
7969 char *bptr = ptr;
7970 int skip = 1;
7971 unsigned int c;
7972
7973 end = beg;
7974 if (is_ascii_string(str)) {
 /* Byte-wise scan using the ascii_isspace table. */
7975 while (ptr < eptr) {
7976 c = (unsigned char)*ptr++;
7977 if (skip) {
7978 if (ascii_isspace(c)) {
7979 beg = ptr - bptr;
7980 }
7981 else {
7982 end = ptr - bptr;
7983 skip = 0;
 /* Limit reached: stop here; the rest of the string becomes
  * the final field after the loops. */
7984 if (!NIL_P(limit) && lim <= i) break;
7985 }
7986 }
7987 else if (ascii_isspace(c)) {
7988 SPLIT_STR(beg, end-beg);
7989 skip = 1;
7990 beg = ptr - bptr;
7991 if (!NIL_P(limit)) ++i;
7992 }
7993 else {
7994 end = ptr - bptr;
7995 }
7996 }
7997 }
7998 else {
 /* Same state machine, but decoding one character at a time and
  * classifying it with rb_isspace. */
7999 while (ptr < eptr) {
8000 int n;
8001
8002 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8003 ptr += n;
8004 if (skip) {
8005 if (rb_isspace(c)) {
8006 beg = ptr - bptr;
8007 }
8008 else {
8009 end = ptr - bptr;
8010 skip = 0;
8011 if (!NIL_P(limit) && lim <= i) break;
8012 }
8013 }
8014 else if (rb_isspace(c)) {
8015 SPLIT_STR(beg, end-beg);
8016 skip = 1;
8017 beg = ptr - bptr;
8018 if (!NIL_P(limit)) ++i;
8019 }
8020 else {
8021 end = ptr - bptr;
8022 }
8023 }
8024 }
8025 }
8026 else if (split_type == string) {
 /* Literal-string delimiter: repeated rb_memsearch from ptr. */
8027 char *str_start = ptr;
8028 char *substr_start = ptr;
8029 char *sptr = RSTRING_PTR(spat);
8030 long slen = RSTRING_LEN(spat);
8031
8032 mustnot_broken(str);
8033 enc = rb_enc_check(str, spat);
8034 while (ptr < eptr &&
8035 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8036 /* Check we are at the start of a char */
8037 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8038 if (t != ptr + end) {
 /* Hit is inside a character: resume the search from the next
  * character head without emitting anything. */
8039 ptr = t;
8040 continue;
8041 }
8042 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8043 ptr += end + slen;
8044 substr_start = ptr;
8045 if (!NIL_P(limit) && lim <= ++i) break;
8046 }
8047 beg = ptr - str_start;
8048 }
8049 else if (split_type == chars) {
 /* Empty-string pattern: every character is its own field. */
8050 char *str_start = ptr;
8051 int n;
8052
8053 mustnot_broken(str);
8054 enc = rb_enc_get(str);
8055 while (ptr < eptr &&
8056 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8057 SPLIT_STR(ptr - str_start, n);
8058 ptr += n;
8059 if (!NIL_P(limit) && lim <= ++i) break;
8060 }
8061 beg = ptr - str_start;
8062 }
8063 else {
 /* Regexp delimiter.  start is where the next search begins, beg is
  * where the current field begins; last_null records that the
  * previous match at this position was empty. */
8064 long len = RSTRING_LEN(str);
8065 long start = beg;
8066 long idx;
8067 int last_null = 0;
8068 struct re_registers *regs;
8069 VALUE match = 0;
8070
 /* The for-increment un-busies the match and stores it back as the
  * backref after each iteration that reaches the end of the body or
  * continues. */
8071 for (; (end = rb_reg_search(spat, str, start, 0)) >= 0;
8072 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8073 match = rb_backref_get();
 /* Block form only: mark the match busy while fields are yielded
  * (presumably so the block cannot cause it to be reused). */
8074 if (!result) rb_match_busy(match);
8075 regs = RMATCH_REGS(match);
8076 if (start == end && BEG(0) == END(0)) {
 /* Empty match at the search position. */
8077 if (!ptr) {
8078 SPLIT_STR(0, 0);
8079 break;
8080 }
8081 else if (last_null == 1) {
 /* Second empty match in a row: emit the single character
  * at beg as a field. */
8082 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8083 beg = start;
8084 }
8085 else {
 /* First empty match: advance the search position by one
  * character (or past the end) and retry. */
8086 if (start == len)
8087 start++;
8088 else
8089 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8090 last_null = 1;
8091 continue;
8092 }
8093 }
8094 else {
8095 SPLIT_STR(beg, end-beg);
8096 beg = start = END(0);
8097 }
8098 last_null = 0;
8099
 /* Captured groups that participated in the match are emitted as
  * fields too; they do not advance i. */
8100 for (idx=1; idx < regs->num_regs; idx++) {
8101 if (BEG(idx) == -1) continue;
8102 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8103 }
8104 if (!NIL_P(limit) && lim <= ++i) break;
8105 }
8106 if (match) rb_match_unbusy(match);
8107 }
 /* Emit whatever follows the last delimiter: always when a positive or
  * negative limit was given, otherwise only when text remains. */
8108 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8109 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8110 }
8111
8112 return result ? result : str;
8113}
8114
8115VALUE
8116rb_str_split(VALUE str, const char *sep0)
8117{
8118 VALUE sep;
8119
8121 sep = rb_str_new_cstr(sep0);
8122 return rb_str_split_m(1, &sep, str);
8123}
8124
8125#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8126
8127static inline int
8128enumerator_element(VALUE ary, VALUE e)
8129{
8130 if (ary) {
8131 rb_ary_push(ary, e);
8132 return 0;
8133 }
8134 else {
8135 rb_yield(e);
8136 return 1;
8137 }
8138}
8139
8140#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8141
8142static const char *
8143chomp_newline(const char *p, const char *e, rb_encoding *enc)
8144{
8145 const char *prev = rb_enc_prev_char(p, e, e, enc);
8146 if (rb_enc_is_newline(prev, e, enc)) {
8147 e = prev;
8148 prev = rb_enc_prev_char(p, e, e, enc);
8149 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8150 e = prev;
8151 }
8152 return e;
8153}
8154
8155static VALUE
8156rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8157{
8158 rb_encoding *enc;
8159 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8160 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8161 long pos, len, rslen;
8162 int rsnewline = 0;
8163
8164 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8165 rs = rb_rs;
8166 if (!NIL_P(opts)) {
8167 static ID keywords[1];
8168 if (!keywords[0]) {
8169 keywords[0] = rb_intern_const("chomp");
8170 }
8171 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8172 chomp = (chomp != Qundef && RTEST(chomp));
8173 }
8174
8175 if (NIL_P(rs)) {
8176 if (!ENUM_ELEM(ary, str)) {
8177 return ary;
8178 }
8179 else {
8180 return orig;
8181 }
8182 }
8183
8184 if (!RSTRING_LEN(str)) goto end;
8186 ptr = subptr = RSTRING_PTR(str);
8187 pend = RSTRING_END(str);
8188 len = RSTRING_LEN(str);
8189 StringValue(rs);
8190 rslen = RSTRING_LEN(rs);
8191
8192 if (rs == rb_default_rs)
8193 enc = rb_enc_get(str);
8194 else
8195 enc = rb_enc_check(str, rs);
8196
8197 if (rslen == 0) {
8198 /* paragraph mode */
8199 int n;
8200 const char *eol = NULL;
8201 subend = subptr;
8202 while (subend < pend) {
8203 do {
8204 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8205 n = 0;
8206 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8207 if (rb_enc_is_newline(subend + n, pend, enc)) {
8208 if (eol == subend) break;
8209 subend += rslen;
8210 if (subptr) eol = subend;
8211 }
8212 else {
8213 if (!subptr) subptr = subend;
8214 subend += rslen;
8215 }
8216 rslen = 0;
8217 } while (subend < pend);
8218 if (!subptr) break;
8219 line = rb_str_subseq(str, subptr - ptr,
8220 subend - subptr + (chomp ? 0 : rslen));
8221 if (ENUM_ELEM(ary, line)) {
8222 str_mod_check(str, ptr, len);
8223 }
8224 subptr = eol = NULL;
8225 }
8226 goto end;
8227 }
8228 else {
8229 rsptr = RSTRING_PTR(rs);
8230 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8231 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8232 rsnewline = 1;
8233 }
8234 }
8235
8236 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8237 rs = rb_str_new(rsptr, rslen);
8238 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8239 rsptr = RSTRING_PTR(rs);
8240 rslen = RSTRING_LEN(rs);
8241 }
8242
8243 while (subptr < pend) {
8244 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8245 if (pos < 0) break;
8246 hit = subptr + pos;
8247 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8248 if (hit != adjusted) {
8249 subptr = adjusted;
8250 continue;
8251 }
8252 subend = hit += rslen;
8253 if (chomp) {
8254 if (rsnewline) {
8255 subend = chomp_newline(subptr, subend, enc);
8256 }
8257 else {
8258 subend -= rslen;
8259 }
8260 }
8261 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8262 if (ENUM_ELEM(ary, line)) {
8263 str_mod_check(str, ptr, len);
8264 }
8265 subptr = hit;
8266 }
8267
8268 if (subptr != pend) {
8269 if (chomp) {
8270 if (rsnewline) {
8271 pend = chomp_newline(subptr, pend, enc);
8272 }
8273 else if (pend - subptr >= rslen &&
8274 memcmp(pend - rslen, rsptr, rslen) == 0) {
8275 pend -= rslen;
8276 }
8277 }
8278 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8279 ENUM_ELEM(ary, line);
8281 }
8282
8283 end:
8284 if (ary)
8285 return ary;
8286 else
8287 return orig;
8288}
8289
8290/*
8291 * call-seq:
8292 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8293 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8294 *
8295 * Splits <i>str</i> using the supplied parameter as the record
8296 * separator (<code>$/</code> by default), passing each substring in
8297 * turn to the supplied block. If a zero-length record separator is
8298 * supplied, the string is split into paragraphs delimited by
8299 * multiple successive newlines.
8300 *
8301 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8302 * line.
8303 *
8304 * If no block is given, an enumerator is returned instead.
8305 *
8306 * "hello\nworld".each_line {|s| p s}
8307 * # prints:
8308 * # "hello\n"
8309 * # "world"
8310 *
8311 * "hello\nworld".each_line('l') {|s| p s}
8312 * # prints:
8313 * # "hel"
8314 * # "l"
8315 * # "o\nworl"
8316 * # "d"
8317 *
8318 * "hello\n\n\nworld".each_line('') {|s| p s}
8319 * # prints
8320 * # "hello\n\n"
8321 * # "world"
8322 *
8323 * "hello\nworld".each_line(chomp: true) {|s| p s}
8324 * # prints:
8325 * # "hello"
8326 * # "world"
8327 *
8328 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8329 * # prints:
8330 * # "he"
8331 * # ""
8332 * # "o\nwor"
8333 * # "d"
8334 *
8335 */
8336
8337static VALUE
8338rb_str_each_line(int argc, VALUE *argv, VALUE str)
8339{
8341 return rb_str_enumerate_lines(argc, argv, str, 0);
8342}
8343
8344/*
8345 * call-seq:
8346 * str.lines(separator=$/, chomp: false) -> an_array
8347 *
8348 * Returns an array of lines in <i>str</i> split using the supplied
8349 * record separator (<code>$/</code> by default). This is a
8350 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8351 *
8352 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8353 * line.
8354 *
8355 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8356 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8357 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8358 *
8359 * If a block is given, which is a deprecated form, works the same as
8360 * <code>each_line</code>.
8361 */
8362
8363static VALUE
8364rb_str_lines(int argc, VALUE *argv, VALUE str)
8365{
8366 VALUE ary = WANTARRAY("lines", 0);
8367 return rb_str_enumerate_lines(argc, argv, str, ary);
8368}
8369
8370static VALUE
8371rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8372{
8373 return LONG2FIX(RSTRING_LEN(str));
8374}
8375
8376static VALUE
8377rb_str_enumerate_bytes(VALUE str, VALUE ary)
8378{
8379 long i;
8380
8381 for (i=0; i<RSTRING_LEN(str); i++) {
8382 ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
8383 }
8384 if (ary)
8385 return ary;
8386 else
8387 return str;
8388}
8389
8390/*
8391 * call-seq:
8392 * str.each_byte {|integer| block } -> str
8393 * str.each_byte -> an_enumerator
8394 *
8395 * Passes each byte in <i>str</i> to the given block, or returns an
8396 * enumerator if no block is given.
8397 *
8398 * "hello".each_byte {|c| print c, ' ' }
8399 *
8400 * <em>produces:</em>
8401 *
8402 * 104 101 108 108 111
8403 */
8404
8405static VALUE
8406rb_str_each_byte(VALUE str)
8407{
8408 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8409 return rb_str_enumerate_bytes(str, 0);
8410}
8411
8412/*
8413 * call-seq:
8414 * str.bytes -> an_array
8415 *
8416 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8417 * <code>str.each_byte.to_a</code>.
8418 *
8419 * If a block is given, which is a deprecated form, works the same as
8420 * <code>each_byte</code>.
8421 */
8422
8423static VALUE
8424rb_str_bytes(VALUE str)
8425{
8426 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8427 return rb_str_enumerate_bytes(str, ary);
8428}
8429
8430static VALUE
8431rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8432{
8433 return rb_str_length(str);
8434}
8435
8436static VALUE
8437rb_str_enumerate_chars(VALUE str, VALUE ary)
8438{
8439 VALUE orig = str;
8440 long i, len, n;
8441 const char *ptr;
8442 rb_encoding *enc;
8443
8445 ptr = RSTRING_PTR(str);
8446 len = RSTRING_LEN(str);
8447 enc = rb_enc_get(str);
8448
8450 for (i = 0; i < len; i += n) {
8451 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
8453 }
8454 }
8455 else {
8456 for (i = 0; i < len; i += n) {
8457 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
8459 }
8460 }
8462 if (ary)
8463 return ary;
8464 else
8465 return orig;
8466}
8467
8468/*
8469 * call-seq:
8470 * str.each_char {|cstr| block } -> str
8471 * str.each_char -> an_enumerator
8472 *
8473 * Passes each character in <i>str</i> to the given block, or returns
8474 * an enumerator if no block is given.
8475 *
8476 * "hello".each_char {|c| print c, ' ' }
8477 *
8478 * <em>produces:</em>
8479 *
8480 * h e l l o
8481 */
8482
8483static VALUE
8484rb_str_each_char(VALUE str)
8485{
8486 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8487 return rb_str_enumerate_chars(str, 0);
8488}
8489
8490/*
8491 * call-seq:
8492 * str.chars -> an_array
8493 *
8494 * Returns an array of characters in <i>str</i>. This is a shorthand
8495 * for <code>str.each_char.to_a</code>.
8496 *
8497 * If a block is given, which is a deprecated form, works the same as
8498 * <code>each_char</code>.
8499 */
8500
8501static VALUE
8502rb_str_chars(VALUE str)
8503{
8504 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
8505 return rb_str_enumerate_chars(str, ary);
8506}
8507
8508static VALUE
8509rb_str_enumerate_codepoints(VALUE str, VALUE ary)
8510{
8511 VALUE orig = str;
8512 int n;
8513 unsigned int c;
8514 const char *ptr, *end;
8515 rb_encoding *enc;
8516
8517 if (single_byte_optimizable(str))
8518 return rb_str_enumerate_bytes(str, ary);
8519
8521 ptr = RSTRING_PTR(str);
8522 end = RSTRING_END(str);
8523 enc = STR_ENC_GET(str);
8524
8525 while (ptr < end) {
8526 c = rb_enc_codepoint_len(ptr, end, &n, enc);
8527 ENUM_ELEM(ary, UINT2NUM(c));
8528 ptr += n;
8529 }
8531 if (ary)
8532 return ary;
8533 else
8534 return orig;
8535}
8536
8537/*
8538 * call-seq:
8539 * str.each_codepoint {|integer| block } -> str
8540 * str.each_codepoint -> an_enumerator
8541 *
8542 * Passes the Integer ordinal of each character in <i>str</i>,
8543 * also known as a <i>codepoint</i> when applied to Unicode strings, to the
8544 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
8545 * values are directly derived from the binary representation
8546 * of each character.
8547 *
8548 * If no block is given, an enumerator is returned instead.
8549 *
8550 * "hello\u0639".each_codepoint {|c| print c, ' ' }
8551 *
8552 * <em>produces:</em>
8553 *
8554 * 104 101 108 108 111 1593
8555 */
8556
8557static VALUE
8558rb_str_each_codepoint(VALUE str)
8559{
8560 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8561 return rb_str_enumerate_codepoints(str, 0);
8562}
8563
8564/*
8565 * call-seq:
8566 * str.codepoints -> an_array
8567 *
8568 * Returns an array of the Integer ordinals of the
8569 * characters in <i>str</i>. This is a shorthand for
8570 * <code>str.each_codepoint.to_a</code>.
8571 *
8572 * If a block is given, which is a deprecated form, works the same as
8573 * <code>each_codepoint</code>.
8574 */
8575
8576static VALUE
8577rb_str_codepoints(VALUE str)
8578{
8579 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
8580 return rb_str_enumerate_codepoints(str, ary);
8581}
8582
8583static regex_t *
8584get_reg_grapheme_cluster(rb_encoding *enc)
8585{
8586 int encidx = rb_enc_to_index(enc);
8587 regex_t *reg_grapheme_cluster = NULL;
8588 static regex_t *reg_grapheme_cluster_utf8 = NULL;
8589
8590 /* synchronize */
8591 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8592 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8593 }
8594 if (!reg_grapheme_cluster) {
8595 const OnigUChar source_ascii[] = "\\X";
8596 OnigErrorInfo einfo;
8597 const OnigUChar *source = source_ascii;
8598 size_t source_len = sizeof(source_ascii) - 1;
8599 switch (encidx) {
8600#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
8601#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
8602#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
8603#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
8604#define CASE_UTF(e) \
8605 case ENCINDEX_UTF_##e: { \
8606 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
8607 source = source_UTF_##e; \
8608 source_len = sizeof(source_UTF_##e); \
8609 break; \
8610 }
8611 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
8612#undef CASE_UTF
8613#undef CHARS_16BE
8614#undef CHARS_16LE
8615#undef CHARS_32BE
8616#undef CHARS_32LE
8617 }
8618 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
8620 if (r) {
8622 onig_error_code_to_str(message, r, &einfo);
8623 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
8624 }
8625 if (encidx == rb_utf8_encindex()) {
8626 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8627 }
8628 }
8629 return reg_grapheme_cluster;
8630}
8631
8632static VALUE
8633rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8634{
8635 size_t grapheme_cluster_count = 0;
8636 regex_t *reg_grapheme_cluster = NULL;
8638 const char *ptr, *end;
8639
8640 if (!rb_enc_unicode_p(enc)) {
8641 return rb_str_length(str);
8642 }
8643
8644 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8645 ptr = RSTRING_PTR(str);
8646 end = RSTRING_END(str);
8647
8648 while (ptr < end) {
8649 OnigPosition len = onig_match(reg_grapheme_cluster,
8650 (const OnigUChar *)ptr, (const OnigUChar *)end,
8651 (const OnigUChar *)ptr, NULL, 0);
8652 if (len <= 0) break;
8653 grapheme_cluster_count++;
8654 ptr += len;
8655 }
8656
8657 return SIZET2NUM(grapheme_cluster_count);
8658}
8659
8660static VALUE
8661rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8662{
8663 VALUE orig = str;
8664 regex_t *reg_grapheme_cluster = NULL;
8666 const char *ptr0, *ptr, *end;
8667
8668 if (!rb_enc_unicode_p(enc)) {
8669 return rb_str_enumerate_chars(str, ary);
8670 }
8671
8672 if (!ary) str = rb_str_new_frozen(str);
8673 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8674 ptr0 = ptr = RSTRING_PTR(str);
8675 end = RSTRING_END(str);
8676
8677 while (ptr < end) {
8678 OnigPosition len = onig_match(reg_grapheme_cluster,
8679 (const OnigUChar *)ptr, (const OnigUChar *)end,
8680 (const OnigUChar *)ptr, NULL, 0);
8681 if (len <= 0) break;
8683 ptr += len;
8684 }
8686 if (ary)
8687 return ary;
8688 else
8689 return orig;
8690}
8691
8692/*
8693 * call-seq:
8694 * str.each_grapheme_cluster {|cstr| block } -> str
8695 * str.each_grapheme_cluster -> an_enumerator
8696 *
8697 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
8698 * an enumerator if no block is given.
8699 * Unlike String#each_char, this enumerates by grapheme clusters defined by
8700 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
8701 *
8702 * "a\u0300".each_char.to_a.size #=> 2
8703 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
8704 *
8705 */
8706
8707static VALUE
8708rb_str_each_grapheme_cluster(VALUE str)
8709{
8710 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
8711 return rb_str_enumerate_grapheme_clusters(str, 0);
8712}
8713
8714/*
8715 * call-seq:
8716 * str.grapheme_clusters -> an_array
8717 *
8718 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
8719 * for <code>str.each_grapheme_cluster.to_a</code>.
8720 *
8721 * If a block is given, which is a deprecated form, works the same as
8722 * <code>each_grapheme_cluster</code>.
8723 */
8724
8725static VALUE
8726rb_str_grapheme_clusters(VALUE str)
8727{
8728 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
8729 return rb_str_enumerate_grapheme_clusters(str, ary);
8730}
8731
8732static long
8733chopped_length(VALUE str)
8734{
8735 rb_encoding *enc = STR_ENC_GET(str);
8736 const char *p, *p2, *beg, *end;
8737
8738 beg = RSTRING_PTR(str);
8739 end = beg + RSTRING_LEN(str);
8740 if (beg >= end) return 0;
8741 p = rb_enc_prev_char(beg, end, end, enc);
8742 if (!p) return 0;
8743 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
8744 p2 = rb_enc_prev_char(beg, p, end, enc);
8745 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
8746 }
8747 return p - beg;
8748}
8749
8750/*
8751 * call-seq:
8752 * str.chop! -> str or nil
8753 *
8754 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
8755 * <code>nil</code> if <i>str</i> is the empty string. See also
8756 * String#chomp!.
8757 */
8758
8759static VALUE
8760rb_str_chop_bang(VALUE str)
8761{
8762 str_modify_keep_cr(str);
8763 if (RSTRING_LEN(str) > 0) {
8764 long len;
8765 len = chopped_length(str);
8770 }
8771 return str;
8772 }
8773 return Qnil;
8774}
8775
8776
8777/*
8778 * call-seq:
8779 * str.chop -> new_str
8780 *
8781 * Returns a new String with the last character removed. If the
8782 * string ends with <code>\r\n</code>, both characters are
8783 * removed. Applying <code>chop</code> to an empty string returns an
8784 * empty string. String#chomp is often a safer alternative, as it
8785 * leaves the string unchanged if it doesn't end in a record
8786 * separator.
8787 *
8788 * "string\r\n".chop #=> "string"
8789 * "string\n\r".chop #=> "string\n"
8790 * "string\n".chop #=> "string"
8791 * "string".chop #=> "strin"
8792 * "x".chop.chop #=> ""
8793 */
8794
8795static VALUE
8796rb_str_chop(VALUE str)
8797{
8798 return rb_str_subseq(str, 0, chopped_length(str));
8799}
8800
8801
/*
 * Returns the byte length str would have after chomping the record
 * separator rs from its end.  Returns the current length when nothing
 * would be removed.  Does not modify str.
 */
8802static long
8803chompped_length(VALUE str, VALUE rs)
8804{
8805 rb_encoding *enc;
8806 int newline;
8807 char *pp, *e, *rsptr;
8808 long rslen;
8809 char *const p = RSTRING_PTR(str);
8810 long len = RSTRING_LEN(str);
8811
8812 if (len == 0) return 0;
 /* e is the candidate end pointer; it is moved left as characters are
  * chomped, and e - p is the result. */
8813 e = p + len;
8814 if (rs == rb_default_rs) {
 /* "Smart" chomp: also entered via goto below when rs is a single
  * newline character. */
8815 smart_chomp:
8816 enc = rb_enc_get(str);
8817 if (rb_enc_mbminlen(enc) > 1) {
 /* Encodings whose minimum character length is above one byte:
  * drop a trailing newline, then a trailing '\r' if present. */
8818 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8819 if (rb_enc_is_newline(pp, e, enc)) {
8820 e = pp;
8821 }
8822 pp = e - rb_enc_mbminlen(enc);
8823 if (pp >= p) {
8824 pp = rb_enc_left_char_head(p, pp, e, enc);
8825 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8826 e = pp;
8827 }
8828 }
8829 }
8830 else {
 /* Byte-wise case: removes "\n", "\r\n" or a lone "\r". */
8831 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
8832 case '\n':
8833 if (--e > p && *(e-1) == '\r') {
8834 --e;
8835 }
8836 break;
8837 case '\r':
8838 --e;
8839 break;
8840 }
8841 }
8842 return e - p;
8843 }
8844
8845 enc = rb_enc_get(str);
8846 RSTRING_GETMEM(rs, rsptr, rslen);
8847 if (rslen == 0) {
 /* Empty separator: strip every trailing newline, each optionally
  * preceded by '\r'. */
8848 if (rb_enc_mbminlen(enc) > 1) {
8849 while (e > p) {
8850 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8851 if (!rb_enc_is_newline(pp, e, enc)) break;
8852 e = pp;
8853 pp -= rb_enc_mbminlen(enc);
8854 if (pp >= p) {
8855 pp = rb_enc_left_char_head(p, pp, e, enc);
8856 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8857 e = pp;
8858 }
8859 }
8860 }
8861 }
8862 else {
8863 while (e > p && *(e-1) == '\n') {
8864 --e;
8865 if (e > p && *(e-1) == '\r')
8866 --e;
8867 }
8868 }
8869 return e - p;
8870 }
 /* A separator longer than the string cannot match. */
8871 if (rslen > len) return len;
8872
8873 enc = rb_enc_get(rs);
 /* Last byte of the separator; used as a cheap pre-check below. */
8874 newline = rsptr[rslen-1];
8875 if (rslen == rb_enc_mbminlen(enc)) {
 /* A separator that is exactly one minimum-length character and is a
  * newline gets the smart-chomp treatment. */
8876 if (rslen == 1) {
8877 if (newline == '\n')
8878 goto smart_chomp;
8879 }
8880 else {
8881 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
8882 goto smart_chomp;
8883 }
8884 }
8885
8886 enc = rb_enc_check(str, rs);
8887 if (is_broken_string(rs)) {
8888 return len;
8889 }
 /* General case: the tail of str must equal rs byte for byte, and the
  * match must begin on a character boundary. */
8890 pp = e - rslen;
8891 if (p[len-1] == newline &&
8892 (rslen <= 1 ||
8893 memcmp(rsptr, pp, rslen) == 0)) {
8894 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
8895 return len - rslen;
8896 RB_GC_GUARD(rs);
8897 }
8898 return len;
8899}
8900
8906static VALUE
8907chomp_rs(int argc, const VALUE *argv)
8908{
8909 rb_check_arity(argc, 0, 1);
8910 if (argc > 0) {
8911 VALUE rs = argv[0];
8912 if (!NIL_P(rs)) StringValue(rs);
8913 return rs;
8914 }
8915 else {
8916 return rb_rs;
8917 }
8918}
8919
8920VALUE
8922{
8923 long olen = RSTRING_LEN(str);
8924 long len = chompped_length(str, rs);
8925 if (len >= olen) return Qnil;
8926 str_modify_keep_cr(str);
8931 }
8932 return str;
8933}
8934
8935/*
8936 * call-seq:
8937 * str.chomp!(separator=$/) -> str or nil
8938 *
8939 * Modifies <i>str</i> in place as described for String#chomp,
8940 * returning <i>str</i>, or <code>nil</code> if no modifications were
8941 * made.
8942 */
8943
8944static VALUE
8945rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
8946{
8947 VALUE rs;
8948 str_modifiable(str);
8949 if (RSTRING_LEN(str) == 0) return Qnil;
8950 rs = chomp_rs(argc, argv);
8951 if (NIL_P(rs)) return Qnil;
8952 return rb_str_chomp_string(str, rs);
8953}
8954
8955
8956/*
8957 * call-seq:
8958 * str.chomp(separator=$/) -> new_str
8959 *
8960 * Returns a new String with the given record separator removed
8961 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
8962 * changed from the default Ruby record separator, then <code>chomp</code> also
8963 * removes carriage return characters (that is it will remove <code>\n</code>,
8964 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8965 * it will remove all trailing newlines from the string.
8966 *
8967 * "hello".chomp #=> "hello"
8968 * "hello\n".chomp #=> "hello"
8969 * "hello\r\n".chomp #=> "hello"
8970 * "hello\n\r".chomp #=> "hello\n"
8971 * "hello\r".chomp #=> "hello"
8972 * "hello \n there".chomp #=> "hello \n there"
8973 * "hello".chomp("llo") #=> "he"
8974 * "hello\r\n\r\n".chomp('') #=> "hello"
8975 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
8976 */
8977
8978static VALUE
8979rb_str_chomp(int argc, VALUE *argv, VALUE str)
8980{
8981 VALUE rs = chomp_rs(argc, argv);
8982 if (NIL_P(rs)) return rb_str_dup(str);
8983 return rb_str_subseq(str, 0, chompped_length(str, rs));
8984}
8985
8986static long
8987lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8988{
8989 const char *const start = s;
8990
8991 if (!s || s >= e) return 0;
8992
8993 /* remove spaces at head */
8994 if (single_byte_optimizable(str)) {
8995 while (s < e && ascii_isspace(*s)) s++;
8996 }
8997 else {
8998 while (s < e) {
8999 int n;
9000 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9001
9002 if (!rb_isspace(cc)) break;
9003 s += n;
9004 }
9005 }
9006 return s - start;
9007}
9008
9009/*
9010 * call-seq:
9011 * str.lstrip! -> self or nil
9012 *
9013 * Removes leading whitespace from the receiver.
9014 * Returns the altered receiver, or +nil+ if no change was made.
9015 * See also String#rstrip! and String#strip!.
9016 *
9017 * Refer to String#strip for the definition of whitespace.
9018 *
9019 * " hello ".lstrip! #=> "hello "
9020 * "hello ".lstrip! #=> nil
9021 * "hello".lstrip! #=> nil
9022 */
9023
9024static VALUE
9025rb_str_lstrip_bang(VALUE str)
9026{
9027 rb_encoding *enc;
9028 char *start, *s;
9029 long olen, loffset;
9030
9031 str_modify_keep_cr(str);
9032 enc = STR_ENC_GET(str);
9033 RSTRING_GETMEM(str, start, olen);
9034 loffset = lstrip_offset(str, start, start+olen, enc);
9035 if (loffset > 0) {
9036 long len = olen-loffset;
9037 s = start + loffset;
9038 memmove(start, s, len);
9040#if !SHARABLE_MIDDLE_SUBSTRING
9041 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9042#endif
9043 return str;
9044 }
9045 return Qnil;
9046}
9047
9048
9049/*
9050 * call-seq:
9051 * str.lstrip -> new_str
9052 *
9053 * Returns a copy of the receiver with leading whitespace removed.
9054 * See also String#rstrip and String#strip.
9055 *
9056 * Refer to String#strip for the definition of whitespace.
9057 *
9058 * " hello ".lstrip #=> "hello "
9059 * "hello".lstrip #=> "hello"
9060 */
9061
9062static VALUE
9063rb_str_lstrip(VALUE str)
9064{
9065 char *start;
9066 long len, loffset;
9067 RSTRING_GETMEM(str, start, len);
9068 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9069 if (loffset <= 0) return rb_str_dup(str);
9070 return rb_str_subseq(str, loffset, len - loffset);
9071}
9072
9073static long
9074rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9075{
9076 const char *t;
9077
9078 rb_str_check_dummy_enc(enc);
9079 if (!s || s >= e) return 0;
9080 t = e;
9081
9082 /* remove trailing spaces or '\0's */
9083 if (single_byte_optimizable(str)) {
9084 unsigned char c;
9085 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9086 }
9087 else {
9088 char *tp;
9089
9090 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9091 unsigned int c = rb_enc_codepoint(tp, e, enc);
9092 if (c && !rb_isspace(c)) break;
9093 t = tp;
9094 }
9095 }
9096 return e - t;
9097}
9098
9099/*
9100 * call-seq:
9101 * str.rstrip! -> self or nil
9102 *
9103 * Removes trailing whitespace from the receiver.
9104 * Returns the altered receiver, or +nil+ if no change was made.
9105 * See also String#lstrip! and String#strip!.
9106 *
9107 * Refer to String#strip for the definition of whitespace.
9108 *
9109 * " hello ".rstrip! #=> " hello"
9110 * " hello".rstrip! #=> nil
9111 * "hello".rstrip! #=> nil
9112 */
9113
9114static VALUE
9115rb_str_rstrip_bang(VALUE str)
9116{
9117 rb_encoding *enc;
9118 char *start;
9119 long olen, roffset;
9120
9121 str_modify_keep_cr(str);
9122 enc = STR_ENC_GET(str);
9123 RSTRING_GETMEM(str, start, olen);
9124 roffset = rstrip_offset(str, start, start+olen, enc);
9125 if (roffset > 0) {
9126 long len = olen - roffset;
9127
9129#if !SHARABLE_MIDDLE_SUBSTRING
9130 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9131#endif
9132 return str;
9133 }
9134 return Qnil;
9135}
9136
9137
9138/*
9139 * call-seq:
9140 * str.rstrip -> new_str
9141 *
9142 * Returns a copy of the receiver with trailing whitespace removed.
9143 * See also String#lstrip and String#strip.
9144 *
9145 * Refer to String#strip for the definition of whitespace.
9146 *
9147 * " hello ".rstrip #=> " hello"
9148 * "hello".rstrip #=> "hello"
9149 */
9150
9151static VALUE
9152rb_str_rstrip(VALUE str)
9153{
9154 rb_encoding *enc;
9155 char *start;
9156 long olen, roffset;
9157
9158 enc = STR_ENC_GET(str);
9159 RSTRING_GETMEM(str, start, olen);
9160 roffset = rstrip_offset(str, start, start+olen, enc);
9161
9162 if (roffset <= 0) return rb_str_dup(str);
9163 return rb_str_subseq(str, 0, olen-roffset);
9164}
9165
9166
9167/*
9168 * call-seq:
9169 * str.strip! -> self or nil
9170 *
9171 * Removes leading and trailing whitespace from the receiver.
9172 * Returns the altered receiver, or +nil+ if there was no change.
9173 *
9174 * Refer to String#strip for the definition of whitespace.
9175 *
9176 * " hello ".strip! #=> "hello"
9177 * "hello".strip! #=> nil
9178 */
9179
9180static VALUE
9181rb_str_strip_bang(VALUE str)
9182{
9183 char *start;
9184 long olen, loffset, roffset;
9185 rb_encoding *enc;
9186
9187 str_modify_keep_cr(str);
9188 enc = STR_ENC_GET(str);
9189 RSTRING_GETMEM(str, start, olen);
9190 loffset = lstrip_offset(str, start, start+olen, enc);
9191 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9192
9193 if (loffset > 0 || roffset > 0) {
9194 long len = olen-roffset;
9195 if (loffset > 0) {
9196 len -= loffset;
9197 memmove(start, start + loffset, len);
9198 }
9200#if !SHARABLE_MIDDLE_SUBSTRING
9201 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9202#endif
9203 return str;
9204 }
9205 return Qnil;
9206}
9207
9208
9209/*
9210 * call-seq:
9211 * str.strip -> new_str
9212 *
9213 * Returns a copy of the receiver with leading and trailing whitespace removed.
9214 *
9215 * Whitespace is defined as any of the following characters:
9216 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9217 *
9218 * " hello ".strip #=> "hello"
9219 * "\tgoodbye\r\n".strip #=> "goodbye"
9220 * "\x00\t\n\v\f\r ".strip #=> ""
9221 * "hello".strip #=> "hello"
9222 */
9223
9224static VALUE
9225rb_str_strip(VALUE str)
9226{
9227 char *start;
9228 long olen, loffset, roffset;
9229 rb_encoding *enc = STR_ENC_GET(str);
9230
9231 RSTRING_GETMEM(str, start, olen);
9232 loffset = lstrip_offset(str, start, start+olen, enc);
9233 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9234
9235 if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
9236 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9237}
9238
9239static VALUE
9240scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9241{
9242 VALUE result, match;
9243 struct re_registers *regs;
9244 int i;
9245 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9246 if (pos >= 0) {
9247 if (BUILTIN_TYPE(pat) == T_STRING) {
9248 regs = NULL;
9249 end = pos + RSTRING_LEN(pat);
9250 }
9251 else {
9252 match = rb_backref_get();
9253 regs = RMATCH_REGS(match);
9254 pos = BEG(0);
9255 end = END(0);
9256 }
9257 if (pos == end) {
9258 rb_encoding *enc = STR_ENC_GET(str);
9259 /*
9260 * Always consume at least one character of the input string
9261 */
9262 if (RSTRING_LEN(str) > end)
9263 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9264 RSTRING_END(str), enc);
9265 else
9266 *start = end + 1;
9267 }
9268 else {
9269 *start = end;
9270 }
9271 if (!regs || regs->num_regs == 1) {
9272 result = rb_str_subseq(str, pos, end - pos);
9273 return result;
9274 }
9275 result = rb_ary_new2(regs->num_regs);
9276 for (i=1; i < regs->num_regs; i++) {
9277 VALUE s = Qnil;
9278 if (BEG(i) >= 0) {
9279 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9280 }
9281 rb_ary_push(result, s);
9282 }
9283
9284 return result;
9285 }
9286 return Qnil;
9287}
9288
9289
9290/*
9291 * call-seq:
9292 * str.scan(pattern) -> array
9293 * str.scan(pattern) {|match, ...| block } -> str
9294 *
9295 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9296 * Regexp or a String). For each match, a result is
9297 * generated and either added to the result array or passed to the block. If
9298 * the pattern contains no groups, each individual result consists of the
9299 * matched string, <code>$&</code>. If the pattern contains groups, each
9300 * individual result is itself an array containing one entry per group.
9301 *
9302 * a = "cruel world"
9303 * a.scan(/\w+/) #=> ["cruel", "world"]
9304 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9305 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9306 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9307 *
9308 * And the block form:
9309 *
9310 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9311 * print "\n"
9312 * a.scan(/(.)(.)/) {|x,y| print y, x }
9313 * print "\n"
9314 *
9315 * <em>produces:</em>
9316 *
9317 * <<cruel>> <<world>>
9318 * rceu lowlr
9319 */
9320
9321static VALUE
9322rb_str_scan(VALUE str, VALUE pat)
9323{
9324 VALUE result;
9325 long start = 0;
9326 long last = -1, prev = 0;
9327 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9328
9329 pat = get_pat_quoted(pat, 1);
9330 mustnot_broken(str);
9331 if (!rb_block_given_p()) {
9332 VALUE ary = rb_ary_new();
9333
9334 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9335 last = prev;
9336 prev = start;
9337 rb_ary_push(ary, result);
9338 }
9339 if (last >= 0) rb_pat_search(pat, str, last, 1);
9340 else rb_backref_set(Qnil);
9341 return ary;
9342 }
9343
9344 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9345 last = prev;
9346 prev = start;
9347 rb_yield(result);
9348 str_mod_check(str, p, len);
9349 }
9350 if (last >= 0) rb_pat_search(pat, str, last, 1);
9351 return str;
9352}
9353
9354
9355/*
9356 * call-seq:
9357 * str.hex -> integer
9358 *
9359 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9360 * (with an optional sign and an optional <code>0x</code>) and returns the
9361 * corresponding number. Zero is returned on error.
9362 *
9363 * "0x0a".hex #=> 10
9364 * "-1234".hex #=> -4660
9365 * "0".hex #=> 0
9366 * "wombat".hex #=> 0
9367 */
9368
9369static VALUE
9370rb_str_hex(VALUE str)
9371{
9372 return rb_str_to_inum(str, 16, FALSE);
9373}
9374
9375
9376/*
9377 * call-seq:
9378 * str.oct -> integer
9379 *
9380 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9381 * optional sign) and returns the corresponding number. Returns 0 if the
9382 * conversion fails.
9383 *
9384 * "123".oct #=> 83
9385 * "-377".oct #=> -255
9386 * "bad".oct #=> 0
9387 * "0377bad".oct #=> 255
9388 *
9389 * If +str+ starts with <code>0</code>, radix indicators are honored.
9390 * See Kernel#Integer.
9391 */
9392
9393static VALUE
9394rb_str_oct(VALUE str)
9395{
9396 return rb_str_to_inum(str, -8, FALSE);
9397}
9398
9399
9400/*
9401 * call-seq:
9402 * str.crypt(salt_str) -> new_str
9403 *
9404 * Returns the string generated by calling <code>crypt(3)</code>
9405 * standard library function with <code>str</code> and
9406 * <code>salt_str</code>, in this order, as its arguments. Please do
9407 * not use this method any longer. It is legacy; provided only for
9408 * backward compatibility with ruby scripts in earlier days. It is
9409 * bad to use in contemporary programs for several reasons:
9410 *
9411 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
9412 * run. The generated string lacks data portability.
9413 *
9414 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
9415 * (i.e. silently ends up in unexpected results).
9416 *
9417 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
9418 * thread safe.
9419 *
9420 * * So-called "traditional" usage of <code>crypt(3)</code> is very
9421 * very very weak. According to its manpage, Linux's traditional
9422 * <code>crypt(3)</code> output has only 2**56 variations; too
9423 * easy to brute force today. And this is the default behaviour.
9424 *
9425 * * In order to make things robust some OSes implement so-called
9426 * "modular" usage. To go through, you have to do a complex
9427 * build-up of the <code>salt_str</code> parameter, by hand.
9428 * Failure in generation of a proper salt string tends not to
9429 * yield any errors; typos in parameters are normally not
9430 * detectable.
9431 *
9432 * * For instance, in the following example, the second invocation
9433 * of String#crypt is wrong; it has a typo in "round=" (lacks
9434 * "s"). However the call does not fail and something unexpected
9435 * is generated.
9436 *
9437 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
9438 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
9439 *
9440 * * Even in the "modular" mode, some hash functions are considered
9441 * archaic and no longer recommended at all; for instance module
9442 * <code>$1$</code> is officially abandoned by its author: see
9443 * http://phk.freebsd.dk/sagas/md5crypt_eol.html . For another
9444 * instance module <code>$3$</code> is considered completely
9445 * broken: see the manpage of FreeBSD.
9446 *
9447 * * On some OS such as Mac OS, there is no modular mode. Yet, as
9448 * written above, <code>crypt(3)</code> on Mac OS never fails.
9449 * This means even if you build up a proper salt string it
9450 * generates a traditional DES hash anyways, and there is no way
9451 * for you to be aware of.
9452 *
9453 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
9454 *
9455 * If for some reason you cannot migrate to other secure contemporary
9456 * password hashing algorithms, install the string-crypt gem and
9457 * <code>require 'string/crypt'</code> to continue using it.
9458 */
9459
9460static VALUE
9461rb_str_crypt(VALUE str, VALUE salt)
9462{
/* CRYPT_END() releases the crypt_r() work buffer when one was allocated and
 * expands to a no-op when plain crypt() is used */
9463#ifdef HAVE_CRYPT_R
9464 VALUE databuf;
9465 struct crypt_data *data;
9466# define CRYPT_END() ALLOCV_END(databuf)
9467#else
9468 extern char *crypt(const char *, const char *);
9469# define CRYPT_END() (void)0
9470#endif
9471 VALUE result;
9472 const char *s, *saltp;
9473 char *res;
9474#ifdef BROKEN_CRYPT
9475 char salt_8bit_clean[3];
9476#endif
9477
9478 StringValue(salt);
9479 mustnot_wchar(str);
9480 mustnot_wchar(salt);
/* the salt needs at least two bytes ... */
9481 if (RSTRING_LEN(salt) < 2) {
9482 short_salt:
9483 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
9484 }
9485
9486 s = StringValueCStr(str);
9487 saltp = RSTRING_PTR(salt);
/* ... and neither of the first two may be NUL */
9488 if (!saltp[0] || !saltp[1]) goto short_salt;
9489#ifdef BROKEN_CRYPT
/* when either of the first two salt bytes is non-ASCII, use a local
 * two-byte copy with the high bit of both cleared */
9490 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
9491 salt_8bit_clean[0] = saltp[0] & 0x7f;
9492 salt_8bit_clean[1] = saltp[1] & 0x7f;
9493 salt_8bit_clean[2] = '\0';
9494 saltp = salt_8bit_clean;
9495 }
9496#endif
9497#ifdef HAVE_CRYPT_R
9498 data = ALLOCV(databuf, sizeof(struct crypt_data));
9499# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
9500 data->initialized = 0;
9501# endif
9502 res = crypt_r(s, saltp, data);
9503#else
9504 res = crypt(s, saltp);
9505#endif
9506 if (!res) {
/* errno is saved before CRYPT_END() runs and is what gets reported */
9507 int err = errno;
9508 CRYPT_END();
9509 rb_syserr_fail(err, "crypt");
9510 }
/* the result is copied into a Ruby string before the work buffer is
 * released (presumably res points into that buffer in the crypt_r() case) */
9511 result = rb_str_new_cstr(res);
9512 CRYPT_END();
9513 return result;
9514}
9515
9516
9517/*
9518 * call-seq:
9519 * str.ord -> integer
9520 *
9521 * Returns the Integer ordinal of a one-character string.
9522 *
9523 * "a".ord #=> 97
9524 */
9525
9526VALUE
9528{
9529 unsigned int c;
9530
9532 return UINT2NUM(c);
9533}
9534/*
9535 * call-seq:
9536 * str.sum(n=16) -> integer
9537 *
9538 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
9539 * where <em>n</em> is the optional Integer parameter, defaulting
9540 * to 16. The result is simply the sum of the binary value of each byte in
9541 * <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
9542 * checksum.
9543 */
9544
9545static VALUE
9546rb_str_sum(int argc, VALUE *argv, VALUE str)
9547{
9548 int bits = 16;
9549 char *ptr, *p, *pend;
9550 long len;
9551 VALUE sum = INT2FIX(0);
9552 unsigned long sum0 = 0;
9553
9554 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
9555 bits = 0;
9556 }
9557 ptr = p = RSTRING_PTR(str);
9558 len = RSTRING_LEN(str);
9559 pend = p + len;
9560
9561 while (p < pend) {
9562 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
9563 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9564 str_mod_check(str, ptr, len);
9565 sum0 = 0;
9566 }
9567 sum0 += (unsigned char)*p;
9568 p++;
9569 }
9570
9571 if (bits == 0) {
9572 if (sum0) {
9573 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9574 }
9575 }
9576 else {
9577 if (sum == INT2FIX(0)) {
9578 if (bits < (int)sizeof(long)*CHAR_BIT) {
9579 sum0 &= (((unsigned long)1)<<bits)-1;
9580 }
9581 sum = LONG2FIX(sum0);
9582 }
9583 else {
9584 VALUE mod;
9585
9586 if (sum0) {
9587 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9588 }
9589
9590 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
9591 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
9592 sum = rb_funcall(sum, '&', 1, mod);
9593 }
9594 }
9595 return sum;
9596}
9597
/*
 * Shared implementation of ljust, rjust and center.
 *
 * argv[0] is the target width and the optional argv[1] the padding string
 * (a single space when omitted).  jflag selects where the padding goes:
 * 'l' puts all of it after str, 'r' all of it before str, and any other
 * value puts half (rounded down) before and the rest after.
 *
 * Returns rb_str_dup(str) when width is negative or str is already at least
 * that long; otherwise a new string of str's class.  Raises ArgumentError
 * for empty padding ("zero width padding") or when the resulting byte size
 * would overflow ("argument too big").
 */
9598static VALUE
9599rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
9600{
9601 rb_encoding *enc;
9602 VALUE w;
9603 long width, len, flen = 1, fclen = 1;
9604 VALUE res;
9605 char *p;
9606 const char *f = " ";
9607 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
9608 VALUE pad;
9609 int singlebyte = 1, cr;
9610 int termlen;
9611
9612 rb_scan_args(argc, argv, "11", &w, &pad);
9613 enc = STR_ENC_GET(str);
9614 termlen = rb_enc_mbminlen(enc);
9615 width = NUM2LONG(w);
9616 if (argc == 2) {
/* explicit padding: f/flen are its bytes, fclen the value str_strlen()
 * reports for it */
9617 StringValue(pad);
9618 enc = rb_enc_check(str, pad);
9619 f = RSTRING_PTR(pad);
9620 flen = RSTRING_LEN(pad);
9621 fclen = str_strlen(pad, enc); /* rb_enc_check */
9622 singlebyte = single_byte_optimizable(pad);
9623 if (flen == 0 || fclen == 0) {
9624 rb_raise(rb_eArgError, "zero width padding");
9625 }
9626 }
9627 len = str_strlen(str, enc); /* rb_enc_check */
9628 if (width < 0 || len >= width) return rb_str_dup(str);
/* n = total amount of padding, split into a left part and a right part */
9629 n = width - len;
9630 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9631 rlen = n - llen;
9632 cr = ENC_CODERANGE(str);
9633 if (flen > 1) {
/* sizes of the partial pad copies that end each side, for the remainder
 * left after whole repetitions */
9634 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9635 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9636 }
9637 size = RSTRING_LEN(str);
/* compute the byte size of all padding step by step, refusing any step
 * that would reach LONG_MAX */
9638 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9639 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9640 (len += llen2 + rlen2) >= LONG_MAX - size) {
9641 rb_raise(rb_eArgError, "argument too big");
9642 }
9643 len += size;
9644 res = str_new0(rb_obj_class(str), 0, len, termlen);
9645 p = RSTRING_PTR(res);
/* left padding: memset for one-byte padding, otherwise whole copies of the
 * pad followed by the partial copy */
9646 if (flen <= 1) {
9647 memset(p, *f, llen);
9648 p += llen;
9649 }
9650 else {
9651 while (llen >= fclen) {
9652 memcpy(p,f,flen);
9653 p += flen;
9654 llen -= fclen;
9655 }
9656 if (llen > 0) {
9657 memcpy(p, f, llen2);
9658 p += llen2;
9659 }
9660 }
/* the original string */
9661 memcpy(p, RSTRING_PTR(str), size);
9662 p += size;
/* right padding, built the same way as the left */
9663 if (flen <= 1) {
9664 memset(p, *f, rlen);
9665 p += rlen;
9666 }
9667 else {
9668 while (rlen >= fclen) {
9669 memcpy(p,f,flen);
9670 p += flen;
9671 rlen -= fclen;
9672 }
9673 if (rlen > 0) {
9674 memcpy(p, f, rlen2);
9675 p += rlen2;
9676 }
9677 }
9678 TERM_FILL(p, termlen);
9679 STR_SET_LEN(res, p-RSTRING_PTR(res));
9680 rb_enc_associate(res, enc);
/* combine str's code range with the padding's; a broken result is not
 * stored on res */
9681 if (argc == 2)
9682 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
9683 if (cr != ENC_CODERANGE_BROKEN)
9684 ENC_CODERANGE_SET(res, cr);
9685
9686 RB_GC_GUARD(pad);
9687 return res;
9688}
9689
9690
9691/*
9692 * call-seq:
9693 * str.ljust(integer, padstr=' ') -> new_str
9694 *
9695 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9696 * String of length <i>integer</i> with <i>str</i> left justified
9697 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9698 *
9699 * "hello".ljust(4) #=> "hello"
9700 * "hello".ljust(20) #=> "hello "
9701 * "hello".ljust(20, '1234') #=> "hello123412341234123"
9702 */
9703
9704static VALUE
9705rb_str_ljust(int argc, VALUE *argv, VALUE str)
9706{
9707 return rb_str_justify(argc, argv, str, 'l');
9708}
9709
9710
9711/*
9712 * call-seq:
9713 * str.rjust(integer, padstr=' ') -> new_str
9714 *
9715 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9716 * String of length <i>integer</i> with <i>str</i> right justified
9717 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9718 *
9719 * "hello".rjust(4) #=> "hello"
9720 * "hello".rjust(20) #=> " hello"
9721 * "hello".rjust(20, '1234') #=> "123412341234123hello"
9722 */
9723
9724static VALUE
9725rb_str_rjust(int argc, VALUE *argv, VALUE str)
9726{
9727 return rb_str_justify(argc, argv, str, 'r');
9728}
9729
9730
9731/*
9732 * call-seq:
9733 * str.center(width, padstr=' ') -> new_str
9734 *
9735 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
9736 * returns a new String of length +width+ with +str+ centered and padded with
9737 * +padstr+; otherwise, returns +str+.
9738 *
9739 * "hello".center(4) #=> "hello"
9740 * "hello".center(20) #=> " hello "
9741 * "hello".center(20, '123') #=> "1231231hello12312312"
9742 */
9743
9744static VALUE
9745rb_str_center(int argc, VALUE *argv, VALUE str)
9746{
9747 return rb_str_justify(argc, argv, str, 'c');
9748}
9749
9750/*
9751 * call-seq:
9752 * str.partition(sep) -> [head, sep, tail]
9753 * str.partition(regexp) -> [head, match, tail]
9754 *
9755 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
9756 * and returns the part before it, the match, and the part
9757 * after it.
9758 * If it is not found, returns <i>str</i> and two empty strings.
9759 *
9760 * "hello".partition("l") #=> ["he", "l", "lo"]
9761 * "hello".partition("x") #=> ["hello", "", ""]
9762 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
9763 */
9764
9765static VALUE
9766rb_str_partition(VALUE str, VALUE sep)
9767{
9768 long pos;
9769
9770 sep = get_pat_quoted(sep, 0);
9771 if (RB_TYPE_P(sep, T_REGEXP)) {
9772 pos = rb_reg_search(sep, str, 0, 0);
9773 if (pos < 0) {
9774 failed:
9775 return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str));
9776 }
9777 sep = rb_str_subpat(str, sep, INT2FIX(0));
9778 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
9779 }
9780 else {
9781 pos = rb_str_index(str, sep, 0);
9782 if (pos < 0) goto failed;
9783 }
9784 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9785 sep,
9786 rb_str_subseq(str, pos+RSTRING_LEN(sep),
9787 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9788}
9789
9790/*
9791 * call-seq:
9792 * str.rpartition(sep) -> [head, sep, tail]
9793 * str.rpartition(regexp) -> [head, match, tail]
9794 *
9795 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
9796 * of the string, and returns the part before it, the match, and the part
9797 * after it.
9798 * If it is not found, returns two empty strings and <i>str</i>.
9799 *
9800 * "hello".rpartition("l") #=> ["hel", "l", "o"]
9801 * "hello".rpartition("x") #=> ["", "", "hello"]
9802 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
9803 */
9804
9805static VALUE
9806rb_str_rpartition(VALUE str, VALUE sep)
9807{
9808 long pos = RSTRING_LEN(str);
9809 int regex = FALSE;
9810
9811 if (RB_TYPE_P(sep, T_REGEXP)) {
9812 pos = rb_reg_search(sep, str, pos, 1);
9813 regex = TRUE;
9814 }
9815 else {
9816 VALUE tmp;
9817
9818 tmp = rb_check_string_type(sep);
9819 if (NIL_P(tmp)) {
9820 rb_raise(rb_eTypeError, "type mismatch: %s given",
9821 rb_obj_classname(sep));
9822 }
9823 sep = tmp;
9824 pos = rb_str_sublen(str, pos);
9825 pos = rb_str_rindex(str, sep, pos);
9826 }
9827 if (pos < 0) {
9828 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), rb_str_dup(str));
9829 }
9830 if (regex) {
9831 sep = rb_reg_nth_match(0, rb_backref_get());
9832 }
9833 else {
9834 pos = rb_str_offset(str, pos);
9835 }
9836 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9837 sep,
9838 rb_str_subseq(str, pos+RSTRING_LEN(sep),
9839 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9840}
9841
9842/*
9843 * call-seq:
9844 * str.start_with?([prefixes]+) -> true or false
9845 *
9846 * Returns true if +str+ starts with one of the +prefixes+ given.
9847 * Each of the +prefixes+ should be a String or a Regexp.
9848 *
9849 * "hello".start_with?("hell") #=> true
9850 * "hello".start_with?(/H/i) #=> true
9851 *
9852 * # returns true if one of the prefixes matches.
9853 * "hello".start_with?("heaven", "hell") #=> true
9854 * "hello".start_with?("heaven", "paradise") #=> false
9855 */
9856
9857static VALUE
9858rb_str_start_with(int argc, VALUE *argv, VALUE str)
9859{
9860 int i;
9861
9862 for (i=0; i<argc; i++) {
9863 VALUE tmp = argv[i];
9864 if (RB_TYPE_P(tmp, T_REGEXP)) {
9865 if (rb_reg_start_with_p(tmp, str))
9866 return Qtrue;
9867 }
9868 else {
9869 StringValue(tmp);
9870 rb_enc_check(str, tmp);
9871 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9872 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9873 return Qtrue;
9874 }
9875 }
9876 return Qfalse;
9877}
9878
9879/*
9880 * call-seq:
9881 * str.end_with?([suffixes]+) -> true or false
9882 *
9883 * Returns true if +str+ ends with one of the +suffixes+ given.
9884 *
9885 * "hello".end_with?("ello") #=> true
9886 *
9887 * # returns true if one of the +suffixes+ matches.
9888 * "hello".end_with?("heaven", "ello") #=> true
9889 * "hello".end_with?("heaven", "paradise") #=> false
9890 */
9891
9892static VALUE
9893rb_str_end_with(int argc, VALUE *argv, VALUE str)
9894{
9895 int i;
9896 char *p, *s, *e;
9897 rb_encoding *enc;
9898
9899 for (i=0; i<argc; i++) {
9900 VALUE tmp = argv[i];
9901 StringValue(tmp);
9902 enc = rb_enc_check(str, tmp);
9903 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9904 p = RSTRING_PTR(str);
9905 e = p + RSTRING_LEN(str);
9906 s = e - RSTRING_LEN(tmp);
9907 if (rb_enc_left_char_head(p, s, e, enc) != s)
9908 continue;
9909 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9910 return Qtrue;
9911 }
9912 return Qfalse;
9913}
9914
9924static long
9925deleted_prefix_length(VALUE str, VALUE prefix)
9926{
9927 char *strptr, *prefixptr;
9928 long olen, prefixlen;
9929
9930 StringValue(prefix);
9931 if (is_broken_string(prefix)) return 0;
9932 rb_enc_check(str, prefix);
9933
9934 /* return 0 if not start with prefix */
9935 prefixlen = RSTRING_LEN(prefix);
9936 if (prefixlen <= 0) return 0;
9937 olen = RSTRING_LEN(str);
9938 if (olen < prefixlen) return 0;
9939 strptr = RSTRING_PTR(str);
9940 prefixptr = RSTRING_PTR(prefix);
9941 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
9942
9943 return prefixlen;
9944}
9945
9946/*
9947 * call-seq:
9948 * str.delete_prefix!(prefix) -> self or nil
9949 *
9950 * Deletes leading <code>prefix</code> from <i>str</i>, returning
9951 * <code>nil</code> if no change was made.
9952 *
9953 * "hello".delete_prefix!("hel") #=> "lo"
9954 * "hello".delete_prefix!("llo") #=> nil
9955 */
9956
9957static VALUE
9958rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
9959{
9960 long prefixlen;
9961 str_modify_keep_cr(str);
9962
9963 prefixlen = deleted_prefix_length(str, prefix);
9964 if (prefixlen <= 0) return Qnil;
9965
9966 return rb_str_drop_bytes(str, prefixlen);
9967}
9968
9969/*
9970 * call-seq:
9971 * str.delete_prefix(prefix) -> new_str
9972 *
9973 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
9974 *
9975 * "hello".delete_prefix("hel") #=> "lo"
9976 * "hello".delete_prefix("llo") #=> "hello"
9977 */
9978
9979static VALUE
9980rb_str_delete_prefix(VALUE str, VALUE prefix)
9981{
9982 long prefixlen;
9983
9984 prefixlen = deleted_prefix_length(str, prefix);
9985 if (prefixlen <= 0) return rb_str_dup(str);
9986
9987 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
9988}
9989
9999static long
10000deleted_suffix_length(VALUE str, VALUE suffix)
10001{
10002 char *strptr, *suffixptr, *s;
10003 long olen, suffixlen;
10004 rb_encoding *enc;
10005
10006 StringValue(suffix);
10007 if (is_broken_string(suffix)) return 0;
10008 enc = rb_enc_check(str, suffix);
10009
10010 /* return 0 if not start with suffix */
10011 suffixlen = RSTRING_LEN(suffix);
10012 if (suffixlen <= 0) return 0;
10013 olen = RSTRING_LEN(str);
10014 if (olen < suffixlen) return 0;
10015 strptr = RSTRING_PTR(str);
10016 suffixptr = RSTRING_PTR(suffix);
10017 s = strptr + olen - suffixlen;
10018 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10019 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10020
10021 return suffixlen;
10022}
10023
10024/*
10025 * call-seq:
10026 * str.delete_suffix!(suffix) -> self or nil
10027 *
10028 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10029 * <code>nil</code> if no change was made.
10030 *
10031 * "hello".delete_suffix!("llo") #=> "he"
10032 * "hello".delete_suffix!("hel") #=> nil
10033 */
10034
10035static VALUE
10036rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10037{
10038 long olen, suffixlen, len;
10039 str_modifiable(str);
10040
10041 suffixlen = deleted_suffix_length(str, suffix);
10042 if (suffixlen <= 0) return Qnil;
10043
10044 olen = RSTRING_LEN(str);
10045 str_modify_keep_cr(str);
10046 len = olen - suffixlen;
10051 }
10052 return str;
10053}
10054
10055/*
10056 * call-seq:
10057 * str.delete_suffix(suffix) -> new_str
10058 *
10059 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10060 *
10061 * "hello".delete_suffix("llo") #=> "he"
10062 * "hello".delete_suffix("hel") #=> "hello"
10063 */
10064
10065static VALUE
10066rb_str_delete_suffix(VALUE str, VALUE suffix)
10067{
10068 long suffixlen;
10069
10070 suffixlen = deleted_suffix_length(str, suffix);
10071 if (suffixlen <= 0) return rb_str_dup(str);
10072
10073 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10074}
10075
/*
 * Setter hook that stores val into *var after checking that it is either
 * nil or a String; any other value raises TypeError, naming the variable
 * through rb_id2str(id).
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses val, id and var.
 */
10076void
10078{
10079 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10080 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10081 }
10082 *var = val;
10083}
10084
/*
 * Setter hook for the variable identified by id (the deprecation message
 * below names it as `$;').  The value is passed through rb_fs_check();
 * a false result is rejected with the "must be String or Regexp" message,
 * and any non-nil value triggers a deprecation warning before being stored
 * in *var.
 */
10085static void
10086rb_fs_setter(VALUE val, ID id, VALUE *var)
10087{
10088 val = rb_fs_check(val);
10089 if (!val) {
    /* NOTE(review): the opening of the raise call (function name and
     * exception class) is missing from this listing; only its format
     * string and argument remain below. */
10091 "value of %"PRIsVALUE" must be String or Regexp",
10092 rb_id2str(id));
10093 }
    /* Assigning nil is silent; anything else warns that `$;' is deprecated. */
10094 if (!NIL_P(val)) {
10095 rb_warn_deprecated("`$;'", NULL);
10096 }
10097 *var = val;
10098}
10099
10100
10101/*
10102 * call-seq:
10103 * str.force_encoding(encoding) -> str
10104 *
10105 * Changes the encoding to +encoding+ and returns self.
10106 */
10107
10108static VALUE
10109rb_str_force_encoding(VALUE str, VALUE enc)
10110{
    /* The receiver is checked with str_modifiable() first, so the
     * encoding change is treated as a modification of str. */
10111 str_modifiable(str);
    /* NOTE(review): the statements that actually apply enc to str are
     * missing from this listing; only the modifiability check and the
     * return of the receiver are visible. */
10114 return str;
10115}
10116
10117/*
10118 * call-seq:
10119 * str.b -> str
10120 *
10121 * Returns a copied string whose encoding is ASCII-8BIT.
10122 */
10123
10124static VALUE
10125rb_str_b(VALUE str)
10126{
10127 VALUE str2 = str_alloc(rb_cString);
10128 str_replace_shared_without_enc(str2, str);
10129 ENC_CODERANGE_CLEAR(str2);
10130 return str2;
10131}
10132
10133/*
10134 * call-seq:
10135 * str.valid_encoding? -> true or false
10136 *
10137 * Returns true for a string which is encoded correctly.
10138 *
10139 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10140 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10141 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10142 */
10143
10144static VALUE
10145rb_str_valid_encoding_p(VALUE str)
10146{
10147 int cr = rb_enc_str_coderange(str);
10148
10149 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
10150}
10151
10152/*
10153 * call-seq:
10154 * str.ascii_only? -> true or false
10155 *
10156 * Returns true for a string which has only ASCII characters.
10157 *
10158 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10159 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10160 */
10161
10162static VALUE
10163rb_str_is_ascii_only_p(VALUE str)
10164{
10165 int cr = rb_enc_str_coderange(str);
10166
10167 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
10168}
10169
/*
 * Shortens str to at most len characters, ending the result with "..."
 * when something had to be cut off.
 *
 * - Raises IndexError for a negative len.
 * - Returns str itself when it already fits in len characters.
 * - When len is no larger than the ellipsis, or stepping back from the cut
 *   point fails, the result is just the first len bytes of "...".
 * - Otherwise the result is the leading part of str followed by "...".
 * For encodings that are not ASCII compatible the ellipsis is transcoded
 * to enc via rb_str_encode() before use.
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses str and len.
 */
10184VALUE
10186{
10187 static const char ellipsis[] = "...";
10188 const long ellipsislen = sizeof(ellipsis) - 1;
10189 rb_encoding *const enc = rb_enc_get(str);
10190 const long blen = RSTRING_LEN(str);
10191 const char *const p = RSTRING_PTR(str), *e = p + blen;
10192 VALUE estr, ret = 0;
10193
10194 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
    /* Cheap test first: len characters need at least len * mbminlen bytes,
     * so a string no longer than that cannot exceed len characters.
     * Otherwise locate the byte offset of character len; landing on the
     * end of the string also means nothing has to be removed. */
10195 if (len * rb_enc_mbminlen(enc) >= blen ||
10196 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10197 ret = str;
10198 }
    /* Note the assignment inside the call: when len > ellipsislen, len is
     * overwritten with ellipsislen while e is moved back by that many
     * characters to make room for the ellipsis. */
10199 else if (len <= ellipsislen ||
10200 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10201 if (rb_enc_asciicompat(enc)) {
10202 ret = rb_str_new_with_class(str, ellipsis, len);
10203 rb_enc_associate(ret, enc);
10204 }
10205 else {
10206 estr = rb_usascii_str_new(ellipsis, len);
10207 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10208 }
10209 }
    /* Comma expression: ret is first set to the kept prefix of str, then
     * the ASCII-compatibility test selects how the ellipsis is appended. */
10210 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10211 rb_str_cat(ret, ellipsis, ellipsislen);
10212 }
10213 else {
10214 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10215 rb_enc_from_encoding(enc), 0, Qnil);
10216 rb_str_append(ret, estr);
10217 }
10218 return ret;
10219}
10220
/*
 * Validates a replacement string for use with encoding enc and returns it
 * (after conversion through StringValue()).
 *
 * Raises ArgumentError when the replacement's coderange is BROKEN, and
 * Encoding::CompatibilityError when it cannot be combined with enc: a
 * 7-bit replacement is rejected only if enc's minimum character length is
 * not 1, any other replacement must have exactly the encoding enc.
 *
 * NOTE(review): the assignment of cr and the declaration of e are on lines
 * missing from this listing; e is presumably the replacement's own
 * encoding -- confirm against the complete source.
 */
10221static VALUE
10222str_compat_and_valid(VALUE str, rb_encoding *enc)
10223{
10224 int cr;
10225 str = StringValue(str);
10227 if (cr == ENC_CODERANGE_BROKEN) {
10228 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10229 }
10230 else {
10232 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10233 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10234 rb_enc_name(enc), rb_enc_name(e));
10235 }
10236 }
10237 return str;
10238}
10239
/* Forward declaration: the callers just below use enc_str_scrub() before
 * its definition further down in this file. */
10240static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10241
/*
 * Scrubs str under its own encoding: looks the encoding up with
 * STR_ENC_GET() and forwards to enc_str_scrub() together with the
 * coderange currently cached on str.  The result is whatever
 * enc_str_scrub() returns.
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses str and repl.
 */
10247VALUE
10249{
10250 rb_encoding *enc = STR_ENC_GET(str);
10251 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10252}
10253
/*
 * Scrubs str as if it were encoded in enc, which may differ from the
 * encoding recorded on str.  The cached coderange is passed on only when
 * enc is str's actual encoding; otherwise enc_str_scrub() is called with
 * ENC_CODERANGE_UNKNOWN.
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses enc, str and repl.
 */
10254VALUE
10256{
10257 int cr = ENC_CODERANGE_UNKNOWN;
10258 if (enc == STR_ENC_GET(str)) {
10259 /* cached coderange makes sense only when enc equals the
10260 * actual encoding of str */
10261 cr = ENC_CODERANGE(str);
10262 }
10263 return enc_str_scrub(enc, str, repl, cr);
10264}
10265
/*
 * Core of String#scrub: walks the bytes of str as text in encoding enc and
 * builds a copy in which every invalid byte run is replaced.
 *
 *   enc   encoding used to interpret str
 *   str   string to examine
 *   repl  replacement string, or nil to use the block / a built-in default
 *   cr    coderange known for str under enc
 *
 * Returns Qnil when no replacement is performed (cr is already clean, enc
 * is a dummy encoding, or the scan reaches the end without finding an
 * invalid byte); otherwise returns the buffer buf.
 *
 * Raises ArgumentError when both a block and repl are given; repl and
 * block results are validated by str_compat_and_valid().
 *
 * NOTE(review): this listing omits a number of statements of the function
 * (buf is never assigned in the visible lines, and several branch bodies
 * are empty or truncated).  Comments below describe only what is visible;
 * check the complete source before relying on them.
 */
10266static VALUE
10267enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10268{
10269 int encidx;
10270 VALUE buf = Qnil;
10271 const char *rep, *p, *e, *p1, *sp;
    /* replen doubles as a mode flag: it starts at -1, and 0 means "a block
     * was given", which the branches below turn into rep = NULL. */
10272 long replen = -1;
10273 long slen;
10274
10275 if (rb_block_given_p()) {
10276 if (!NIL_P(repl))
10277 rb_raise(rb_eArgError, "both of block and replacement given");
10278 replen = 0;
10279 }
10280
    /* A clean coderange means there is nothing to replace. */
10281 if (ENC_CODERANGE_CLEAN_P(cr))
10282 return Qnil;
10283
10284 if (!NIL_P(repl)) {
10285 repl = str_compat_and_valid(repl, enc);
10286 }
10287
    /* Dummy encodings are never scrubbed. */
10288 if (rb_enc_dummy_p(enc)) {
10289 return Qnil;
10290 }
10291 encidx = rb_enc_to_index(enc);
10292
    /* Helper macro: points rep/replen at a static byte array holding the
     * given literal without its terminating NUL (hence sizeof(str)-1). */
10293#define DEFAULT_REPLACE_CHAR(str) do { \
10294 static const char replace[sizeof(str)-1] = str; \
10295 rep = replace; replen = (int)sizeof(replace); \
10296 } while (0)
10297
    /* p is the scan cursor, e the end of the data, p1 the start of the
     * bytes not yet copied to buf.  sp and slen record the original
     * pointer and length for the str_mod_check() calls after each yield. */
10298 slen = RSTRING_LEN(str);
10299 p = RSTRING_PTR(str);
10300 e = RSTRING_END(str);
10301 p1 = p;
10302 sp = p;
10303
10304 if (rb_enc_asciicompat(enc)) {
    /* rep7bit_p: whether the replacement is pure 7-bit; a replacement
     * that is not raises the result's coderange from 7BIT to VALID. */
10305 int rep7bit_p;
10306 if (!replen) {
10307 rep = NULL;
10308 rep7bit_p = FALSE;
10309 }
10310 else if (!NIL_P(repl)) {
10311 rep = RSTRING_PTR(repl);
10312 replen = RSTRING_LEN(repl);
10313 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10314 }
10315 else if (encidx == rb_utf8_encindex()) {
    /* UTF-8 byte sequence EF BF BD. */
10316 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10317 rep7bit_p = FALSE;
10318 }
10319 else {
    /* NOTE(review): the default replacement for other ASCII-compatible
     * encodings is set on a line missing from this listing. */
10321 rep7bit_p = TRUE;
10322 }
10323 cr = ENC_CODERANGE_7BIT;
10324
    /* Jump straight to the first non-ASCII byte; a NULL result moves p
     * to e so the loop below is skipped. */
10325 p = search_nonascii(p, e);
10326 if (!p) {
10327 p = e;
10328 }
10329 while (p < e) {
10330 int ret = rb_enc_precise_mbclen(p, e, enc);
10331 if (MBCLEN_NEEDMORE_P(ret)) {
    /* Truncated character at the end: leave the loop with p < e so the
     * tail is handled after it. */
10332 break;
10333 }
10334 else if (MBCLEN_CHARFOUND_P(ret)) {
10336 p += MBCLEN_CHARFOUND_LEN(ret);
10337 }
10338 else if (MBCLEN_INVALID_P(ret)) {
10339 /*
10340 * p1~p: valid ascii/multibyte chars
10341 * p ~e: invalid bytes + unknown bytes
10342 */
10343 long clen = rb_enc_mbmaxlen(enc);
10345 if (p > p1) {
10346 rb_str_buf_cat(buf, p1, p - p1);
10347 }
10348
    /* Decide how many bytes (clen) form this invalid run: never more than
     * what is left, a single byte when at most two candidates remain,
     * otherwise probe shorter and shorter prefixes. */
10349 if (e - p < clen) clen = e - p;
10350 if (clen <= 2) {
10351 clen = 1;
10352 }
10353 else {
10354 const char *q = p;
10355 clen--;
10356 for (; clen > 1; clen--) {
10357 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10358 if (MBCLEN_NEEDMORE_P(ret)) break;
10359 if (MBCLEN_INVALID_P(ret)) continue;
10361 }
10362 }
10363 if (rep) {
10364 rb_str_buf_cat(buf, rep, replen);
10365 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10366 }
10367 else {
    /* Block form: yield the invalid bytes, then validate the block's
     * result.  str_mod_check() is given the pointer and length saved
     * before the scan (presumably to detect the block changing str --
     * TODO confirm). */
10368 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10369 str_mod_check(str, sp, slen);
10370 repl = str_compat_and_valid(repl, enc);
10374 }
10375 p += clen;
10376 p1 = p;
10377 p = search_nonascii(p, e);
10378 if (!p) {
10379 p = e;
10380 break;
10381 }
10382 }
10383 else {
10385 }
10386 }
    /* buf still nil here means no invalid byte was met inside the loop. */
10387 if (NIL_P(buf)) {
10388 if (p == e) {
10390 return Qnil;
10391 }
10393 }
    /* Copy the trailing run of valid bytes. */
10394 if (p1 < p) {
10395 rb_str_buf_cat(buf, p1, p - p1);
10396 }
    /* p < e only after the NEEDMORE break: the incomplete tail is
     * replaced as a single unit. */
10397 if (p < e) {
10398 if (rep) {
10399 rb_str_buf_cat(buf, rep, replen);
10400 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10401 }
10402 else {
10403 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10404 str_mod_check(str, sp, slen);
10405 repl = str_compat_and_valid(repl, enc);
10409 }
10410 }
10411 }
10412 else {
10413 /* ASCII incompatible */
10414 long mbminlen = rb_enc_mbminlen(enc);
10415 if (!replen) {
10416 rep = NULL;
10417 }
10418 else if (!NIL_P(repl)) {
10419 rep = RSTRING_PTR(repl);
10420 replen = RSTRING_LEN(repl);
10421 }
    /* Built-in defaults per encoding index: the same FF FD pair in each
     * encoding's byte order and width. */
10422 else if (encidx == ENCINDEX_UTF_16BE) {
10423 DEFAULT_REPLACE_CHAR("\xFF\xFD");
10424 }
10425 else if (encidx == ENCINDEX_UTF_16LE) {
10426 DEFAULT_REPLACE_CHAR("\xFD\xFF");
10427 }
10428 else if (encidx == ENCINDEX_UTF_32BE) {
10429 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
10430 }
10431 else if (encidx == ENCINDEX_UTF_32LE) {
10432 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
10433 }
10434 else {
10436 }
10437
    /* Same scan as the ASCII-compatible branch, except that invalid runs
     * are sized in multiples of mbminlen and there is no non-ASCII
     * fast-forward. */
10438 while (p < e) {
10439 int ret = rb_enc_precise_mbclen(p, e, enc);
10440 if (MBCLEN_NEEDMORE_P(ret)) {
10441 break;
10442 }
10443 else if (MBCLEN_CHARFOUND_P(ret)) {
10444 p += MBCLEN_CHARFOUND_LEN(ret);
10445 }
10446 else if (MBCLEN_INVALID_P(ret)) {
10447 const char *q = p;
10448 long clen = rb_enc_mbmaxlen(enc);
10450 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
10451
10452 if (e - p < clen) clen = e - p;
10453 if (clen <= mbminlen * 2) {
10454 clen = mbminlen;
10455 }
10456 else {
10457 clen -= mbminlen;
10458 for (; clen > mbminlen; clen-=mbminlen) {
10459 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10460 if (MBCLEN_NEEDMORE_P(ret)) break;
10461 if (MBCLEN_INVALID_P(ret)) continue;
10463 }
10464 }
10465 if (rep) {
10466 rb_str_buf_cat(buf, rep, replen);
10467 }
10468 else {
10469 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10470 str_mod_check(str, sp, slen);
10471 repl = str_compat_and_valid(repl, enc);
10473 }
10474 p += clen;
10475 p1 = p;
10476 }
10477 else {
10479 }
10480 }
10481 if (NIL_P(buf)) {
10482 if (p == e) {
10484 return Qnil;
10485 }
10487 }
10488 if (p1 < p) {
10489 rb_str_buf_cat(buf, p1, p - p1);
10490 }
10491 if (p < e) {
10492 if (rep) {
10493 rb_str_buf_cat(buf, rep, replen);
10494 }
10495 else {
10496 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10497 str_mod_check(str, sp, slen);
10498 repl = str_compat_and_valid(repl, enc);
10500 }
10501 }
10503 }
10505 return buf;
10506}
10507
10508/*
10509 * call-seq:
10510 * str.scrub -> new_str
10511 * str.scrub(repl) -> new_str
10512 * str.scrub{|bytes|} -> new_str
10513 *
10514 * If the string contains an invalid byte sequence, replaces the invalid bytes with the given
10515 * replacement character; otherwise returns a copy of the string.
10516 * If block is given, replace invalid bytes with returned value of the block.
10517 *
10518 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
10519 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
10520 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10521 */
10522static VALUE
10523str_scrub(int argc, VALUE *argv, VALUE str)
10524{
10525 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10526 VALUE new = rb_str_scrub(str, repl);
10527 return NIL_P(new) ? rb_str_dup(str): new;
10528}
10529
10530/*
10531 * call-seq:
10532 * str.scrub! -> str
10533 * str.scrub!(repl) -> str
10534 * str.scrub!{|bytes|} -> str
10535 *
10536 * If the string contains an invalid byte sequence, replaces the invalid bytes with the given
10537 * replacement character in place. Always returns self.
10538 * If block is given, replace invalid bytes with returned value of the block.
10539 *
10540 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
10541 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
10542 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10543 */
10544static VALUE
10545str_scrub_bang(int argc, VALUE *argv, VALUE str)
10546{
10547 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10548 VALUE new = rb_str_scrub(str, repl);
10549 if (!NIL_P(new)) rb_str_replace(str, new);
10550 return str;
10551}
10552
/* File-scope state used by the unicode_normalize* functions below: the two
 * method IDs passed to unicode_normalize_common(), and the receiver of its
 * rb_funcallv() call.  They are not assigned anywhere in this part of the
 * file -- presumably set during initialisation; TODO confirm. */
10553static ID id_normalize;
10554static ID id_normalized_p;
10555static VALUE mUnicodeNormalize;
10556
10557static VALUE
10558unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
10559{
10560 static int UnicodeNormalizeRequired = 0;
10561 VALUE argv2[2];
10562
10563 if (!UnicodeNormalizeRequired) {
10564 rb_require("unicode_normalize/normalize.rb");
10565 UnicodeNormalizeRequired = 1;
10566 }
10567 argv2[0] = str;
10568 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
10569 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
10570}
10571
10572/*
10573 * call-seq:
10574 * str.unicode_normalize(form=:nfc)
10575 *
10576 * Unicode Normalization---Returns a normalized form of +str+,
10577 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
10578 * The normalization form used is determined by +form+, which can
10579 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10580 * The default is +:nfc+.
10581 *
10582 * If the string is not in a Unicode Encoding, then an Exception is raised.
10583 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
10584 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
10585 * Anything other than UTF-8 is implemented by converting to UTF-8,
10586 * which makes it slower than UTF-8.
10587 *
10588 * "a\u0300".unicode_normalize #=> "\u00E0"
10589 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
10590 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
10591 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
10592 * #=> Encoding::CompatibilityError raised
10593 */
10594static VALUE
10595rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
10596{
10597 return unicode_normalize_common(argc, argv, str, id_normalize);
10598}
10599
10600/*
10601 * call-seq:
10602 * str.unicode_normalize!(form=:nfc)
10603 *
10604 * Destructive version of String#unicode_normalize, doing Unicode
10605 * normalization in place.
10606 */
10607static VALUE
10608rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10609{
10610 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10611}
10612
10613/* call-seq:
10614 * str.unicode_normalized?(form=:nfc)
10615 *
10616 * Checks whether +str+ is in Unicode normalization form +form+,
10617 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10618 * The default is +:nfc+.
10619 *
10620 * If the string is not in a Unicode Encoding, then an Exception is raised.
10621 * For details, see String#unicode_normalize.
10622 *
10623 * "a\u0300".unicode_normalized? #=> false
10624 * "a\u0300".unicode_normalized?(:nfd) #=> true
10625 * "\u00E0".unicode_normalized? #=> true
10626 * "\u00E0".unicode_normalized?(:nfd) #=> false
10627 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10628 * #=> Encoding::CompatibilityError raised
10629 */
10630static VALUE
10631rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10632{
10633 return unicode_normalize_common(argc, argv, str, id_normalized_p);
10634}
10635
10636/**********************************************************************
10637 * Document-class: Symbol
10638 *
10639 * Symbol objects represent names inside the Ruby interpreter. They
10640 * are generated using the <code>:name</code> and
10641 * <code>:"string"</code> literals syntax, and by the various
10642 * <code>to_sym</code> methods. The same Symbol object will be
10643 * created for a given name or string for the duration of a program's
10644 * execution, regardless of the context or meaning of that name. Thus
10645 * if <code>Fred</code> is a constant in one context, a method in
10646 * another, and a class in a third, the Symbol <code>:Fred</code>
10647 * will be the same object in all three contexts.
10648 *
10649 * module One
10650 * class Fred
10651 * end
10652 * $f1 = :Fred
10653 * end
10654 * module Two
10655 * Fred = 1
10656 * $f2 = :Fred
10657 * end
10658 * def Fred()
10659 * end
10660 * $f3 = :Fred
10661 * $f1.object_id #=> 2514190
10662 * $f2.object_id #=> 2514190
10663 * $f3.object_id #=> 2514190
10664 *
10665 */
10666
10667
10668/*
10669 * call-seq:
10670 * sym == obj -> true or false
10671 *
10672 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
10673 * symbol, returns <code>true</code>.
10674 */
10675
/* Symbol#== needs no code of its own: sym_equal is simply another name
 * for rb_obj_equal. */
10676#define sym_equal rb_obj_equal
10677
/*
 * Returns TRUE when every character in [s, send) is a complete character
 * of encoding enc and satisfies rb_enc_isprint(); returns FALSE at the
 * first malformed or non-printable character.  An empty range yields TRUE.
 */
10678static int
10679sym_printable(const char *s, const char *send, rb_encoding *enc)
10680{
10681 while (s < send) {
10682 int n;
10683 int c = rb_enc_precise_mbclen(s, send, enc);
10684
10685 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
    /* NOTE(review): the assignment of n (used to advance s below) is on a
     * line missing from this listing -- presumably the byte length of the
     * character just found; confirm against the complete source. */
    /* c is reused: first the mbclen result, now the code point. */
10687 c = rb_enc_mbc_to_codepoint(s, send, enc);
10688 if (!rb_enc_isprint(c, enc)) return FALSE;
10689 s += n;
10690 }
10691 return TRUE;
10692}
10693
/*
 * Decides whether the string sym can be shown as a plain symbol name.
 * Returns FALSE when any of the following holds, TRUE otherwise:
 *   - its encoding differs from resenc and it is not ASCII-only,
 *   - it contains an embedded NUL (strlen() disagrees with its length),
 *   - rb_enc_symname2_p() rejects it,
 *   - sym_printable() rejects it.
 *
 * NOTE(review): the line holding the function name and parameter list, and
 * the declaration/initial value of resenc, are missing from this listing.
 */
10694int
10696{
10697 rb_encoding *enc;
10698 const char *ptr;
10699 long len;
10701
    /* Fall back to the default external encoding when resenc is NULL. */
10702 if (resenc == NULL) resenc = rb_default_external_encoding();
10703 enc = STR_ENC_GET(sym);
10704 ptr = RSTRING_PTR(sym);
10705 len = RSTRING_LEN(sym);
10706 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
10707 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
10708 return FALSE;
10709 }
10710 return TRUE;
10711}
10712
/*
 * Returns str unchanged when it is safe to show as is, and
 * rb_str_inspect(str) otherwise.  "Unsafe" means its encoding differs from
 * resenc while it is not ASCII-only, or sym_printable() rejects it.
 *
 * NOTE(review): the line holding the function name and parameter list, and
 * the statements that first assign resenc, are missing from this listing.
 */
10713VALUE
10715{
10716 rb_encoding *enc;
10717 const char *ptr;
10718 long len;
10719 rb_encoding *resenc;
10720
    /* Fall back to the default external encoding when resenc is NULL. */
10723 if (resenc == NULL) resenc = rb_default_external_encoding();
10724 enc = STR_ENC_GET(str);
10725 ptr = RSTRING_PTR(str);
10726 len = RSTRING_LEN(str);
10727 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
10728 !sym_printable(ptr, ptr + len, enc)) {
10729 return rb_str_inspect(str);
10730 }
10731 return str;
10732}
10733
10736{
    /* Looks up the name string of id and returns it as is when
     * rb_str_symname_p() accepts it; otherwise returns the string's
     * inspect form.
     * NOTE(review): the return type and the line holding the function name
     * and parameter list are missing from this listing; the body uses id. */
10737 VALUE str = rb_id2str(id);
10738 if (!rb_str_symname_p(str)) {
10739 return rb_str_inspect(str);
10740 }
10741 return str;
10742}
10743
10744/*
10745 * call-seq:
10746 * sym.inspect -> string
10747 *
10748 * Returns the representation of <i>sym</i> as a symbol literal.
10749 *
10750 * :fred.inspect #=> ":fred"
10751 */
10752
10753static VALUE
10754sym_inspect(VALUE sym)
10755{
    /* NOTE(review): the declaration of str, and the statements that give
     * str (first branch) and ptr/len (second branch) their values, are on
     * lines missing from this listing. */
10757 const char *ptr;
10758 long len;
10759 char *dest;
10760
    /* Both branches produce a buffer one byte longer than the name, with
     * the name starting at offset 1; the leading ':' is written last. */
10761 if (!rb_str_symname_p(str)) {
    /* Not a plain symbol name: grow str by one byte and shift its current
     * contents right.  memmove is used because source and destination
     * overlap within the same buffer. */
10763 len = RSTRING_LEN(str);
10764 rb_str_resize(str, len + 1);
10765 dest = RSTRING_PTR(str);
10766 memmove(dest + 1, dest, len);
10767 }
10768 else {
    /* Plain symbol name: allocate a new string of len + 1 bytes in the
     * same encoding and copy the name in after the first byte. */
10769 rb_encoding *enc = STR_ENC_GET(str);
10771 str = rb_enc_str_new(0, len + 1, enc);
10772 dest = RSTRING_PTR(str);
10773 memcpy(dest + 1, ptr, len);
10774 }
10775 dest[0] = ':';
10776 return str;
10777}
10778
10779
10780/*
10781 * call-seq:
10782 * sym.id2name -> string
10783 * sym.to_s -> string
10784 *
10785 * Returns the name or string corresponding to <i>sym</i>.
10786 *
10787 * :fred.id2name #=> "fred"
10788 * :ginger.to_s #=> "ginger"
10789 */
10790
10791
10792VALUE
10794{
    /* Builds an rb_cString object with str_new_shared() from the symbol's
     * name string.
     * NOTE(review): the line holding the function name and parameter list
     * is missing from this listing; the body uses sym. */
10795 return str_new_shared(rb_cString, rb_sym2str(sym));
10796}
10797
10798
10799/*
10800 * call-seq:
10801 * sym.to_sym -> sym
10802 * sym.intern -> sym
10803 *
10804 * In general, <code>to_sym</code> returns the Symbol corresponding
10805 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
10806 * in this case.
10807 */
10808
10809static VALUE
10810sym_to_sym(VALUE sym)
10811{
    /* Identity: the receiver is already a symbol, so it is returned as is. */
10812 return sym;
10813}
10814
/*
 * Calls method mid on the first element of argv, passing the remaining
 * elements as arguments together with passed_proc as the block and the
 * kw_splat flag.  Raises ArgumentError ("no receiver given") when argv is
 * empty.
 *
 * NOTE(review): the return-type line preceding the function name is
 * missing from this listing.
 */
10816rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
10817{
10818 VALUE obj;
10819
10820 if (argc < 1) {
10821 rb_raise(rb_eArgError, "no receiver given");
10822 }
    /* argv[0] is the receiver; everything after it is forwarded. */
10823 obj = argv[0];
10824 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
10825}
10826
10827#if 0
/* Everything down to the matching #endif is compiled out.  The region
 * holds the documentation comment for Symbol#to_proc and an empty
 * function body.
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing. */
10828/*
10829 * call-seq:
10830 * sym.to_proc
10831 *
10832 * Returns a _Proc_ object which responds to the given method by _sym_.
10833 *
10834 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
10835 */
10836
10837VALUE
10839{
10840}
10841#endif
10842
10843/*
10844 * call-seq:
10845 *
10846 * sym.succ
10847 *
10848 * Same as <code>sym.to_s.succ.intern</code>.
10849 */
10850
10851static VALUE
10852sym_succ(VALUE sym)
10853{
    /* NOTE(review): the single statement of this function is on a line
     * missing from this listing. */
10855}
10856
10857/*
10858 * call-seq:
10859 *
10860 * symbol <=> other_symbol -> -1, 0, +1, or nil
10861 *
10862 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
10863 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
10864 * less than, equal to, or greater than +other_symbol+.
10865 *
10866 * +nil+ is returned if the two values are incomparable.
10867 *
10868 * See String#<=> for more information.
10869 */
10870
10871static VALUE
10872sym_cmp(VALUE sym, VALUE other)
10873{
10874 if (!SYMBOL_P(other)) {
10875 return Qnil;
10876 }
10877 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
10878}
10879
10880/*
10881 * call-seq:
10882 * sym.casecmp(other_symbol) -> -1, 0, +1, or nil
10883 *
10884 * Case-insensitive version of Symbol#<=>.
10885 * Currently, case-insensitivity only works on characters A-Z/a-z,
10886 * not all of Unicode. This is different from Symbol#casecmp?.
10887 *
10888 * :aBcDeF.casecmp(:abcde) #=> 1
10889 * :aBcDeF.casecmp(:abcdef) #=> 0
10890 * :aBcDeF.casecmp(:abcdefg) #=> -1
10891 * :abcdef.casecmp(:ABCDEF) #=> 0
10892 *
10893 * +nil+ is returned if the two symbols have incompatible encodings,
10894 * or if +other_symbol+ is not a symbol.
10895 *
10896 * :foo.casecmp(2) #=> nil
10897 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil
10898 */
10899
10900static VALUE
10901sym_casecmp(VALUE sym, VALUE other)
10902{
10903 if (!SYMBOL_P(other)) {
10904 return Qnil;
10905 }
10906 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
10907}
10908
10909/*
10910 * call-seq:
10911 * sym.casecmp?(other_symbol) -> true, false, or nil
10912 *
10913 * Returns +true+ if +sym+ and +other_symbol+ are equal after
10914 * Unicode case folding, +false+ if they are not equal.
10915 *
10916 * :aBcDeF.casecmp?(:abcde) #=> false
10917 * :aBcDeF.casecmp?(:abcdef) #=> true
10918 * :aBcDeF.casecmp?(:abcdefg) #=> false
10919 * :abcdef.casecmp?(:ABCDEF) #=> true
10920 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
10921 *
10922 * +nil+ is returned if the two symbols have incompatible encodings,
10923 * or if +other_symbol+ is not a symbol.
10924 *
10925 * :foo.casecmp?(2) #=> nil
10926 * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil
10927 */
10928
10929static VALUE
10930sym_casecmp_p(VALUE sym, VALUE other)
10931{
10932 if (!SYMBOL_P(other)) {
10933 return Qnil;
10934 }
10935 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
10936}
10937
10938/*
10939 * call-seq:
10940 * sym =~ obj -> integer or nil
10941 *
10942 * Returns <code>sym.to_s =~ obj</code>.
10943 */
10944
10945static VALUE
10946sym_match(VALUE sym, VALUE other)
10947{
10948 return rb_str_match(rb_sym2str(sym), other);
10949}
10950
10951/*
10952 * call-seq:
10953 * sym.match(pattern) -> matchdata or nil
10954 * sym.match(pattern, pos) -> matchdata or nil
10955 *
10956 * Returns <code>sym.to_s.match</code>.
10957 */
10958
10959static VALUE
10960sym_match_m(int argc, VALUE *argv, VALUE sym)
10961{
10962 return rb_str_match_m(argc, argv, rb_sym2str(sym));
10963}
10964
10965/*
10966 * call-seq:
10967 * sym.match?(pattern) -> true or false
10968 * sym.match?(pattern, pos) -> true or false
10969 *
10970 * Returns <code>sym.to_s.match?</code>.
10971 */
10972
10973static VALUE
10974sym_match_m_p(int argc, VALUE *argv, VALUE sym)
10975{
10976 return rb_str_match_m_p(argc, argv, sym);
10977}
10978
10979/*
10980 * call-seq:
10981 * sym[idx] -> char
10982 * sym[b, n] -> string
10983 * sym.slice(idx) -> char
10984 * sym.slice(b, n) -> string
10985 *
10986 * Returns <code>sym.to_s[]</code>.
10987 */
10988
10989static VALUE
10990sym_aref(int argc, VALUE *argv, VALUE sym)
10991{
10992 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
10993}
10994
10995/*
10996 * call-seq:
10997 * sym.length -> integer
10998 * sym.size -> integer
10999 *
11000 * Same as <code>sym.to_s.length</code>.
11001 */
11002
11003static VALUE
11004sym_length(VALUE sym)
11005{
11006 return rb_str_length(rb_sym2str(sym));
11007}
11008
11009/*
11010 * call-seq:
11011 * sym.empty? -> true or false
11012 *
11013 * Returns whether _sym_ is :"" or not.
11014 */
11015
11016static VALUE
11017sym_empty(VALUE sym)
11018{
11019 return rb_str_empty(rb_sym2str(sym));
11020}
11021
11022/*
11023 * call-seq:
11024 * sym.upcase -> symbol
11025 * sym.upcase([options]) -> symbol
11026 *
11027 * Same as <code>sym.to_s.upcase.intern</code>.
11028 */
11029
11030static VALUE
11031sym_upcase(int argc, VALUE *argv, VALUE sym)
11032{
11033 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11034}
11035
11036/*
11037 * call-seq:
11038 * sym.downcase -> symbol
11039 * sym.downcase([options]) -> symbol
11040 *
11041 * Same as <code>sym.to_s.downcase.intern</code>.
11042 */
11043
11044static VALUE
11045sym_downcase(int argc, VALUE *argv, VALUE sym)
11046{
11047 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11048}
11049
11050/*
11051 * call-seq:
11052 * sym.capitalize -> symbol
11053 * sym.capitalize([options]) -> symbol
11054 *
11055 * Same as <code>sym.to_s.capitalize.intern</code>.
11056 */
11057
11058static VALUE
11059sym_capitalize(int argc, VALUE *argv, VALUE sym)
11060{
11061 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11062}
11063
11064/*
11065 * call-seq:
11066 * sym.swapcase -> symbol
11067 * sym.swapcase([options]) -> symbol
11068 *
11069 * Same as <code>sym.to_s.swapcase.intern</code>.
11070 */
11071
11072static VALUE
11073sym_swapcase(int argc, VALUE *argv, VALUE sym)
11074{
11075 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11076}
11077
11078/*
11079 * call-seq:
11080 * sym.start_with?([prefixes]+) -> true or false
11081 *
11082 * Returns true if +sym+ starts with one of the +prefixes+ given.
11083 * Each of the +prefixes+ should be a String or a Regexp.
11084 *
11085 * :hello.start_with?("hell") #=> true
11086 * :hello.start_with?(/H/i) #=> true
11087 *
11088 * # returns true if one of the prefixes matches.
11089 * :hello.start_with?("heaven", "hell") #=> true
11090 * :hello.start_with?("heaven", "paradise") #=> false
11091 */
11092
11093static VALUE
11094sym_start_with(int argc, VALUE *argv, VALUE sym)
11095{
11096 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11097}
11098
11099/*
11100 * call-seq:
11101 * sym.end_with?([suffixes]+) -> true or false
11102 *
11103 * Returns true if +sym+ ends with one of the +suffixes+ given.
11104 *
11105 * :hello.end_with?("ello") #=> true
11106 *
11107 * # returns true if one of the +suffixes+ matches.
11108 * :hello.end_with?("heaven", "ello") #=> true
11109 * :hello.end_with?("heaven", "paradise") #=> false
11110 */
11111
11112static VALUE
11113sym_end_with(int argc, VALUE *argv, VALUE sym)
11114{
11115 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11116}
11117
11118/*
11119 * call-seq:
11120 * sym.encoding -> encoding
11121 *
11122 * Returns the Encoding object that represents the encoding of _sym_.
11123 */
11124
11125static VALUE
11126sym_encoding(VALUE sym)
11127{
    /* NOTE(review): the single statement of this function is on a line
     * missing from this listing. */
11129}
11130
/*
 * Returns name as a String suitable for interning.  A value that already
 * is a T_STRING is returned unchanged; otherwise the converted value tmp
 * is used, and TypeError ("... is not a symbol") is raised when tmp is
 * nil.
 *
 * NOTE(review): the declaration and assignment of tmp are on a line
 * missing from this listing -- presumably an implicit string conversion of
 * name; confirm against the complete source.
 */
11131static VALUE
11132string_for_symbol(VALUE name)
11133{
11134 if (!RB_TYPE_P(name, T_STRING)) {
11136 if (NIL_P(tmp)) {
11137 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11138 name);
11139 }
11140 name = tmp;
11141 }
11142 return name;
11143}
11144
/*
 * Converts name to an ID.  A symbol is converted directly with SYM2ID();
 * anything else goes through string_for_symbol() (which raises TypeError
 * for unsuitable values) and is then interned with rb_intern_str().
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses name.
 */
11145ID
11147{
11148 if (SYMBOL_P(name)) {
11149 return SYM2ID(name);
11150 }
11151 name = string_for_symbol(name);
11152 return rb_intern_str(name);
11153}
11154
/*
 * Converts name to a Symbol.  A symbol is returned unchanged; anything
 * else goes through string_for_symbol() (which raises TypeError for
 * unsuitable values) and is then interned with rb_str_intern().
 *
 * NOTE(review): the line holding the function name and parameter list is
 * missing from this listing; the body uses name.
 */
11155VALUE
11157{
11158 if (SYMBOL_P(name)) {
11159 return name;
11160 }
11161 name = string_for_symbol(name);
11162 return rb_str_intern(name);
11163}
11164
11165/*
11166 * call-seq:
11167 * Symbol.all_symbols => array
11168 *
11169 * Returns an array of all the symbols currently in Ruby's symbol
11170 * table.
11171 *
11172 * Symbol.all_symbols.size #=> 903
11173 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11174 * :chown, :EOFError, :$;, :String,
11175 * :LOCK_SH, :"setuid?", :$<,
11176 * :default_proc, :compact, :extend,
11177 * :Tms, :getwd, :$=, :ThreadGroup,
11178 * :wait2, :$>]
11179 */
11180
11181static VALUE
11182sym_all_symbols(VALUE _)
11183{
11184 return rb_sym_all_symbols();
11185}
11186
11187/*
11188 * A String object holds and manipulates an arbitrary sequence of
11189 * bytes, typically representing characters. String objects may be created
11190 * using String::new or as literals.
11191 *
11192 * Because of aliasing issues, users of strings should be aware of the methods
11193 * that modify the contents of a String object. Typically,
11194 * methods with names ending in ``!'' modify their receiver, while those
11195 * without a ``!'' return a new String. However, there are
11196 * exceptions, such as String#[]=.
11197 *
11198 */
11199
11200void
11202{
11203#undef rb_intern
11204#define rb_intern(str) rb_intern_const(str)
11205
11208 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11210 rb_define_alloc_func(rb_cString, empty_str_alloc);
11211 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11212 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11213 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11214 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11218 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11219 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11220 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
11223 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
11224 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
11225 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
11226 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
11229 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
11230 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
11231 rb_define_method(rb_cString, "=~", rb_str_match, 1);
11232 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
11233 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
11235 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
11237 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
11238 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
11239 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
11240 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
11242 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
11243 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
11244 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
11245 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
11246 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
11247 rb_define_method(rb_cString, "scrub", str_scrub, -1);
11248 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
11250 rb_define_method(rb_cString, "+@", str_uplus, 0);
11251 rb_define_method(rb_cString, "-@", str_uminus, 0);
11252
11253 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
11254 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
11255 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
11256 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
11259 rb_define_method(rb_cString, "undump", str_undump, 0);
11260
11261 sym_ascii = ID2SYM(rb_intern("ascii"));
11262 sym_turkic = ID2SYM(rb_intern("turkic"));
11263 sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
11264 sym_fold = ID2SYM(rb_intern("fold"));
11265
11266 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
11267 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
11268 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
11269 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
11270
11271 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
11272 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
11273 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
11274 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
11275
11276 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
11277 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
11278 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
11279 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
11280 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
11281 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
11282 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
11283 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
11284 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
11285 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
11286 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
11288 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
11289 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
11290 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
11291 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
11293
11294 rb_define_method(rb_cString, "include?", rb_str_include, 1);
11295 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
11296 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
11297
11298 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
11299
11300 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
11301 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
11302 rb_define_method(rb_cString, "center", rb_str_center, -1);
11303
11304 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
11305 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
11306 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
11307 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
11308 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
11309 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
11310 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
11311 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
11312 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
11313
11314 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
11315 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
11316 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
11317 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
11318 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
11319 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
11320 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
11321 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
11322 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
11323
11324 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
11325 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
11326 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
11327 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
11328 rb_define_method(rb_cString, "count", rb_str_count, -1);
11329
11330 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
11331 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
11332 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
11333 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
11334
11335 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
11336 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
11337 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
11338 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
11339 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
11340
11341 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
11342
11343 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
11344 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
11345
11346 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
11347 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
11348
11349 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
11350 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
11351 rb_define_method(rb_cString, "b", rb_str_b, 0);
11352 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
11353 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
11354
11355 /* define UnicodeNormalize module here so that we don't have to look it up */
11356 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
11357 id_normalize = rb_intern("normalize");
11358 id_normalized_p = rb_intern("normalized?");
11359
11360 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
11361 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
11362 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
11363
11364 rb_fs = Qnil;
11365 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
11366 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
11368
11373 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
11374
11377 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
11379 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
11380 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
11381 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
11383 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
11384 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
11385
11386 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
11387 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
11388 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
11389 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
11390
11391 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
11392 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
11393 rb_define_method(rb_cSymbol, "length", sym_length, 0);
11394 rb_define_method(rb_cSymbol, "size", sym_length, 0);
11395 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
11396 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
11397 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
11398
11399 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
11400 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
11401 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
11402 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
11403
11404 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
11405 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
11406
11407 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
11408}
int errno
#define L(x)
Definition: asm.h:125
char * crypt_r(const char *key, const char *setting, struct crypt_data *data)
Definition: crypt.c:396
#define sym(x)
Definition: date_core.c:3717
#define sub(x, y)
Definition: date_strftime.c:24
#define mod(x, y)
Definition: date_strftime.c:28
#define range(low, item, hi)
Definition: date_strftime.c:21
struct RIMemo * ptr
Definition: debug.c:65
#define ENCINDEX_UTF_32BE
Definition: encindex.h:47
#define ENCINDEX_UTF_32LE
Definition: encindex.h:48
#define ENCINDEX_UTF_16BE
Definition: encindex.h:45
#define ENCINDEX_UTF_16
Definition: encindex.h:49
#define ENCINDEX_UTF_8
Definition: encindex.h:43
#define ENCINDEX_UTF_16LE
Definition: encindex.h:46
int rb_enc_find_index2(const char *name, long len)
Definition: encoding.c:717
#define ENCINDEX_UTF_32
Definition: encindex.h:50
#define ENCINDEX_US_ASCII
Definition: encindex.h:44
#define ENCINDEX_ASCII
Definition: encindex.h:42
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:131
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1032
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:779
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:866
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1328
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:880
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1316
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1068
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:609
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1387
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1512
int rb_utf8_encindex(void)
Definition: encoding.c:1334
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1014
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:872
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1322
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:618
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:521
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:990
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:125
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:830
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1427
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:891
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1020
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:974
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1372
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1004
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:245
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1340
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:116
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:838
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1089
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1044
int rb_usascii_encindex(void)
Definition: encoding.c:1346
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:59
#define ENC_CODERANGE_7BIT
Definition: encoding.h:104
#define ENC_CODERANGE_VALID
Definition: encoding.h:105
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:222
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:217
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:107
#define rb_enc_isctype(c, t, enc)
Definition: encoding.h:229
#define ENC_CODERANGE_AND(a, b)
Definition: encoding.h:112
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Definition: transcode.c:1429
rb_econv_result_t
Definition: encoding.h:297
@ econv_finished
Definition: encoding.h:302
@ econv_destination_buffer_full
Definition: encoding.h:300
#define rb_enc_step_back(s, p, e, n, enc)
Definition: encoding.h:224
long rb_memsearch(const void *, long, const void *, long, rb_encoding *)
Definition: re.c:239
#define rb_enc_prev_char(s, p, e, enc)
Definition: encoding.h:220
int rb_enc_symname2_p(const char *, long, rb_encoding *)
Definition: symbol.c:339
#define ENC_CODERANGE(obj)
Definition: encoding.h:108
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:103
#define rb_enc_name(enc)
Definition: encoding.h:177
#define rb_enc_isascii(c, enc)
Definition: encoding.h:230
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:181
#define ENCODING_GET(obj)
Definition: encoding.h:62
#define ENC_CODERANGE_MASK
Definition: encoding.h:102
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:208
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2561
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:192
#define rb_enc_asciicompat(enc)
Definition: encoding.h:245
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:207
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2870
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:193
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:214
#define rb_enc_isprint(c, enc)
Definition: encoding.h:236
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:194
#define rb_enc_mbminlen(enc)
Definition: encoding.h:180
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:106
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:191
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1685
#define rb_enc_right_char_head(s, p, e, enc)
Definition: encoding.h:223
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:61
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:111
#define ENCODING_IS_ASCII8BIT(obj)
Definition: encoding.h:63
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:110
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Definition: encoding.h:113
#define rb_enc_is_newline(p, end, enc)
Definition: encoding.h:227
#define ENCODING_MASK
Definition: encoding.h:42
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
#define rb_intern_str(string)
Definition: generator.h:16
void rb_include_module(VALUE, VALUE)
Definition: class.c:882
VALUE rb_define_class(const char *, VALUE)
Defines a top-level class.
Definition: class.c:662
VALUE rb_define_module(const char *)
Definition: class.c:785
void rb_undef_method(VALUE, const char *)
Definition: class.c:1593
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:898
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *)
Definition: class.c:1904
VALUE rb_cObject
Object class.
Definition: ruby.h:2012
struct RBasic basic
Definition: ruby.h:989
long capa
Definition: ruby.h:995
VALUE rb_cString
Definition: string.c:66
long len
Definition: ruby.h:992
union RString::@157::@158::@159 aux
char ary[RSTRING_EMBED_LEN_MAX+1]
Definition: ruby.h:999
VALUE rb_cSymbol
Definition: string.c:67
VALUE shared
Definition: ruby.h:996
char * ptr
Definition: ruby.h:993
VALUE rb_to_symbol(VALUE name)
Definition: string.c:11156
@ RSTRING_FSTR
Definition: ruby.h:983
@ RSTRING_NOEMBED
Definition: ruby.h:978
@ RSTRING_EMBED_LEN_MAX
Definition: ruby.h:982
@ RSTRING_EMBED_LEN_MASK
Definition: ruby.h:979
union RString::@157 as
VALUE rb_mComparable
Definition: compar.c:16
struct RString::@157::@158 heap
@ RUBY_FL_FREEZE
Definition: ruby.h:851
ID rb_to_id(VALUE name)
Definition: string.c:11146
void rb_syserr_fail(int e, const char *mesg)
Definition: error.c:2783
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2671
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:668
void rb_bug(const char *fmt,...)
Definition: error.c:636
VALUE rb_eRangeError
Definition: error.c:928
VALUE rb_eTypeError
Definition: error.c:924
void rb_fatal(const char *fmt,...)
Definition: error.c:2722
VALUE rb_eEncCompatError
Definition: error.c:931
VALUE rb_eRuntimeError
Definition: error.c:922
void rb_warn(const char *fmt,...)
Definition: error.c:315
VALUE rb_eArgError
Definition: error.c:925
VALUE rb_eIndexError
Definition: error.c:926
VALUE rb_ensure(VALUE(*)(VALUE), VALUE, VALUE(*)(VALUE), VALUE)
An equivalent to ensure clause.
Definition: eval.c:1115
VALUE rb_any_to_s(VALUE)
Default implementation of #to_s.
Definition: object.c:527
VALUE rb_obj_alloc(VALUE)
Allocates an instance of klass.
Definition: object.c:1895
VALUE rb_obj_frozen_p(VALUE obj)
Determines if the object is frozen.
Definition: object.c:1099
double rb_str_to_dbl(VALUE, int)
Parses a string representation of a floating point number.
Definition: object.c:3371
VALUE rb_obj_class(VALUE)
Equivalent to Object#class in Ruby.
Definition: object.c:217
VALUE rb_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2914
VALUE rb_equal(VALUE, VALUE)
Same as Object#===, case equality.
Definition: object.c:124
VALUE rb_obj_freeze(VALUE)
Make the object unmodifiable.
Definition: object.c:1080
VALUE rb_str_escape(VALUE str)
Definition: string.c:5866
VALUE rb_check_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2957
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3021
#define __msan_unpoison_string(x)
Definition: internal.h:123
unsigned char u8
Definition: many2.c:13
const char * name
Definition: nkf.c:208
unsigned int last
Definition: nkf.c:4324
#define ONIGENC_CTYPE_DIGIT
Definition: onigmo.h:298
ONIG_EXTERN int onig_error_code_to_str(OnigUChar *s, OnigPosition err_code,...)
#define ONIGENC_CASE_ASCII_ONLY
Definition: onigmo.h:125
unsigned char OnigUChar
Definition: onigmo.h:79
unsigned int OnigCaseFoldType
Definition: onigmo.h:95
#define ONIG_MAX_ERROR_MESSAGE_LEN
Definition: onigmo.h:443
ONIG_EXTERN int onig_new(OnigRegex *, const OnigUChar *pattern, const OnigUChar *pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType *syntax, OnigErrorInfo *einfo)
#define ONIGENC_CASE_MODIFIED
Definition: onigmo.h:119
#define ONIGENC_MBCLEN_CHARFOUND_LEN(r)
Definition: onigmo.h:347
#define ONIGENC_CTYPE_ALPHA
Definition: onigmo.h:295
#define UChar
Definition: onigmo.h:76
#define ONIGENC_CODE_TO_MBC_MAXLEN
Definition: onigmo.h:289
ptrdiff_t OnigPosition
Definition: onigmo.h:83
#define ONIGENC_MBCLEN_CHARFOUND_P(r)
Definition: onigmo.h:346
#define ONIGENC_CASE_UPCASE
Definition: onigmo.h:113
#define ONIGENC_CASE_FOLD
Definition: onigmo.h:120
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: onigmo.h:691
ONIG_EXTERN const OnigSyntaxType * OnigDefaultSyntax
Definition: onigmo.h:515
#define ONIGENC_CASE_DOWNCASE
Definition: onigmo.h:114
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, s, end)
Definition: onigmo.h:334
ONIG_EXTERN int onigenc_ascii_only_case_map(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: regenc.c:955
ONIG_EXTERN OnigPosition onig_match(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *at, OnigRegion *region, OnigOptionType option)
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: onigmo.h:122
#define ONIGENC_CASE_TITLECASE
Definition: onigmo.h:115
#define ONIGENC_CASE_FOLD_LITHUANIAN
Definition: onigmo.h:124
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: onigmo.h:689
#define ONIG_OPTION_DEFAULT
Definition: onigmo.h:447
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1181
#define UNALIGNED_WORD_ACCESS
void * memchr(const void *, int, size_t)
VALUE rb_hash_lookup(VALUE, VALUE)
Definition: hash.c:2063
#define MEMCPY(p1, p2, type, n)
#define is_ascii_string(str)
#define NULL
#define STR_EMBED_P(str)
#define rb_funcallv(recv, mid, argc, argv)
int memcmp(const void *, const void *, size_t)
Definition: memcmp.c:7
#define dp(v)
#define RBASIC_CLEAR_CLASS(obj)
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1348
#define UNLIMITED_ARGUMENTS
use StringValue() instead")))
#define RSTRING_LEN(str)
#define rb_str_buf_cat2
#define FL_EXIVAR
#define _(args)
VALUE rb_reg_check_preprocess(VALUE)
Definition: re.c:2707
#define NEWOBJ_OF(obj, type, klass, flags)
#define RTEST(v)
VALUE rb_rs
Definition: intern.h:585
#define ALLOCA_N(type, n)
#define FL_TEST(x, f)
#define bp()
unsigned long st_data_t
#define RBASIC(obj)
#define ZALLOC_N(type, n)
int rb_objspace_garbage_object_p(VALUE obj)
Definition: gc.c:3620
#define RGENGC_WB_PROTECTED_STRING
size_t strlen(const char *)
#define T_STRING
long int ptrdiff_t
VALUE rb_hash_aref(VALUE, VALUE)
Definition: hash.c:2037
VALUE rb_backref_get(void)
Definition: vm.c:1304
#define offsetof(TYPE, MEMBER)
#define RARRAY_LENINT(ary)
#define PRIuSIZE
#define xfree
#define LONG2FIX(i)
#define OBJ_FROZEN_RAW(x)
#define Qundef
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1278
const struct rb_call_cache * cc
#define CHAR_BIT
#define MAYBE_UNUSED(x)
#define rb_str_cat2
void rb_warn_deprecated(const char *fmt, const char *suggest,...) __attribute__((format(printf
#define RSTRING_END(str)
const VALUE VALUE obj
#define rb_check_frozen(obj)
#define FL_SET(x, f)
#define UINT2NUM(x)
#define UNREACHABLE
#define RSTRING_PTR(str)
#define UCHAR_MAX
int snprintf(char *__restrict__, size_t, const char *__restrict__,...) __attribute__((__format__(__printf__
long rb_reg_search0(VALUE, VALUE, long, int, int)
Definition: re.c:1538
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1444
#define STR_SHARED_P(s)
#define NIL_P(v)
#define rb_str_buf_cat
VALUE rb_str_format(int, const VALUE *, VALUE)
Definition: sprintf.c:204
#define numberof(array)
#define DBL2NUM(dbl)
void rb_match_busy(VALUE)
Definition: re.c:1295
#define ID2SYM(x)
#define LONG_MAX
#define RSTRING_LENINT(str)
VALUE rb_cEncodingConverter
Definition: transcode.c:25
int fprintf(FILE *__restrict__, const char *__restrict__,...) __attribute__((__format__(__printf__
const char size_t n
VALUE rb_sym_all_symbols(void)
Definition: symbol.c:840
#define MEMZERO(p, type, n)
#define rb_intern_const(str)
#define SYM2ID(x)
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:2207
unsigned long VALUE
VALUE rb_ary_push(VALUE, VALUE)
Definition: array.c:1195
#define stderr
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3340
__inline__ const void *__restrict__ src
VALUE rb_sym2str(VALUE)
Definition: symbol.c:784
_Bool rb_reg_start_with_p(VALUE re, VALUE str)
Definition: re.c:1626
#define rp(obj)
#define xmalloc
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define FL_UNSET(x, f)
uint32_t i
#define rb_fstring_lit(str)
#define char
#define RSTRING_GETMEM(str, ptrvar, lenvar)
__inline__ const void *__restrict__ size_t len
#define OBJ_FROZEN(x)
const VALUE int int int int int int VALUE char * fmt
const char * rb_obj_classname(VALUE)
Definition: variable.c:289
#define FL_TEST_RAW(x, f)
#define ALLOC_N(type, n)
#define OBJ_FREEZE(x)
#define INT2NUM(x)
VALUE rb_int_and(VALUE x, VALUE y)
Definition: numeric.c:4472
#define SIZED_REALLOC_N(var, type, n, old_n)
#define ALLOCV(v, n)
#define RB_OBJ_WRITE(a, slot, b)
#define LONG2NUM(x)
#define long
#define NUM2INT(x)
void rb_define_singleton_method(VALUE, const char *, VALUE(*)(), int)
VALUE rb_str_to_inum(VALUE, int, int)
Definition: bignum.c:4268
#define STR_SHARED
#define RB_GC_GUARD(v)
#define SIZEOF_VOIDP
#define RUBY_DTRACE_CREATE_HOOK(name, arg)
#define is_broken_string(str)
#define PRIsVALUE
#define rb_ary_new3
void * memset(void *, int, size_t)
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
#define rb_funcall(recv, mid, argc,...)
int VALUE v
VALUE rb_ary_new(void)
Definition: array.c:723
VALUE rb_invcmp(VALUE, VALUE)
Definition: compar.c:47
ID rb_id_encoding(void)
Definition: encoding.c:759
#define rb_scan_args(argc, argvp, fmt,...)
void rb_gc_register_address(VALUE *)
Definition: gc.c:7093
VALUE rb_reg_match(VALUE, VALUE)
Definition: re.c:3180
#define RB_DEBUG_COUNTER_INC_IF(type, cond)
VALUE rb_require(const char *)
Definition: load.c:1161
VALUE ID VALUE old
#define TypedData_Wrap_Struct(klass, data_type, sval)
#define CONST_ID(var, str)
#define RBASIC_SET_CLASS_RAW(obj, cls)
#define TRUE
#define FALSE
#define RSTRING_EMBED_LEN(str)
unsigned int size
#define Qtrue
#define ruby_sized_xfree(ptr, size)
long unsigned int size_t
#define rb_strlen_lit(str)
#define FIXNUM_MAX
#define FL_UNSET_RAW(x, f)
#define UNLIKELY(x)
int dup(int __fildes)
struct rb_call_cache buf
#define FLEX_ARY_LEN
#define memmove(dst, src, len)
#define ISDIGIT(c)
__uintptr_t uintptr_t
#define Qnil
#define Qfalse
#define DATA_PTR(dta)
void * memcpy(void *__restrict__, const void *__restrict__, size_t)
#define OBJ_FREEZE_RAW(x)
VALUE rb_str_intern(VALUE)
Definition: symbol.c:710
void * memrchr(const void *, int, size_t)
st_data_t st_index_t
VALUE rb_check_hash_type(VALUE)
Definition: hash.c:1852
#define FIXABLE(f)
#define RB_TYPE_P(obj, type)
VALUE rb_funcall_with_block_kw(VALUE, ID, int, const VALUE *, VALUE, int)
Definition: vm_eval.c:1060
VALUE rb_reg_nth_match(int, VALUE)
Definition: re.c:1714
#define FL_WB_PROTECTED
#define INT2FIX(i)
#define SPECIAL_CONST_P(x)
#define RSTRING(obj)
VALUE rb_check_array_type(VALUE)
Definition: array.c:909
char * crypt(const char *__key, const char *__salt)
VALUE rb_str_locktmp(VALUE)
#define STR_NOEMBED
void rb_gc_force_recycle(VALUE)
Definition: gc.c:7027
int rb_num_to_uint(VALUE val, unsigned int *ret)
Definition: numeric.c:244
void rb_define_hooked_variable(const char *, VALUE *, rb_gvar_getter_t *, rb_gvar_setter_t *)
Definition: variable.c:480
#define MJIT_FUNC_EXPORTED
const VALUE * argv
void void ruby_xfree(void *)
Definition: gc.c:10183
#define SYMBOL_P(x)
__inline__ int
#define FIXNUM_P(f)
#define CLASS_OF(v)
#define RETURN_ENUMERATOR(obj, argc, argv)
void rb_undef_alloc_func(VALUE)
Definition: vm_method.c:729
#define TOLOWER(c)
#define Check_Type(v, t)
#define RB_INTEGER_TYPE_P(obj)
VALUE rb_hash_aset(VALUE, VALUE, VALUE)
Definition: hash.c:2852
#define assert
#define rb_check_arity
#define RUBY_ASSERT(expr)
#define FL_FREEZE
VALUE rb_sym_to_proc(VALUE sym)
Definition: proc.c:1312
#define RBASIC_CLASS(obj)
unsigned long ID
VALUE rb_yield(VALUE)
Definition: vm_eval.c:1237
void rb_match_unbusy(VALUE)
Definition: re.c:1301
const char *void rb_warning(const char *,...) __attribute__((format(printf
#define ISSPACE(c)
#define RUBY_FUNC_EXPORTED
#define FIX2LONG(x)
#define RBASIC_SET_CLASS(obj, cls)
#define NUM2LONG(x)
void rb_define_method(VALUE, const char *, VALUE(*)(), int)
#define rb_ary_new2
#define BUILTIN_TYPE(x)
void rb_backref_set(VALUE)
Definition: vm.c:1310
VALUE rb_hash_new(void)
Definition: hash.c:1523
#define ST2FIX(h)
#define RARRAY_CONST_PTR(a)
#define ISPRINT(c)
#define SIZET2NUM(v)
#define ISASCII(c)
#define FL_SET_RAW(x, f)
#define StringValueCStr(v)
#define T_REGEXP
#define ISALPHA(c)
#define RB_DEBUG_COUNTER_INC(type)
VALUE rb_default_rs
Definition: intern.h:586
#define LIKELY(x)
VALUE rb_reg_regcomp(VALUE)
Definition: re.c:2970
VALUE rb_reg_regsub(VALUE, VALUE, struct re_registers *, VALUE)
Definition: re.c:3776
#define RMATCH_REGS(obj)
Definition: re.h:51
long rb_reg_search(VALUE, VALUE, long, int)
Definition: re.c:1620
VALUE rb_enc_sprintf(rb_encoding *enc, const char *format,...)
Definition: sprintf.c:1178
#define f
int st_delete(st_table *tab, st_data_t *key, st_data_t *value)
Definition: st.c:1418
int st_foreach(st_table *tab, st_foreach_callback_func *func, st_data_t arg)
Definition: st.c:1717
int st_update(st_table *tab, st_data_t key, st_update_callback_func *func, st_data_t arg)
Definition: st.c:1509
#define sym_equal
Definition: string.c:10676
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
Definition: string.c:385
VALUE rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
Definition: string.c:964
#define STR_SET_LEN(str, n)
Definition: string.c:104
#define STR_EMBEDDABLE_P(len, termlen)
Definition: string.c:181
struct mapping_buffer mapping_buffer
int rb_str_symname_p(VALUE sym)
Definition: string.c:10695
VALUE rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
Definition: string.c:4398
void rb_str_free(VALUE str)
Definition: string.c:1349
VALUE rb_str_times(VALUE str, VALUE times)
Definition: string.c:1966
#define SHARABLE_SUBSTRING_P(beg, len, end)
Definition: string.c:176
#define STR_ENC_GET(str)
Definition: string.c:170
long rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:567
#define STR_HEAP_PTR(str)
Definition: string.c:167
VALUE rb_str_new_shared(VALUE str)
Definition: string.c:1197
#define STR_NOFREE
Definition: string.c:90
const char * ruby_escaped_char(int c)
Definition: string.c:5848
#define TR_TABLE_SIZE
Definition: string.c:7357
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2230
VALUE rb_str_cat_cstr(VALUE str, const char *ptr)
Definition: string.c:2822
VALUE rb_str_new_frozen(VALUE orig)
Definition: string.c:1203
#define aligned_ptr(value)
VALUE rb_str_buf_cat_ascii(VALUE str, const char *ptr)
Definition: string.c:2926
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:1030
#define STR_SET_NOEMBED(str)
Definition: string.c:93
#define STR_SET_SHARED(str, shared_str)
Definition: string.c:157
VALUE rb_str_buf_append(VALUE str, VALUE str2)
Definition: string.c:2950
VALUE rb_str_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2812
VALUE rb_utf8_str_new(const char *ptr, long len)
Definition: string.c:788
VALUE rb_filesystem_str_new(const char *ptr, long len)
Definition: string.c:1111
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:1740
#define rb_str_splice(str, beg, len, val)
Definition: string.c:4687
VALUE rb_str_export(VALUE str)
Definition: string.c:1123
#define DEFAULT_REPLACE_CHAR(str)
char * rb_string_value_cstr(volatile VALUE *ptr)
Definition: string.c:2291
VALUE rb_fstring_cstr(const char *ptr)
Definition: string.c:410
#define lesser(a, b)
Definition: string.c:3200
VALUE rb_utf8_str_new_cstr(const char *ptr)
Definition: string.c:828
VALUE rb_sym_to_s(VALUE sym)
Definition: string.c:10793
void rb_str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1391
VALUE rb_external_str_new(const char *ptr, long len)
Definition: string.c:1087
VALUE rb_str_tmp_new(long len)
Definition: string.c:1343
char * rb_str_fill_terminator(VALUE str, const int newminlen)
Definition: string.c:2306
long rb_str_offset(VALUE str, long pos)
Definition: string.c:2416
char * rb_str_subpos(VALUE str, long beg, long *lenp)
Definition: string.c:2497
VALUE rb_str_succ(VALUE orig)
Definition: string.c:4090
#define CASE_MAPPING_ADDITIONAL_LENGTH
Definition: string.c:6487
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Definition: string.c:3173
VALUE rb_str_subseq(VALUE str, long beg, long len)
Definition: string.c:2474
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:10185
#define RUBY_MAX_CHAR_LEN
Definition: string.c:86
VALUE rb_str_new_static(const char *ptr, long len)
Definition: string.c:872
int rb_enc_str_coderange(VALUE str)
Definition: string.c:657
VALUE rb_str_chomp_string(VALUE str, VALUE rs)
Definition: string.c:8921
#define STR_SHARED_ROOT
Definition: string.c:87
#define ENUM_ELEM(ary, e)
Definition: string.c:8140
MJIT_FUNC_EXPORTED VALUE rb_str_opt_plus(VALUE str1, VALUE str2)
Definition: string.c:1925
VALUE rb_str_ord(VALUE s)
Definition: string.c:9527
#define CRYPT_END()
VALUE rb_str_upto_each(VALUE beg, VALUE end, int excl, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4263
size_t rb_str_capacity(VALUE str)
Definition: string.c:712
MJIT_FUNC_EXPORTED VALUE rb_id_quote_unprintable(ID id)
Definition: string.c:10735
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, int ecflags, VALUE ecopts)
Definition: string.c:943
#define STR_SET_EMBED(str)
Definition: string.c:97
const struct st_hash_type rb_fstring_hash_type
Definition: string.c:260
#define BARE_STRING_P(str)
Definition: string.c:265
VALUE rb_str_dup(VALUE str)
Definition: string.c:1516
void Init_String(void)
Definition: string.c:11201
void rb_str_modify(VALUE str)
Definition: string.c:2114
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:890
VALUE rb_str_to_str(VALUE str)
Definition: string.c:1382
st_index_t rb_str_hash(VALUE str)
Definition: string.c:3163
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:403
long rb_str_strlen(VALUE str)
Definition: string.c:1829
VALUE rb_str_resurrect(VALUE str)
Definition: string.c:1522
VALUE rb_str_quote_unprintable(VALUE str)
Definition: string.c:10714
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:914
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
Definition: string.c:1074
VALUE rb_usascii_str_new(const char *ptr, long len)
Definition: string.c:780
#define IS_EVSTR(p, e)
Definition: string.c:6023
VALUE rb_usascii_str_new_cstr(const char *ptr)
Definition: string.c:820
#define CASEMAP_DEBUG
Definition: string.c:6489
#define ascii_isspace(c)
Definition: string.c:7808
VALUE rb_fs
Definition: string.c:452
#define WANTARRAY(m, size)
Definition: string.c:8125
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Definition: string.c:1117
#define rb_str_index(str, sub, offset)
Definition: string.c:3490
#define BEG(no)
Definition: string.c:25
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:5815
VALUE rb_str_plus(VALUE str1, VALUE str2)
Definition: string.c:1894
long rb_str_sublen(VALUE str, long pos)
Definition: string.c:2463
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3267
VALUE rb_str_tmp_frozen_acquire(VALUE orig)
Definition: string.c:1210
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
Definition: string.c:1036
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:836
VALUE rb_str_replace(VALUE str, VALUE str2)
Definition: string.c:5363
VALUE rb_check_string_type(VALUE str)
Definition: string.c:2314
void rb_str_set_len(VALUE str, long len)
Definition: string.c:2692
#define END(no)
Definition: string.c:26
VALUE rb_str_export_locale(VALUE str)
Definition: string.c:1129
VALUE rb_str_inspect(VALUE str)
Definition: string.c:5930
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:796
VALUE rb_str_buf_new_cstr(const char *ptr)
Definition: string.c:1331
MJIT_FUNC_EXPORTED VALUE rb_obj_as_string_result(VALUE str, VALUE obj)
Definition: string.c:1452
#define rb_intern(str)
#define CHECK_IF_ASCII(c)
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen,(str))
Definition: string.c:2661
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:896
void rb_str_setter(VALUE val, ID id, VALUE *var)
Definition: string.c:10077
VALUE rb_str_length(VALUE str)
Definition: string.c:1843
RUBY_FUNC_EXPORTED VALUE rb_fstring(VALUE str)
Definition: string.c:312
RUBY_FUNC_EXPORTED VALUE rb_str_locktmp_ensure(VALUE str, VALUE(*func)(VALUE), VALUE arg)
Definition: string.c:2685
int rb_str_comparable(VALUE str1, VALUE str2)
Definition: string.c:3203
MJIT_FUNC_EXPORTED VALUE rb_fstring_new(const char *ptr, long len)
Definition: string.c:396
#define str_buf_cat2(str, ptr)
Definition: string.c:2809
#define MIN_PRE_ALLOC_SIZE
Definition: string.c:2971
VALUE rb_str_append(VALUE str, VALUE str2)
Definition: string.c:2965
#define RESIZE_CAPA_TERM(str, capacity, termlen)
Definition: string.c:137
VALUE rb_str_freeze(VALUE str)
Definition: string.c:2616
VALUE rb_string_value(volatile VALUE *ptr)
Definition: string.c:2175
void rb_str_modify_expand(VALUE str, long expand)
Definition: string.c:2122
#define RESIZE_CAPA(str, capacity)
Definition: string.c:133
VALUE rb_str_new(const char *ptr, long len)
Definition: string.c:774
unsigned char * USTR
Definition: string.c:6987
MJIT_FUNC_EXPORTED VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
Definition: string.c:10816
VALUE rb_str_scrub(VALUE str, VALUE repl)
Definition: string.c:10248
#define STR_BORROWED
Definition: string.c:88
#define STR_TMPLOCK
Definition: string.c:89
int rb_enc_str_asciionly_p(VALUE str)
Definition: string.c:678
#define CHAR_ESC_LEN
Definition: string.c:5812
#define STR_FAKESTR
Definition: string.c:91
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
Definition: string.c:2919
#define STR_SET_EMBED_LEN(str, n)
Definition: string.c:98
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Definition: string.c:1298
VALUE rb_locale_str_new_cstr(const char *ptr)
Definition: string.c:1105
#define rb_str_dup_frozen
int rb_str_cmp(VALUE str1, VALUE str2)
Definition: string.c:3228
char * rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
Definition: string.c:2388
void rb_str_update(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4643
char * rb_str_to_cstr(VALUE str)
Definition: string.c:2284
RUBY_FUNC_EXPORTED size_t rb_str_memsize(VALUE str)
Definition: string.c:1371
VALUE rb_str_substr(VALUE str, long beg, long len)
Definition: string.c:2584
VALUE rb_str_unlocktmp(VALUE str)
Definition: string.c:2675
VALUE rb_str_new_cstr(const char *ptr)
Definition: string.c:808
VALUE rb_str_upto_endless_each(VALUE beg, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4345
VALUE rb_str_resize(VALUE str, long len)
Definition: string.c:2709
void rb_must_asciicompat(VALUE str)
Definition: string.c:2166
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Definition: string.c:884
MJIT_FUNC_EXPORTED VALUE rb_str_concat_literals(size_t num, const VALUE *strary)
Definition: string.c:2974
VALUE rb_str_split(VALUE str, const char *sep0)
Definition: string.c:8116
VALUE rb_tainted_str_new_cstr(const char *ptr)
Definition: string.c:903
MJIT_FUNC_EXPORTED VALUE rb_str_eql(VALUE str1, VALUE str2)
Definition: string.c:3287
char * rb_string_value_ptr(volatile VALUE *ptr)
Definition: string.c:2186
#define TERM_LEN(str)
Definition: string.c:124
VALUE rb_str_dump(VALUE str)
Definition: string.c:6042
VALUE rb_str_concat(VALUE str1, VALUE str2)
Definition: string.c:3065
void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
Definition: string.c:1217
VALUE rb_locale_str_new(const char *ptr, long len)
Definition: string.c:1099
#define CASE_UTF(e)
VALUE rb_str_buf_new(long capa)
Definition: string.c:1315
#define STR_BUF_MIN_SIZE
Definition: string.c:1311
STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX)
VALUE rb_external_str_new_cstr(const char *ptr)
Definition: string.c:1093
#define SPLIT_STR(beg, len)
#define STR_HEAP_SIZE(str)
Definition: string.c:168
VALUE rb_str_drop_bytes(VALUE str, long len)
Definition: string.c:4573
long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:1749
#define TERM_FILL(ptr, termlen)
Definition: string.c:125
VALUE rb_str_export_to_enc(VALUE str, rb_encoding *enc)
Definition: string.c:1135
neighbor_char
Definition: string.c:3879
@ NEIGHBOR_FOUND
Definition: string.c:3881
@ NEIGHBOR_WRAPPED
Definition: string.c:3882
@ NEIGHBOR_NOT_CHAR
Definition: string.c:3880
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:10255
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Definition: string.c:878
VALUE rb_obj_as_string(VALUE obj)
Definition: string.c:1440
const char * name
Definition: onigmo.h:162
int(* case_map)(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: onigmo.h:177
Definition: ruby.h:988
size_t capa
Definition: string.c:6494
OnigUChar space[FLEX_ARY_LEN]
Definition: string.c:6497
struct mapping_buffer * next
Definition: string.c:6496
size_t used
Definition: string.c:6495
int num_regs
Definition: onigmo.h:718
Definition: string.c:6989
int gen
Definition: string.c:6990
unsigned int now
Definition: string.c:6991
unsigned int max
Definition: string.c:6991
char * p
Definition: string.c:6992
char * pend
Definition: string.c:6992
#define scan_hex(s, l, e)
Definition: util.h:55
st_table * rb_vm_fstring_table(void)
Definition: vm.c:3394
#define rb_id2str(id)
Definition: vm_backtrace.c:30
MJIT_STATIC void rb_error_arity(int argc, int min, int max)