Ruby 2.7.7p221 (2022-11-24 revision 168ec2b1e5ad0e4688e963d9de019557c78feed9)
transcode.c
Go to the documentation of this file.
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/encoding.h"
13#include "internal.h"
14#include "transcode_data.h"
15#include <ctype.h>
16#include "id.h"
17
18#define ENABLE_ECONV_NEWLINE_OPTION 1
19
20/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
21static VALUE rb_eUndefinedConversionError;
22static VALUE rb_eInvalidByteSequenceError;
23static VALUE rb_eConverterNotFoundError;
24
26
27static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
28static VALUE sym_xml, sym_text, sym_attr;
29static VALUE sym_universal_newline;
30static VALUE sym_crlf_newline;
31static VALUE sym_cr_newline;
32#ifdef ENABLE_ECONV_NEWLINE_OPTION
33static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
34#endif
35static VALUE sym_partial_input;
36
37static VALUE sym_invalid_byte_sequence;
38static VALUE sym_undefined_conversion;
39static VALUE sym_destination_buffer_full;
40static VALUE sym_source_buffer_empty;
41static VALUE sym_finished;
42static VALUE sym_after_output;
43static VALUE sym_incomplete_input;
44
45static unsigned char *
46allocate_converted_string(const char *sname, const char *dname,
47 const unsigned char *str, size_t len,
48 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
49 size_t *dst_len_ptr);
50
51/* dynamic structure, one per conversion (similar to iconv_t) */
52/* may carry conversion state (e.g. for iso-2022-jp) */
53typedef struct rb_transcoding {
55
56 int flags;
57
59 unsigned int next_table;
61 unsigned char next_byte;
62 unsigned int output_index;
63
64 ssize_t recognized_len; /* already interpreted */
65 ssize_t readagain_len; /* not yet interpreted */
66 union {
67 unsigned char ary[8]; /* max_input <= sizeof(ary) */
68 unsigned char *ptr; /* length: max_input */
69 } readbuf; /* recognized_len + readagain_len used */
70
73 union {
74 unsigned char ary[8]; /* max_output <= sizeof(ary) */
75 unsigned char *ptr; /* length: max_output */
77
78 union rb_transcoding_state_t { /* opaque data for stateful encoding */
79 void *ptr;
80 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
/* Input staging buffer of tc: the embedded array when the transcoder's
 * max_input fits into it, otherwise the pointer member of the union. */
84#define TRANSCODING_READBUF(tc) \
85 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
86 (tc)->readbuf.ary : \
87 (tc)->readbuf.ptr)
/* Output staging buffer of tc, selected the same way from max_output. */
88#define TRANSCODING_WRITEBUF(tc) \
89 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
90 (tc)->writebuf.ary : \
91 (tc)->writebuf.ptr)
/* Capacity of the buffer TRANSCODING_WRITEBUF selects: the size of the
 * embedded array, or max_output when the pointer member is in use. */
92#define TRANSCODING_WRITEBUF_SIZE(tc) \
93 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
94 sizeof((tc)->writebuf.ary) : \
95 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inside the rb_transcoding itself. */
96#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Address of the transcoder's private state: the embedded bytes when
 * state_size fits, otherwise the pointer member. */
97#define TRANSCODING_STATE(tc) \
98 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
99 (tc)->state.ary : \
100 (tc)->state.ptr)
101
102typedef struct {
104 unsigned char *out_buf_start;
105 unsigned char *out_data_start;
106 unsigned char *out_data_end;
107 unsigned char *out_buf_end;
110
112 int flags;
113 int started; /* bool */
114
117
118 const unsigned char *replacement_str;
120 const char *replacement_enc;
121
122 unsigned char *in_buf_start;
123 unsigned char *in_data_start;
124 unsigned char *in_data_end;
125 unsigned char *in_buf_end;
127 int replacement_allocated; /* bool */
132
133 /* last error */
134 struct {
137 const char *source_encoding;
139 const unsigned char *error_bytes_start;
143
144 /* The following fields are only for Encoding::Converter.
145 * rb_econv_open set them NULL. */
148};
149
150/*
151 * Dispatch data and logic
152 */
153
154#define DECORATOR_P(sname, dname) (*(sname) == '\0')
155
156typedef struct {
157 const char *sname;
158 const char *dname;
159 const char *lib; /* null means no need to load a library */
162
163static st_table *transcoder_table;
164
/*
 * Return the transcoder entry for the (sname, dname) pair, inserting a
 * blank one (lib and transcoder both NULL) when the pair is not yet known.
 * transcoder_table maps a source name to an inner table, and the inner
 * table maps a destination name to the entry.
 *
 * NOTE(review): listing lines 172 and 177 are absent from this extract;
 * presumably they create the inner table and allocate `entry` -- confirm
 * against the real source.
 */
165static transcoder_entry_t *
166make_transcoder_entry(const char *sname, const char *dname)
167{
168 st_data_t val;
169 st_table *table2;
170
 /* First level: find (or register) the inner table for sname. */
171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
173 st_add_direct(transcoder_table, (st_data_t)sname, val);
174 }
175 table2 = (st_table *)val;
 /* Second level: find (or register) the entry for dname. */
176 if (!st_lookup(table2, (st_data_t)dname, &val)) {
178 entry->sname = sname;
179 entry->dname = dname;
180 entry->lib = NULL;
181 entry->transcoder = NULL;
182 val = (st_data_t)entry;
183 st_add_direct(table2, (st_data_t)dname, val);
184 }
185 return (transcoder_entry_t *)val;
186}
187
188static transcoder_entry_t *
189get_transcoder_entry(const char *sname, const char *dname)
190{
191 st_data_t val;
192 st_table *table2;
193
194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
195 return NULL;
196 }
197 table2 = (st_table *)val;
198 if (!st_lookup(table2, (st_data_t)dname, &val)) {
199 return NULL;
200 }
201 return (transcoder_entry_t *)val;
202}
203
/*
 * Attach the transcoder `tr` to the table entry for its
 * (src_encoding, dst_encoding) pair, creating the entry if necessary.
 * Raises ArgumentError when that entry already has a transcoder.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 205) is absent from this extract.
 */
204void
206{
207 const char *const sname = tr->src_encoding;
208 const char *const dname = tr->dst_encoding;
209
210 transcoder_entry_t *entry;
211
212 entry = make_transcoder_entry(sname, dname);
 /* A second registration for the same pair is treated as an error. */
213 if (entry->transcoder) {
214 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
215 sname, dname);
216 }
217
218 entry->transcoder = tr;
219}
220
221static void
222declare_transcoder(const char *sname, const char *dname, const char *lib)
223{
224 transcoder_entry_t *entry;
225
226 entry = make_transcoder_entry(sname, dname);
227 entry->lib = lib;
228}
229
230static const char transcoder_lib_prefix[] = "enc/trans/";
231
232void
233rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
234{
235 if (!lib) {
236 rb_raise(rb_eArgError, "invalid library name - (null)");
237 }
238 declare_transcoder(enc1, enc2, lib);
239}
240
241#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
242
243typedef struct search_path_queue_tag {
245 const char *enc;
247
248typedef struct {
252 const char *base_enc;
254
/*
 * st_foreach callback used by transcode_search_path.  `key` is a
 * destination encoding name reachable from the encoding currently being
 * expanded.  Names already present in bfs->visited are ignored; a new name
 * is appended to the tail of the work queue and recorded in bfs->visited
 * with bfs->base_enc as its predecessor.  Always returns ST_CONTINUE.
 *
 * NOTE(review): listing lines 259, 260 and 266 (the declarations of `bfs`
 * and `q` and the allocation of `q`, presumably) are absent from this
 * extract.
 */
255static int
256transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
257{
258 const char *dname = (const char *)key;
261
262 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
263 return ST_CONTINUE;
264 }
265
 /* Link the new node at the queue tail and advance the tail pointer. */
267 q->enc = dname;
268 q->next = NULL;
269 *bfs->queue_last_ptr = q;
270 bfs->queue_last_ptr = &q->next;
271
272 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
273 return ST_CONTINUE;
274}
275
/*
 * Search the registered transcoders for a chain of conversions leading
 * from sname to dname, processing candidate encodings in FIFO order.
 *
 * On success `callback` is invoked once per step of the chain, walking
 * backwards from dname, with (step source, step destination, depth, arg);
 * depth counts down from pathlen-1 to 0.  The return value is the number
 * of steps, or -1 when no chain exists or when sname and dname compare
 * equal (case-insensitively).
 *
 * NOTE(review): listing lines 281, 282, 291, 297, 298 and 356 are absent
 * from this extract; presumably they declare `bfs` and `q`, allocate the
 * first queue node, create bfs.visited and seed it with sname, and free
 * bfs.visited at the end -- confirm against the real source.
 */
276static int
277transcode_search_path(const char *sname, const char *dname,
278 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
279 void *arg)
280{
283 st_data_t val;
284 st_table *table2;
285 int found;
286 int pathlen = -1;
287
288 if (encoding_equal(sname, dname))
289 return -1;
290
 /* Seed the queue with the source encoding. */
292 q->enc = sname;
293 q->next = NULL;
294 bfs.queue_last_ptr = &q->next;
295 bfs.queue = q;
296
299
300 while (bfs.queue) {
 /* Pop the head; reset the tail pointer when the queue becomes empty. */
301 q = bfs.queue;
302 bfs.queue = q->next;
303 if (!bfs.queue)
304 bfs.queue_last_ptr = &bfs.queue;
305
 /* No transcoder starts from this encoding: dead end. */
306 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
307 xfree(q);
308 continue;
309 }
310 table2 = (st_table *)val;
311
 /* A direct conversion to dname exists: record the last hop and stop. */
312 if (st_lookup(table2, (st_data_t)dname, &val)) {
313 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
314 xfree(q);
315 found = 1;
316 goto cleanup;
317 }
318
 /* Otherwise enqueue every not-yet-visited destination of q->enc. */
319 bfs.base_enc = q->enc;
320 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
321 bfs.base_enc = NULL;
322
323 xfree(q);
324 }
325 found = 0;
326
327 cleanup:
 /* Release whatever is still queued. */
328 while (bfs.queue) {
329 q = bfs.queue;
330 bfs.queue = q->next;
331 xfree(q);
332 }
333
334 if (found) {
335 const char *enc = dname;
336 int depth;
337 pathlen = 0;
 /* First walk: follow predecessor links to count the steps. */
338 while (1) {
339 st_lookup(bfs.visited, (st_data_t)enc, &val);
340 if (!val)
341 break;
342 pathlen++;
343 enc = (const char *)val;
344 }
345 depth = pathlen;
346 enc = dname;
 /* Second walk: report each step, last step first. */
347 while (1) {
348 st_lookup(bfs.visited, (st_data_t)enc, &val);
349 if (!val)
350 break;
351 callback((const char *)val, enc, --depth, arg);
352 enc = (const char *)val;
353 }
354 }
355
357
358 return pathlen; /* is -1 if not found */
359}
360
/*
 * Return the transcoder attached to `entry`, or NULL when none is
 * available.  If the entry has no transcoder yet but names a library, the
 * path "enc/trans/" + lib is built as a frozen Ruby string before the
 * transcoder field is checked again.
 *
 * NOTE(review): listing line 378 is absent from this extract; presumably
 * it loads the library at that path, which would be what fills in
 * entry->transcoder -- confirm against the real source.
 */
361static const rb_transcoder *
362load_transcoder_entry(transcoder_entry_t *entry)
363{
364 if (entry->transcoder)
365 return entry->transcoder;
366
367 if (entry->lib) {
368 const char *const lib = entry->lib;
369 const size_t len = strlen(lib);
 /* sizeof - 1 drops the prefix's terminating NUL. */
370 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
371 const VALUE fn = rb_str_new(0, total_len);
372 char *const path = RSTRING_PTR(fn);
373
374 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
375 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
376 rb_str_set_len(fn, total_len);
377 OBJ_FREEZE(fn);
379 }
380
381 if (entry->transcoder)
382 return entry->transcoder;
383
384 return NULL;
385}
386
/*
 * Choose the default replacement sequence for the encoding `encname`.
 *
 * For UTF-8 (compared with encoding_equal) the result is the three bytes
 * EF BF BD, reported as encoded in "UTF-8"; for every other encoding it is
 * "?" reported as "US-ASCII".  The byte length is stored in *len_ret and
 * the encoding name of the replacement in *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    if (!encoding_equal(encname, "UTF-8")) {
        *repl_encname_ptr = "US-ASCII";
        *len_ret = 1;
        return "?";
    }

    *repl_encname_ptr = "UTF-8";
    *len_ret = 3;
    return "\xEF\xBF\xBD";
}
401
402/*
403 * Transcoding engine logic
404 */
405
/*
 * Locate the start of the character currently being processed and store
 * its length (tc->recognized_len plus the bytes between inchar_start and
 * in_p) in *char_len_ptr.
 *
 * When the current input chunk holds at least recognized_len bytes before
 * inchar_start, the character start is simply inchar_start minus
 * recognized_len.  Otherwise the first branch is taken.
 *
 * NOTE(review): listing lines 415 and 417 are absent from this extract;
 * judging by the surviving argument line, the first branch copies the new
 * bytes somewhere and points `ptr` at the result, presumably the
 * transcoding's read buffer -- confirm against the real source.
 */
406static const unsigned char *
407transcode_char_start(rb_transcoding *tc,
408 const unsigned char *in_start,
409 const unsigned char *inchar_start,
410 const unsigned char *in_p,
411 size_t *char_len_ptr)
412{
413 const unsigned char *ptr;
414 if (inchar_start - in_start < tc->recognized_len) {
416 inchar_start, unsigned char, in_p - inchar_start);
418 }
419 else {
420 ptr = inchar_start - tc->recognized_len;
421 }
422 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
423 return ptr;
424}
425
/*
 * Core of the conversion engine: a resumable interpreter for the
 * transcoder's lookup tables.
 *
 * Input is read from *in_pos up to in_stop and output is written from
 * *out_pos up to out_stop.  Whenever the function has to give control
 * back (see SUSPEND below) it stores a resume point number in
 * tc->resume_position, publishes the advanced positions through *in_pos
 * and *out_pos, and returns; the next call jumps straight back to the
 * matching resume_labelN through the switch at the top.
 *
 * NOTE(review): many listing lines are absent from this extract (among
 * them the return type, the table lookup after follow_byte, the STR1
 * output loop, and the bodies of the invalid/incomplete/undef labels), so
 * the comments below describe only what the surviving lines show.
 */
427transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
428 const unsigned char *in_stop, unsigned char *out_stop,
429 rb_transcoding *tc,
430 const int opt)
431{
432 const rb_transcoder *tr = tc->transcoder;
433 int unitlen = tr->input_unit_length;
434 ssize_t readagain_len = 0;
435
436 const unsigned char *inchar_start;
437 const unsigned char *in_p;
438
439 unsigned char *out_p;
440
441 in_p = inchar_start = *in_pos;
442
443 out_p = *out_pos;
444
/*
 * SUSPEND(ret, num): remember resume point `num`, append the bytes of the
 * current character consumed so far to the read buffer, publish in_p and
 * out_p to the caller, account for bytes that must be read again, and
 * return `ret`.  Execution continues at resume_label<num> on re-entry.
 */
445#define SUSPEND(ret, num) \
446 do { \
447 tc->resume_position = (num); \
448 if (0 < in_p - inchar_start) \
449 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
450 inchar_start, unsigned char, in_p - inchar_start); \
451 *in_pos = in_p; \
452 *out_pos = out_p; \
453 tc->recognized_len += in_p - inchar_start; \
454 if (readagain_len) { \
455 tc->recognized_len -= readagain_len; \
456 tc->readagain_len = readagain_len; \
457 } \
458 return (ret); \
459 resume_label ## num:; \
460 } while (0)
/* SUSPEND_OBUF(num): suspend with econv_destination_buffer_full for as
 * long as there is no room for one more output byte. */
461#define SUSPEND_OBUF(num) \
462 do { \
463 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
464 } while (0)
465
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and
 * output has been produced in this call, suspend with econv_after_output. */
466#define SUSPEND_AFTER_OUTPUT(num) \
467 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
468 SUSPEND(econv_after_output, num); \
469 }
470
/* The interpreter registers live in `tc` so they survive a suspend. */
471#define next_table (tc->next_table)
472#define next_info (tc->next_info)
473#define next_byte (tc->next_byte)
474#define writebuf_len (tc->writebuf_len)
475#define writebuf_off (tc->writebuf_off)
476
 /* Re-entry dispatch: 0 means a fresh start, N jumps to resume_labelN. */
477 switch (tc->resume_position) {
478 case 0: break;
479 case 1: goto resume_label1;
480 case 2: goto resume_label2;
481 case 3: goto resume_label3;
482 case 4: goto resume_label4;
483 case 5: goto resume_label5;
484 case 6: goto resume_label6;
485 case 7: goto resume_label7;
486 case 8: goto resume_label8;
487 case 9: goto resume_label9;
488 case 10: goto resume_label10;
489 case 11: goto resume_label11;
490 case 12: goto resume_label12;
491 case 13: goto resume_label13;
492 case 14: goto resume_label14;
493 case 15: goto resume_label15;
494 case 16: goto resume_label16;
495 case 17: goto resume_label17;
496 case 18: goto resume_label18;
497 case 19: goto resume_label19;
498 case 20: goto resume_label20;
499 case 21: goto resume_label21;
500 case 22: goto resume_label22;
501 case 23: goto resume_label23;
502 case 24: goto resume_label24;
503 case 25: goto resume_label25;
504 case 26: goto resume_label26;
505 case 27: goto resume_label27;
506 case 28: goto resume_label28;
507 case 29: goto resume_label29;
508 case 30: goto resume_label30;
509 case 31: goto resume_label31;
510 case 32: goto resume_label32;
511 case 33: goto resume_label33;
512 case 34: goto resume_label34;
513 }
514
 /* Main loop: each iteration starts a new input character at the root
  * of the conversion tree. */
515 while (1) {
516 inchar_start = in_p;
517 tc->recognized_len = 0;
518 next_table = tr->conv_tree_start;
519
521
 /* Input exhausted: leave the loop unless more input may follow. */
522 if (in_stop <= in_p) {
523 if (!(opt & ECONV_PARTIAL_INPUT))
524 break;
526 continue;
527 }
528
/* Accessors for the byte-lookup node selected by next_table. */
529#define BYTE_ADDR(index) (tr->byte_array + (index))
530#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
531#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
532#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
533#define BL_MIN_BYTE (BL_BASE[0])
534#define BL_MAX_BYTE (BL_BASE[1])
535#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
536#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
537
538 next_byte = (unsigned char)*in_p++;
539 follow_byte:
542 else {
544 }
545 follow_info:
 /* The low five bits of next_info select the action to perform. */
546 switch (next_info & 0x1F) {
547 case NOMAP:
548 {
 /* Copy the bytes of the current character to the output unchanged,
  * going through the write buffer. */
549 const unsigned char *p = inchar_start;
550 writebuf_off = 0;
551 while (p < in_p) {
552 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
553 }
555 writebuf_off = 0;
556 while (writebuf_off < writebuf_len) {
557 SUSPEND_OBUF(3);
558 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
559 }
560 }
561 continue;
562 case 0x00: case 0x04: case 0x08: case 0x0C:
563 case 0x10: case 0x14: case 0x18: case 0x1C:
 /* next_info names another lookup table: fetch one more byte and
  * descend; running out of final input means an incomplete character. */
565 while (in_p >= in_stop) {
566 if (!(opt & ECONV_PARTIAL_INPUT))
567 goto incomplete;
569 }
570 next_byte = (unsigned char)*in_p++;
571 next_table = (unsigned int)next_info;
572 goto follow_byte;
573 case ZERObt: /* drop input */
574 continue;
 /* ONEbt..GB4bt: emit 1-4 bytes packed in next_info, waiting for output
  * space before every byte. */
575 case ONEbt:
576 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
577 continue;
578 case TWObt:
579 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
580 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
581 continue;
582 case THREEbt:
583 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
584 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
585 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
586 continue;
587 case FOURbt:
588 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
589 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
590 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
591 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
592 continue;
593 case GB4bt:
594 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
595 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
596 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
597 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
598 continue;
599 case STR1:
600 tc->output_index = 0;
603 tc->output_index++;
604 }
605 continue;
 /* FUNii / FUNsi: a transcoder callback computes a new next_info, which
  * is then interpreted again. */
606 case FUNii:
607 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
608 goto follow_info;
609 case FUNsi:
610 {
611 const unsigned char *char_start;
612 size_t char_len;
613 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
614 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
615 goto follow_info;
616 }
 /* FUNio / FUNso / FUNsio: a transcoder callback produces the output.
  * It writes directly into the output when at least max_output bytes
  * are free, otherwise into the write buffer, which is then drained
  * byte by byte. */
617 case FUNio:
618 SUSPEND_OBUF(13);
619 if (tr->max_output <= out_stop - out_p)
620 out_p += tr->func_io(TRANSCODING_STATE(tc),
621 next_info, out_p, out_stop - out_p);
622 else {
623 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
624 next_info,
626 writebuf_off = 0;
627 while (writebuf_off < writebuf_len) {
628 SUSPEND_OBUF(20);
629 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
630 }
631 }
632 break;
633 case FUNso:
634 {
635 const unsigned char *char_start;
636 size_t char_len;
637 SUSPEND_OBUF(14);
638 if (tr->max_output <= out_stop - out_p) {
639 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
640 out_p += tr->func_so(TRANSCODING_STATE(tc),
641 char_start, (size_t)char_len,
642 out_p, out_stop - out_p);
643 }
644 else {
645 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
646 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
647 char_start, (size_t)char_len,
649 writebuf_off = 0;
650 while (writebuf_off < writebuf_len) {
651 SUSPEND_OBUF(22);
652 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
653 }
654 }
655 break;
656 }
657 case FUNsio:
658 {
659 const unsigned char *char_start;
660 size_t char_len;
661 SUSPEND_OBUF(33);
662 if (tr->max_output <= out_stop - out_p) {
663 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
664 out_p += tr->func_sio(TRANSCODING_STATE(tc),
665 char_start, (size_t)char_len, next_info,
666 out_p, out_stop - out_p);
667 }
668 else {
669 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
670 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
671 char_start, (size_t)char_len, next_info,
673 writebuf_off = 0;
674 while (writebuf_off < writebuf_len) {
675 SUSPEND_OBUF(34);
676 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
677 }
678 }
679 break;
680 }
681 case INVALID:
 /* The bytes seen so far fit within one input unit: the visible code
  * moves in_p so that the reported sequence covers up to one unit
  * (or all remaining input when less is available). */
682 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
683 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
685 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
686 in_p = in_stop;
688 }
689 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
690 in_p = in_stop;
691 }
692 else {
693 in_p = inchar_start + (unitlen - tc->recognized_len);
694 }
695 }
696 else {
 /* More than one unit was consumed: keep whole units before the
  * offending byte as the invalid part and schedule the remaining
  * bytes to be read again. */
697 ssize_t invalid_len; /* including the last byte which causes invalid */
698 ssize_t discard_len;
699 invalid_len = tc->recognized_len + (in_p - inchar_start);
700 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
701 readagain_len = invalid_len - discard_len;
702 }
703 goto invalid;
704 case UNDEF:
705 goto undef;
706 default:
707 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
708 }
709 continue;
710
711 invalid:
713 continue;
714
715 incomplete:
717 continue;
718
719 undef:
721 continue;
722 }
723
724 /* cleanup */
 /* The transcoder's finish_func may emit trailing output; it is
  * delivered directly or through the write buffer like the FUN*o cases. */
725 if (tr->finish_func) {
726 SUSPEND_OBUF(4);
727 if (tr->max_output <= out_stop - out_p) {
728 out_p += tr->finish_func(TRANSCODING_STATE(tc),
729 out_p, out_stop - out_p);
730 }
731 else {
732 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
734 writebuf_off = 0;
735 while (writebuf_off < writebuf_len) {
736 SUSPEND_OBUF(23);
737 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
738 }
739 }
740 }
741 while (1)
743#undef SUSPEND
744#undef next_table
745#undef next_info
746#undef next_byte
747#undef writebuf_len
748#undef writebuf_off
749}
750
/*
 * Wrapper around transcode_restartable0 that first replays any bytes the
 * previous call asked to have read again (tc->readagain_len).
 *
 * The pending bytes are copied out of the read buffer into a stack buffer
 * and run through transcode_restartable0 with ECONV_PARTIAL_INPUT forced
 * on.  If that run ends with anything other than
 * econv_source_buffer_empty, the unconsumed remainder is counted back
 * into tc->readagain_len and the result is returned without touching the
 * caller's input.  Otherwise conversion continues with the real input.
 *
 * NOTE(review): listing lines 751, 761 and 768 (the return type, the
 * declaration of `res`, and the head of the copy-back statement) are
 * absent from this extract.
 */
752transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
753 const unsigned char *in_stop, unsigned char *out_stop,
754 rb_transcoding *tc,
755 const int opt)
756{
757 if (tc->readagain_len) {
758 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
759 const unsigned char *readagain_pos = readagain_buf;
760 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
762
763 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
764 unsigned char, tc->readagain_len);
765 tc->readagain_len = 0;
766 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
767 if (res != econv_source_buffer_empty) {
769 readagain_pos, unsigned char, readagain_stop - readagain_pos);
770 tc->readagain_len += readagain_stop - readagain_pos;
771 return res;
772 }
773 }
774 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
775}
776
777static rb_transcoding *
778rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
779{
780 rb_transcoding *tc;
781
782 tc = ALLOC(rb_transcoding);
783 tc->transcoder = tr;
784 tc->flags = flags;
785 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
786 tc->state.ptr = xmalloc(tr->state_size);
787 if (tr->state_init_func) {
788 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
789 }
790 tc->resume_position = 0;
791 tc->recognized_len = 0;
792 tc->readagain_len = 0;
793 tc->writebuf_len = 0;
794 tc->writebuf_off = 0;
795 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
796 tc->readbuf.ptr = xmalloc(tr->max_input);
797 }
798 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
799 tc->writebuf.ptr = xmalloc(tr->max_output);
800 }
801 return tc;
802}
803
/*
 * Run one conversion step on `tc`.  This is a thin forwarding wrapper
 * that passes its arguments to transcode_restartable, pairing the
 * position pointers first and the stop pointers second, and returns its
 * result unchanged.
 *
 * NOTE(review): the return-type line (listing line 804) is absent from
 * this extract.
 */
805rb_transcoding_convert(rb_transcoding *tc,
806 const unsigned char **input_ptr, const unsigned char *input_stop,
807 unsigned char **output_ptr, unsigned char *output_stop,
808 int flags)
809{
810 return transcode_restartable(
811 input_ptr, output_ptr,
812 input_stop, output_stop,
813 tc, flags);
814}
815
816static void
817rb_transcoding_close(rb_transcoding *tc)
818{
819 const rb_transcoder *tr = tc->transcoder;
820 if (tr->state_fini_func) {
821 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
822 }
823 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
824 xfree(tc->state.ptr);
825 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
826 xfree(tc->readbuf.ptr);
827 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
828 xfree(tc->writebuf.ptr);
829 xfree(tc);
830}
831
832static size_t
833rb_transcoding_memsize(rb_transcoding *tc)
834{
835 size_t size = sizeof(rb_transcoding);
836 const rb_transcoder *tr = tc->transcoder;
837
838 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
839 size += tr->state_size;
840 }
841 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
842 size += tr->max_input;
843 }
844 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
845 size += tr->max_output;
846 }
847 return size;
848}
849
/*
 * Allocate an empty converter object.  `n_hint` is the expected number of
 * transcoders; values of zero or less are treated as 1 and the result is
 * stored in num_allocated.  The visible assignments zero the flags,
 * counters and started marker and set the replacement, input-buffer,
 * last_tc and source_encoding pointers to NULL.
 *
 * NOTE(review): listing lines 860, 861, 873, 876-882 and 884 are absent
 * from this extract; presumably they initialise the remaining fields
 * (encoding names, the elems array, last_error, destination encoding) --
 * confirm against the real source.
 */
850static rb_econv_t *
851rb_econv_alloc(int n_hint)
852{
853 rb_econv_t *ec;
854
855 if (n_hint <= 0)
856 n_hint = 1;
857
858 ec = ALLOC(rb_econv_t);
859 ec->flags = 0;
862 ec->started = 0;
863 ec->replacement_str = NULL;
864 ec->replacement_len = 0;
865 ec->replacement_enc = NULL;
866 ec->replacement_allocated = 0;
867 ec->in_buf_start = NULL;
868 ec->in_data_start = NULL;
869 ec->in_data_end = NULL;
870 ec->in_buf_end = NULL;
871 ec->num_allocated = n_hint;
872 ec->num_trans = 0;
874 ec->num_finished = 0;
875 ec->last_tc = NULL;
883 ec->source_encoding = NULL;
885 return ec;
886}
887
/*
 * Insert transcoder `tr` into the converter's element array at index `i`,
 * shifting the elements from `i` onwards up by one.  The new element gets
 * a fresh transcoding object and an empty 4096-byte output buffer.
 *
 * When `tr` is not a decorator, ec->last_tc is refreshed to the
 * transcoding of the last non-decorator element at index `i` or later.
 * The visible code always returns 0.
 *
 * NOTE(review): listing lines 897 and 910 are absent from this extract;
 * presumably they grow ec->elems to the doubled capacity and initialise
 * the new element's last_result -- confirm against the real source.
 */
888static int
889rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
890{
891 int n, j;
892 int bufsize = 4096;
893 unsigned char *p;
894
 /* Array full: double the recorded capacity. */
895 if (ec->num_trans == ec->num_allocated) {
896 n = ec->num_allocated * 2;
898 ec->num_allocated = n;
899 }
900
901 p = xmalloc(bufsize);
902
 /* Open a gap at index i. */
903 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
904
905 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
906 ec->elems[i].out_buf_start = p;
907 ec->elems[i].out_buf_end = p + bufsize;
908 ec->elems[i].out_data_start = p;
909 ec->elems[i].out_data_end = p;
911
912 ec->num_trans++;
913
 /* Scan backwards from the end for the last non-decorator element. */
914 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
915 for (j = ec->num_trans-1; i <= j; j--) {
916 rb_transcoding *tc = ec->elems[j].tc;
917 const rb_transcoder *tr2 = tc->transcoder;
918 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
919 ec->last_tc = tc;
920 break;
921 }
922 }
923
924 return 0;
925}
926
927static rb_econv_t *
928rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
929{
930 rb_econv_t *ec;
931 int i, ret;
932
933 for (i = 0; i < n; i++) {
934 const rb_transcoder *tr;
935 tr = load_transcoder_entry(entries[i]);
936 if (!tr)
937 return NULL;
938 }
939
940 ec = rb_econv_alloc(n);
941
942 for (i = 0; i < n; i++) {
943 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
944 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
945 if (ret == -1) {
946 rb_econv_close(ec);
947 return NULL;
948 }
949 }
950
951 return ec;
952}
953
957};
958
959static void
960trans_open_i(const char *sname, const char *dname, int depth, void *arg)
961{
962 struct trans_open_t *toarg = arg;
963
964 if (!toarg->entries) {
965 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
966 }
967 toarg->entries[depth] = get_transcoder_entry(sname, dname);
968}
969
/*
 * Open a converter from sname to dname without decorators.
 *
 * When both names are empty the converter has no transcoders.  Otherwise
 * the conversion path is looked up with transcode_search_path, which
 * fills the entries array through trans_open_i; NULL is returned when no
 * path exists or when a transcoder on the path cannot be loaded.  On
 * success the converter's flags are set to `ecflags` and the two encoding
 * names are recorded.
 *
 * NOTE(review): listing line 973 (presumably the declaration of
 * `entries`) is absent from this extract.
 */
970static rb_econv_t *
971rb_econv_open0(const char *sname, const char *dname, int ecflags)
972{
974 int num_trans;
975 rb_econv_t *ec;
976
977 /* Just check if sname and dname are defined */
978 /* (Is this check needed?) */
979 if (*sname) rb_enc_find_index(sname);
980 if (*dname) rb_enc_find_index(dname);
981
982 if (*sname == '\0' && *dname == '\0') {
983 num_trans = 0;
984 entries = NULL;
985 sname = dname = "";
986 }
987 else {
988 struct trans_open_t toarg;
989 toarg.entries = NULL;
990 toarg.num_additional = 0;
991 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
992 entries = toarg.entries;
993 if (num_trans < 0) {
994 xfree(entries);
995 return NULL;
996 }
997 }
998
 /* The entries array is only needed while the converter is assembled. */
999 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1000 xfree(entries);
1001 if (!ec)
1002 return NULL;
1003
1004 ec->flags = ecflags;
1005 ec->source_encoding_name = sname;
1006 ec->destination_encoding_name = dname;
1007
1008 return ec;
1009}
1010
1011#define MAX_ECFLAGS_DECORATORS 32
1012
/*
 * Translate the decorator bits of `ecflags` into decorator names, written
 * to decorators_ret in application order (XML escaping first, newline
 * conversion after).  Returns the number of names written, or -1 when the
 * flag combination is rejected.
 *
 * NOTE(review): listing lines 1019-1021, 1029, 1036 and 1045 are absent
 * from this extract; presumably they hold the accepted newline-decorator
 * cases, the second half of the XML conflict test, and the conditions
 * guarding "xml_attr_content_escape" and "universal_newline" -- confirm
 * against the real source.
 */
1013static int
1014decorator_names(int ecflags, const char **decorators_ret)
1015{
1016 int num_decorators;
1017
 /* Validate the newline-decorator bits. */
1018 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1022 case 0:
1023 break;
1024 default:
1025 return -1;
1026 }
1027
1028 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1030 return -1;
1031
1032 num_decorators = 0;
1033
1034 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1035 decorators_ret[num_decorators++] = "xml_text_escape";
1037 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1038 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1039 decorators_ret[num_decorators++] = "xml_attr_quote";
1040
1041 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1042 decorators_ret[num_decorators++] = "crlf_newline";
1043 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1044 decorators_ret[num_decorators++] = "cr_newline";
1046 decorators_ret[num_decorators++] = "universal_newline";
1047
1048 return num_decorators;
1049}
1050
1051rb_econv_t *
1052rb_econv_open(const char *sname, const char *dname, int ecflags)
1053{
1054 rb_econv_t *ec;
1055 int num_decorators;
1056 const char *decorators[MAX_ECFLAGS_DECORATORS];
1057 int i;
1058
1059 num_decorators = decorator_names(ecflags, decorators);
1060 if (num_decorators == -1)
1061 return NULL;
1062
1063 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1064 if (!ec)
1065 return NULL;
1066
1067 for (i = 0; i < num_decorators; i++)
1068 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1069 rb_econv_close(ec);
1070 return NULL;
1071 }
1072
1073 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1074
1075 return ec;
1076}
1077
/*
 * Drive the chain of transcoders, starting at element `start`, until a
 * full pass makes no progress.
 *
 * Element i reads from the caller's input when i is 0 and from the
 * previous element's output buffer otherwise; it writes to the caller's
 * output when it is the last element and to its own output buffer
 * otherwise.  A pass is repeated as long as some element consumed input
 * or produced output.  The visible code returns the index of an element
 * that reported econv_after_output, and -1 once nothing moves any more.
 *
 * NOTE(review): several listing lines are absent from this extract (the
 * declaration of `res`, the length computation before the MEMMOVE, the
 * statement under `if (ec->num_finished != i)`, and some `case` labels of
 * the result switch), so other return paths may exist.
 */
1078static int
1079trans_sweep(rb_econv_t *ec,
1080 const unsigned char **input_ptr, const unsigned char *input_stop,
1081 unsigned char **output_ptr, unsigned char *output_stop,
1082 int flags,
1083 int start)
1084{
1085 int try;
1086 int i, f;
1087
1088 const unsigned char **ipp, *is, *iold;
1089 unsigned char **opp, *os, *oold;
1091
1092 try = 1;
1093 while (try) {
1094 try = 0;
1095 for (i = start; i < ec->num_trans; i++) {
1096 rb_econv_elem_t *te = &ec->elems[i];
1097
 /* Select the input side for element i. */
1098 if (i == 0) {
1099 ipp = input_ptr;
1100 is = input_stop;
1101 }
1102 else {
1103 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1104 ipp = (const unsigned char **)&prev_te->out_data_start;
1105 is = prev_te->out_data_end;
1106 }
1107
 /* Select the output side for element i. */
1108 if (i == ec->num_trans-1) {
1109 opp = output_ptr;
1110 os = output_stop;
1111 }
1112 else {
 /* Slide pending data to the front of the buffer to regain space. */
1113 if (te->out_buf_start != te->out_data_start) {
1115 ssize_t off = te->out_data_start - te->out_buf_start;
1116 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1117 te->out_data_start = te->out_buf_start;
1118 te->out_data_end -= off;
1119 }
1120 opp = &te->out_data_end;
1121 os = te->out_buf_end;
1122 }
1123
1124 f = flags;
1125 if (ec->num_finished != i)
 /* ECONV_AFTER_OUTPUT is honoured once, by element 0 only. */
1127 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1128 start = 1;
1129 flags &= ~ECONV_AFTER_OUTPUT;
1130 }
1131 if (i != 0)
1132 f &= ~ECONV_AFTER_OUTPUT;
1133 iold = *ipp;
1134 oold = *opp;
1135 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
 /* Any movement on either side warrants another pass. */
1136 if (iold != *ipp || oold != *opp)
1137 try = 1;
1138
1139 switch (res) {
1143 case econv_after_output:
1144 return i;
1145
1148 break;
1149
1150 case econv_finished:
1151 ec->num_finished = i+1;
1152 break;
1153 }
1154 }
1155 }
1156 return -1;
1157}
1158
/*
 * Convert through the whole transcoder chain and report a single result.
 *
 * NULL input_ptr or output_ptr is replaced by an empty dummy range.  The
 * elements' last results are inspected from the end of the chain to find
 * where sweeping has to resume; trans_sweep is then repeated until it
 * reports -1 or the last element.  Finally the elements are scanned from
 * the end again and the first reportable result is returned, with its
 * element index stored in *result_position_ptr when that pointer is
 * given (-1 when no element has anything to report).
 *
 * NOTE(review): several listing lines are absent from this extract (the
 * statement under the first `if`, `case` labels in the first switch, part
 * of the condition and arguments around the recursive call, the head of
 * the final loop body, and the final return), so details of the
 * selection logic cannot be confirmed from here.
 */
1159static rb_econv_result_t
1160rb_trans_conv(rb_econv_t *ec,
1161 const unsigned char **input_ptr, const unsigned char *input_stop,
1162 unsigned char **output_ptr, unsigned char *output_stop,
1163 int flags,
1164 int *result_position_ptr)
1165{
1166 int i;
1167 int needreport_index;
1168 int sweep_start;
1169
1170 unsigned char empty_buf;
1171 unsigned char *empty_ptr = &empty_buf;
1172
 /* Substitute an empty range for a missing input or output. */
1173 if (!input_ptr) {
1174 input_ptr = (const unsigned char **)&empty_ptr;
1175 input_stop = empty_ptr;
1176 }
1177
1178 if (!output_ptr) {
1179 output_ptr = &empty_ptr;
1180 output_stop = empty_ptr;
1181 }
1182
1183 if (ec->elems[0].last_result == econv_after_output)
1185
1186 for (i = ec->num_trans-1; 0 <= i; i--) {
1187 switch (ec->elems[i].last_result) {
1191 case econv_after_output:
1192 case econv_finished:
1193 sweep_start = i+1;
1194 goto found_needreport;
1195
1198 break;
1199
1200 default:
1201 rb_bug("unexpected transcode last result");
1202 }
1203 }
1204
1205 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1206
1208 (flags & ECONV_AFTER_OUTPUT)) {
1210
 /* Recurse without input; an empty-source result is reported to the
  * caller as econv_after_output. */
1211 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1213 result_position_ptr);
1214
1215 if (res == econv_source_buffer_empty)
1216 return econv_after_output;
1217 return res;
1218 }
1219
1220 sweep_start = 0;
1221
1222 found_needreport:
1223
 /* Keep sweeping from just past the element that asked to report. */
1224 do {
1225 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1226 sweep_start = needreport_index + 1;
1227 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1228
1229 for (i = ec->num_trans-1; 0 <= i; i--) {
1232 if (res == econv_invalid_byte_sequence ||
1233 res == econv_incomplete_input ||
1235 res == econv_after_output) {
1237 }
1238 if (result_position_ptr)
1239 *result_position_ptr = i;
1240 return res;
1241 }
1242 }
1243 if (result_position_ptr)
1244 *result_position_ptr = -1;
1246}
1247
/*
 * One conversion step over the converter `ec`.
 *
 * Clears ec->last_error, then either copies bytes straight through (when
 * ec->num_trans == 0) or flushes pending output of the last element and runs
 * rb_trans_conv() over the transcoder chain.  The result code is stored in
 * ec->last_error.result and returned.
 *
 * NOTE(review): the embedded listing numbers skip 1254, 1268, 1295, 1297,
 * 1313, 1357 and 1360-1363, so several statements -- including the
 * declaration of `res` and some assignments to it -- are not visible here.
 */
1248static rb_econv_result_t
1249rb_econv_convert0(rb_econv_t *ec,
1250 const unsigned char **input_ptr, const unsigned char *input_stop,
1251 unsigned char **output_ptr, unsigned char *output_stop,
1252 int flags)
1253{
1255 int result_position;
1256 int has_output = 0;
1257
1258 memset(&ec->last_error, 0, sizeof(ec->last_error));
1259
     /* No transcoders: plain byte copy, buffered input first, then *input_ptr. */
1260 if (ec->num_trans == 0) {
1261 size_t len;
1262 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
     /* Output room is smaller than the buffered data: copy only what fits. */
1263 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1264 len = output_stop - *output_ptr;
1265 memcpy(*output_ptr, ec->in_data_start, len);
1266 *output_ptr = output_stop;
1267 ec->in_data_start += len;
1269 goto gotresult;
1270 }
1271 len = ec->in_data_end - ec->in_data_start;
1272 memcpy(*output_ptr, ec->in_data_start, len);
1273 *output_ptr += len;
     /* Buffered input fully consumed: rewind both data pointers to the buffer start. */
1274 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1275 if (flags & ECONV_AFTER_OUTPUT) {
1276 res = econv_after_output;
1277 goto gotresult;
1278 }
1279 }
     /* len = min(free output space, remaining caller input). */
1280 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1281 len = output_stop - *output_ptr;
1282 }
1283 else {
1284 len = input_stop - *input_ptr;
1285 }
     /* With ECONV_AFTER_OUTPUT only a single byte is copied per call. */
1286 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1287 *(*output_ptr)++ = *(*input_ptr)++;
1288 res = econv_after_output;
1289 goto gotresult;
1290 }
1291 memcpy(*output_ptr, *input_ptr, len);
1292 *output_ptr += len;
1293 *input_ptr += len;
1294 if (*input_ptr != input_stop)
1296 else if (flags & ECONV_PARTIAL_INPUT)
1298 else
1299 res = econv_finished;
1300 goto gotresult;
1301 }
1302
     /* Flush output still pending in the last element's output buffer. */
1303 if (ec->elems[ec->num_trans-1].out_data_start) {
1304 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1305 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1306 if (data_start != data_end) {
1307 size_t len;
1308 if (output_stop - *output_ptr < data_end - data_start) {
1309 len = output_stop - *output_ptr;
1310 memcpy(*output_ptr, data_start, len);
1311 *output_ptr = output_stop;
1312 ec->elems[ec->num_trans-1].out_data_start += len;
1314 goto gotresult;
1315 }
1316 len = data_end - data_start;
1317 memcpy(*output_ptr, data_start, len);
1318 *output_ptr += len;
1319 ec->elems[ec->num_trans-1].out_data_start =
1320 ec->elems[ec->num_trans-1].out_data_end =
1321 ec->elems[ec->num_trans-1].out_buf_start;
1322 has_output = 1;
1323 }
1324 }
1325
     /* Input buffered inside ec is converted before the caller's input;
      * ECONV_AFTER_OUTPUT is masked off and ECONV_PARTIAL_INPUT forced on. */
1326 if (ec->in_buf_start &&
1327 ec->in_data_start != ec->in_data_end) {
1328 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1329 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1330 if (res != econv_source_buffer_empty)
1331 goto gotresult;
1332 }
1333
     /* Output was already flushed above and the caller wants AFTER_OUTPUT:
      * run with an empty input range and report source_buffer_empty as
      * after_output. */
1334 if (has_output &&
1335 (flags & ECONV_AFTER_OUTPUT) &&
1336 *input_ptr != input_stop) {
1337 input_stop = *input_ptr;
1338 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1339 if (res == econv_source_buffer_empty)
1340 res = econv_after_output;
1341 }
1342 else if ((flags & ECONV_AFTER_OUTPUT) ||
1343 ec->num_trans == 1) {
1344 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1345 }
1346 else {
     /* Caller did not request AFTER_OUTPUT: set it internally and keep
      * calling until some other result is produced. */
1347 flags |= ECONV_AFTER_OUTPUT;
1348 do {
1349 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1350 } while (res == econv_after_output);
1351 }
1352
1353 gotresult:
1354 ec->last_error.result = res;
     /* For error results, record details taken from the transcoding at
      * result_position (part of this section is missing from the listing). */
1355 if (res == econv_invalid_byte_sequence ||
1356 res == econv_incomplete_input ||
1358 rb_transcoding *error_tc = ec->elems[result_position].tc;
1359 ec->last_error.error_tc = error_tc;
1364 ec->last_error.readagain_len = error_tc->readagain_len;
1365 }
1366
1367 return res;
1368}
1369
1370static int output_replacement_character(rb_econv_t *ec);
1371
/*
 * Inserts hexadecimal character references ("&#x...;") into the output of
 * `ec`, one per 4-byte big-endian unit of a UTF-32BE byte string.
 *
 * When ec->last_error.source_encoding is UTF-32BE the error bytes are used
 * directly; otherwise allocate_converted_string() is asked for a UTF-32BE
 * string, with utfbuf offered as the initial destination buffer.
 *
 * Returns 0 on success, -1 on failure.
 *
 * NOTE(review): the embedded listing numbers skip 1389, so part of the
 * allocate_converted_string() argument list is not visible here.
 */
1372static int
1373output_hex_charref(rb_econv_t *ec)
1374{
1375 int ret;
1376 unsigned char utfbuf[1024];
1377 const unsigned char *utf;
1378 size_t utf_len;
1379 int utf_allocated = 0;
1380 char charef_buf[16];
1381 const unsigned char *p;
1382
1383 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1384 utf = ec->last_error.error_bytes_start;
1385 utf_len = ec->last_error.error_bytes_len;
1386 }
1387 else {
1388 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1390 utfbuf, sizeof(utfbuf),
1391 &utf_len);
1392 if (!utf)
1393 return -1;
     /* A result that is neither utfbuf nor the error bytes is treated as a
      * buffer this function must xfree() before returning. */
1394 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1395 utf_allocated = 1;
1396 }
1397
     /* Only whole 4-byte units are accepted. */
1398 if (utf_len % 4 != 0)
1399 goto fail;
1400
1401 p = utf;
1402 while (4 <= utf_len) {
     /* Assemble one big-endian 32-bit value.
      * NOTE(review): p[0] is promoted to int before "<< 24"; for bytes
      * >= 0x80 that shifts into the sign bit -- consider casting to
      * unsigned int first. */
1403 unsigned int u = 0;
1404 u += p[0] << 24;
1405 u += p[1] << 16;
1406 u += p[2] << 8;
1407 u += p[3];
1408 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1409
1410 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1411 if (ret == -1)
1412 goto fail;
1413
1414 p += 4;
1415 utf_len -= 4;
1416 }
1417
1418 if (utf_allocated)
1419 xfree((void *)utf);
1420 return 0;
1421
1422 fail:
1423 if (utf_allocated)
1424 xfree((void *)utf);
1425 return -1;
1426}
1427
/*
 * NOTE(review): the listing lines that carry this function's return type and
 * name (1428-1429) are missing, as are 1434, 1459, 1470 and 1475 (a local
 * declaration and the case labels of both switches).
 *
 * Visible behavior: marks the converter as started, substitutes a dummy
 * empty buffer for a NULL input_ptr or output_ptr, and calls
 * rb_econv_convert0().  After an invalid/incomplete sequence or an undefined
 * conversion it may insert a replacement (output_replacement_character() or
 * output_hex_charref()) and, if that succeeds, jump back to `resume`.
 * Otherwise the result of rb_econv_convert0() is returned.
 */
1430 const unsigned char **input_ptr, const unsigned char *input_stop,
1431 unsigned char **output_ptr, unsigned char *output_stop,
1432 int flags)
1433{
1435
1436 unsigned char empty_buf;
1437 unsigned char *empty_ptr = &empty_buf;
1438
1439 ec->started = 1;
1440
     /* NULL input: use an empty range (start == stop). */
1441 if (!input_ptr) {
1442 input_ptr = (const unsigned char **)&empty_ptr;
1443 input_stop = empty_ptr;
1444 }
1445
     /* NULL output: likewise, a zero-length output range. */
1446 if (!output_ptr) {
1447 output_ptr = &empty_ptr;
1448 output_stop = empty_ptr;
1449 }
1450
1451 resume:
1452 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1453
1454 if (ret == econv_invalid_byte_sequence ||
1455 ret == econv_incomplete_input) {
1456 /* deal with invalid byte sequence */
1457 /* todo: add more alternative behaviors */
1458 switch (ec->flags & ECONV_INVALID_MASK) {
1460 if (output_replacement_character(ec) == 0)
1461 goto resume;
1462 }
1463 }
1464
1465 if (ret == econv_undefined_conversion) {
1466 /* valid character in source encoding
1467 * but no related character(s) in destination encoding */
1468 /* todo: add more alternative behaviors */
1469 switch (ec->flags & ECONV_UNDEF_MASK) {
1471 if (output_replacement_character(ec) == 0)
1472 goto resume;
1473 break;
1474
1476 if (output_hex_charref(ec) == 0)
1477 goto resume;
1478 break;
1479 }
1480 }
1481
1482 return ret;
1483}
1484
/*
 * Returns an encoding name taken from the converter's last transcoding:
 * "" when ec->last_tc is NULL, the transcoder's src_encoding when its
 * asciicompat_type is asciicompat_encoder, and its dst_encoding otherwise.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1486) is missing.
 */
1485const char *
1487{
1488 rb_transcoding *tc = ec->last_tc;
1489 const rb_transcoder *tr;
1490
1491 if (tc == NULL)
1492 return "";
1493
1494 tr = tc->transcoder;
1495
1496 if (tr->asciicompat_type == asciicompat_encoder)
1497 return tr->src_encoding;
1498 return tr->dst_encoding;
1499}
1500
/*
 * Converts the `len` bytes at `str` from encoding `sname` to `dname` with a
 * temporary converter opened by rb_econv_open().
 *
 * The destination is caller_dst_buf (caller_dst_bufsize bytes) when it is
 * non-NULL, otherwise an xmalloc'ed buffer of `len` bytes (1 if len == 0).
 * While the conversion reports econv_destination_buffer_full the buffer size
 * is doubled, moving to heap storage if the caller's buffer was in use.
 *
 * On success the converted length is stored in *dst_len_ptr and the buffer
 * is returned; it equals caller_dst_buf only if that buffer was sufficient,
 * so the caller must compare before freeing.  On failure (converter cannot be
 * opened, size would overflow, or the final result is not econv_finished)
 * NULL is returned and any heap buffer is freed.
 *
 * NOTE(review): the embedded listing numbers skip 1512; the declaration of
 * `res` is not visible here.
 */
1501static unsigned char *
1502allocate_converted_string(const char *sname, const char *dname,
1503 const unsigned char *str, size_t len,
1504 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1505 size_t *dst_len_ptr)
1506{
1507 unsigned char *dst_str;
1508 size_t dst_len;
1509 size_t dst_bufsize;
1510
1511 rb_econv_t *ec;
1513
1514 const unsigned char *sp;
1515 unsigned char *dp;
1516
1517 if (caller_dst_buf)
1518 dst_bufsize = caller_dst_bufsize;
1519 else if (len == 0)
1520 dst_bufsize = 1;
1521 else
1522 dst_bufsize = len;
1523
1524 ec = rb_econv_open(sname, dname, 0);
1525 if (ec == NULL)
1526 return NULL;
1527 if (caller_dst_buf)
1528 dst_str = caller_dst_buf;
1529 else
1530 dst_str = xmalloc(dst_bufsize);
1531 dst_len = 0;
1532 sp = str;
1533 dp = dst_str+dst_len;
1534 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1535 dst_len = dp - dst_str;
1536 while (res == econv_destination_buffer_full) {
     /* Refuse to double a size that would overflow size_t. */
1537 if (SIZE_MAX/2 < dst_bufsize) {
1538 goto fail;
1539 }
1540 dst_bufsize *= 2;
1541 if (dst_str == caller_dst_buf) {
     /* First growth past the caller's buffer: switch to heap storage and
      * carry over the old contents (dst_bufsize/2 is the previous size). */
1542 unsigned char *tmp;
1543 tmp = xmalloc(dst_bufsize);
1544 memcpy(tmp, dst_str, dst_bufsize/2);
1545 dst_str = tmp;
1546 }
1547 else {
1548 dst_str = xrealloc(dst_str, dst_bufsize);
1549 }
     /* Continue writing after the bytes produced so far. */
1550 dp = dst_str+dst_len;
1551 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1552 dst_len = dp - dst_str;
1553 }
1554 if (res != econv_finished) {
1555 goto fail;
1556 }
1557 rb_econv_close(ec);
1558 *dst_len_ptr = dst_len;
1559 return dst_str;
1560
1561 fail:
1562 if (dst_str != caller_dst_buf)
1563 xfree(dst_str);
1564 rb_econv_close(ec);
1565 return NULL;
1566}
1567
1568/* result: 0:success -1:failure */
/*
 * Inserts the `len` bytes at `str` (encoded as `str_encoding`) into one of
 * the converter's internal buffers.
 *
 * The bytes are first converted to the encoding reported by
 * rb_econv_encoding_to_insert_output() unless encoding_equal() says the two
 * already match.  The target buffer is:
 *   - ec's input buffer when there are no transcoders, or when the last
 *     transcoder is an asciicompat_encoder and is also the first one;
 *   - the previous element's output buffer when the last transcoder is an
 *     asciicompat_encoder and is not the first one;
 *   - the last element's output buffer otherwise.
 * The buffer is allocated, compacted or grown as needed before the copy.
 *
 * NOTE(review): the listing line with the function name (1570) and line
 * 1668 (the opening of the block closed at 1672) are missing.
 */
1569int
1571 const unsigned char *str, size_t len, const char *str_encoding)
1572{
1573 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1574 unsigned char insert_buf[4096];
1575 const unsigned char *insert_str = NULL;
1576 size_t insert_len;
1577
1578 int last_trans_index;
1579 rb_transcoding *tc;
1580
1581 unsigned char **buf_start_p;
1582 unsigned char **data_start_p;
1583 unsigned char **data_end_p;
1584 unsigned char **buf_end_p;
1585
1586 size_t need;
1587
1588 ec->started = 1;
1589
1590 if (len == 0)
1591 return 0;
1592
1593 if (encoding_equal(insert_encoding, str_encoding)) {
1594 insert_str = str;
1595 insert_len = len;
1596 }
1597 else {
     /* insert_buf is offered as scratch space; a larger result comes back
      * as a heap buffer that is freed at the end of this function. */
1598 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1599 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1600 if (insert_str == NULL)
1601 return -1;
1602 }
1603
1604 need = insert_len;
1605
1606 last_trans_index = ec->num_trans-1;
1607 if (ec->num_trans == 0) {
1608 tc = NULL;
1609 buf_start_p = &ec->in_buf_start;
1610 data_start_p = &ec->in_data_start;
1611 data_end_p = &ec->in_data_end;
1612 buf_end_p = &ec->in_buf_end;
1613 }
1614 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1615 tc = ec->elems[last_trans_index].tc;
     /* Room is also needed for the transcoding's read-again bytes; the
      * comparison below detects size_t wrap-around of the sum. */
1616 need += tc->readagain_len;
1617 if (need < insert_len)
1618 goto fail;
1619 if (last_trans_index == 0) {
1620 buf_start_p = &ec->in_buf_start;
1621 data_start_p = &ec->in_data_start;
1622 data_end_p = &ec->in_data_end;
1623 buf_end_p = &ec->in_buf_end;
1624 }
1625 else {
1626 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1627 buf_start_p = &ee->out_buf_start;
1628 data_start_p = &ee->out_data_start;
1629 data_end_p = &ee->out_data_end;
1630 buf_end_p = &ee->out_buf_end;
1631 }
1632 }
1633 else {
1634 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1635 buf_start_p = &ee->out_buf_start;
1636 data_start_p = &ee->out_data_start;
1637 data_end_p = &ee->out_data_end;
1638 buf_end_p = &ee->out_buf_end;
1639 tc = ec->elems[last_trans_index].tc;
1640 }
1641
1642 if (*buf_start_p == NULL) {
     /* No buffer yet: allocate exactly `need` bytes. */
1643 unsigned char *buf = xmalloc(need);
1644 *buf_start_p = buf;
1645 *data_start_p = buf;
1646 *data_end_p = buf;
1647 *buf_end_p = buf+need;
1648 }
1649 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
     /* Not enough tail room: slide the live data to the buffer start... */
1650 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1651 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1652 *data_start_p = *buf_start_p;
1653 if ((size_t)(*buf_end_p - *data_end_p) < need) {
     /* ...and if that is still insufficient, grow to live data + need
      * (with a wrap-around check on the new size). */
1654 unsigned char *buf;
1655 size_t s = (*data_end_p - *buf_start_p) + need;
1656 if (s < need)
1657 goto fail;
1658 buf = xrealloc(*buf_start_p, s);
1659 *data_start_p = buf;
1660 *data_end_p = buf + (*data_end_p - *buf_start_p);
1661 *buf_start_p = buf;
1662 *buf_end_p = buf + s;
1663 }
1664 }
1665
1666 memcpy(*data_end_p, insert_str, insert_len);
1667 *data_end_p += insert_len;
     /* The read-again bytes of `tc` are appended after the inserted string
      * and then cleared from the transcoding. */
1669 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1670 *data_end_p += tc->readagain_len;
1671 tc->readagain_len = 0;
1672 }
1673
1674 if (insert_str != str && insert_str != insert_buf)
1675 xfree((void*)insert_str);
1676 return 0;
1677
1678 fail:
1679 if (insert_str != str && insert_str != insert_buf)
1680 xfree((void*)insert_str);
1681 return -1;
1682}
1683
/*
 * Releases a converter: the replacement string (only when
 * ec->replacement_allocated is set), every element's transcoding and output
 * buffer, the input buffer, the element array, and finally `ec` itself.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1685) is missing.
 */
1684void
1686{
1687 int i;
1688
1689 if (ec->replacement_allocated) {
1690 xfree((void *)ec->replacement_str);
1691 }
1692 for (i = 0; i < ec->num_trans; i++) {
1693 rb_transcoding_close(ec->elems[i].tc);
1694 if (ec->elems[i].out_buf_start)
1695 xfree(ec->elems[i].out_buf_start);
1696 }
1697 xfree(ec->in_buf_start);
1698 xfree(ec->elems);
1699 xfree(ec);
1700}
1701
/*
 * Computes a byte count for a converter: sizeof(rb_econv_t), plus the
 * replacement string when it is owned (replacement_allocated), plus for each
 * element rb_transcoding_memsize() and the span of its output buffer, plus
 * the span of the input buffer and the allocated element array.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1703) is missing.
 */
1702size_t
1704{
1705 size_t size = sizeof(rb_econv_t);
1706 int i;
1707
1708 if (ec->replacement_allocated) {
1709 size += ec->replacement_len;
1710 }
1711 for (i = 0; i < ec->num_trans; i++) {
1712 size += rb_transcoding_memsize(ec->elems[i].tc);
1713
1714 if (ec->elems[i].out_buf_start) {
1715 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1716 }
1717 }
1718 size += ec->in_buf_end - ec->in_buf_start;
1719 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1720
1721 return size;
1722}
1723
/*
 * Returns the readagain_len of the first transcoding as an int: 0 when the
 * converter has no transcoders, and clamped to INT_MAX on platforms where
 * size_t is wider than int.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1725) is missing.
 */
1724int
1726{
1727 if (ec->num_trans == 0)
1728 return 0;
1729#if SIZEOF_SIZE_T > SIZEOF_INT
1730 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1731#endif
1732 return (int)ec->elems[0].tc->readagain_len;
1733}
1734
/*
 * Reduces the readagain_len of the first transcoding by `n`.  Does nothing
 * when the converter has no transcoders or n == 0.
 *
 * NOTE(review): the embedded listing numbers skip 1742, and `p` is not
 * referenced in the visible lines -- presumably the missing line copies the
 * put-back bytes into `p`; confirm against the full source.
 */
1735void
1736rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1737{
1738 rb_transcoding *tc;
1739 if (ec->num_trans == 0 || n == 0)
1740 return;
1741 tc = ec->elems[0].tc;
1743 tc->readagain_len -= n;
1744}
1745
1749};
1750
1751static int
1752asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1753{
1754 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1755 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1756 const rb_transcoder *tr;
1757
1758 if (DECORATOR_P(entry->sname, entry->dname))
1759 return ST_CONTINUE;
1760 tr = load_transcoder_entry(entry);
1761 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1762 data->ascii_compat_name = tr->dst_encoding;
1763 return ST_STOP;
1764 }
1765 return ST_CONTINUE;
1766}
1767
/*
 * Looks up `ascii_incompat_name` in transcoder_table and, when its
 * second-level table holds exactly one entry, runs asciicompat_encoding_i
 * over it and returns the name that callback recorded (NULL if none).
 * Returns NULL when the name is not in the table or the second-level table
 * does not have exactly one entry.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1769) and line 1789 are missing.
 */
1768const char *
1770{
1771 st_data_t v;
1772 st_table *table2;
1773 struct asciicompat_encoding_t data;
1774
1775 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1776 return NULL;
1777 table2 = (st_table *)v;
1778
1779 /*
1780 * Assumption:
1781 * There is at most one transcoder for
1782 * converting from ASCII incompatible encoding.
1783 *
1784 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1785 */
1786 if (table2->num_entries != 1)
1787 return NULL;
1788
1790 data.ascii_compat_name = NULL;
1791 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1792 return data.ascii_compat_name;
1793}
1794
/*
 * Converts the `len` bytes at `ss` with `ec` and appends the result to the
 * string `dst`, which is created with rb_str_buf_new(len) when nil.
 * Returns `dst`.
 *
 * Each pass makes sure the string has room for at least len + max_output
 * bytes beyond its current length, converts into the spare capacity, then
 * advances ss/len past the consumed input.  Passes repeat while the result
 * is econv_destination_buffer_full.  Raises ArgumentError ("too long
 * string") if the required capacity would exceed LONG_MAX.
 *
 * NOTE(review): the embedded listing numbers skip 1800, 1806 and 1832; the
 * declaration of `res`, the body of the `if (ec->destination_encoding)`
 * branch and one statement inside the loop are not visible here.
 */
1795VALUE
1796rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1797{
1798 unsigned const char *sp, *se;
1799 unsigned char *ds, *dp, *de;
1801 int max_output;
1802
1803 if (NIL_P(dst)) {
1804 dst = rb_str_buf_new(len);
1805 if (ec->destination_encoding)
1807 }
1808
     /* max_output comes from the last transcoder, or 1 when there is none. */
1809 if (ec->last_tc)
1810 max_output = ec->last_tc->transcoder->max_output;
1811 else
1812 max_output = 1;
1813
1814 do {
1815 long dlen = RSTRING_LEN(dst);
1816 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1817 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1818 if (LONG_MAX < new_capa)
1819 rb_raise(rb_eArgError, "too long string");
     /* Resize to gain capacity, then restore the logical length. */
1820 rb_str_resize(dst, new_capa);
1821 rb_str_set_len(dst, dlen);
1822 }
1823 sp = (const unsigned char *)ss;
1824 se = sp + len;
1825 ds = (unsigned char *)RSTRING_PTR(dst);
1826 de = ds + rb_str_capacity(dst);
     /* Both ds and dp now point just past the existing contents. */
1827 dp = ds += dlen;
1828 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1829 len -= (const char *)sp - ss;
1830 ss = (const char *)sp;
1831 rb_str_set_len(dst, dlen + (dp - ds));
1833 } while (res == econv_destination_buffer_full);
1834
1835 return dst;
1836}
1837
/*
 * Converts `len` bytes of `src` starting at byte offset `off` and appends
 * the result to `dst` via rb_econv_append(); returns the resulting string.
 *
 * NOTE(review): the embedded listing numbers skip 1841 and 1843, so the
 * statements before and after the call are not visible here.
 */
1838VALUE
1839rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1840{
1842 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1844 return dst;
1845}
1846
/*
 * Converts the whole of `src` (offset 0, RSTRING_LEN(src) bytes) and appends
 * it to `dst` through rb_econv_substr_append().
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1848) is missing.
 */
1847VALUE
1849{
1850 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1851}
1852
1853VALUE
1854rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1855{
1856 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1857}
1858
/*
 * Converts the whole of `src` into a fresh string by calling
 * rb_econv_substr_append() with offset 0, RSTRING_LEN(src) and a nil
 * destination.
 *
 * NOTE(review): the listing line with the function name and parameter list
 * (1860) is missing.
 */
1859VALUE
1861{
1862 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1863}
1864
1865static int
1866rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1867{
1868 transcoder_entry_t *entry;
1869 const rb_transcoder *tr;
1870
1871 if (ec->started != 0)
1872 return -1;
1873
1874 entry = get_transcoder_entry(sname, dname);
1875 if (!entry)
1876 return -1;
1877
1878 tr = load_transcoder_entry(entry);
1879 if (!tr) return -1;
1880
1881 return rb_econv_add_transcoder_at(ec, tr, n);
1882}
1883
1884static int
1885rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1886{
1887 return rb_econv_add_converter(ec, "", decorator_name, n);
1888}
1889
1890int
1891rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1892{
1893 const rb_transcoder *tr;
1894
1895 if (ec->num_trans == 0)
1896 return rb_econv_decorate_at(ec, decorator_name, 0);
1897
1898 tr = ec->elems[0].tc->transcoder;
1899
1900 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1901 tr->asciicompat_type == asciicompat_decoder)
1902 return rb_econv_decorate_at(ec, decorator_name, 1);
1903
1904 return rb_econv_decorate_at(ec, decorator_name, 0);
1905}
1906
1907int
1908rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1909{
1910 const rb_transcoder *tr;
1911
1912 if (ec->num_trans == 0)
1913 return rb_econv_decorate_at(ec, decorator_name, 0);
1914
1915 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1916
1917 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1918 tr->asciicompat_type == asciicompat_encoder)
1919 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1920
1921 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1922}
1923
/*
 * Removes a newline decorator from the converter and clears the
 * ECONV_NEWLINE_DECORATOR_MASK bits in ec->flags.
 *
 * A decorator name is chosen from ec->flags; every element whose transcoder
 * matches that decorator is closed and its output buffer freed, and the
 * remaining elements are compacted toward the front of ec->elems.
 *
 * NOTE(review): the listing line with the function name (1925) and the
 * three case labels of the switch (1930, 1933, 1936) are missing.
 * NOTE(review): the result of get_transcoder_entry() is dereferenced without
 * a NULL check, while rb_econv_add_converter() in this file does check it --
 * confirm the entry is guaranteed to exist here.
 */
1924void
1926{
1927 const char *dname = 0;
1928
1929 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1931 dname = "universal_newline";
1932 break;
1934 dname = "crlf_newline";
1935 break;
1937 dname = "cr_newline";
1938 break;
1939 }
1940
1941 if (dname) {
1942 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
     /* Iterate over the original count; ec->num_trans shrinks as matching
      * elements are dropped, and j tracks the compacted write position. */
1943 int num_trans = ec->num_trans;
1944 int i, j = 0;
1945
1946 for (i=0; i < num_trans; i++) {
1947 if (transcoder == ec->elems[i].tc->transcoder) {
1948 rb_transcoding_close(ec->elems[i].tc);
1949 xfree(ec->elems[i].out_buf_start);
1950 ec->num_trans--;
1951 }
1952 else
1953 ec->elems[j++] = ec->elems[i];
1954 }
1955 }
1956
1957 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
1958}
1959
/*
 * Appends a human-readable description of a conversion to `mesg` (a new
 * empty string is created when `mesg` is nil) and returns it.
 *
 * The encoding part is "<sname> to <dname>", or just the non-empty name when
 * one of them is empty.  Decorator flags present in `ecflags` are appended
 * as a comma-separated list, preceded by " with " when an encoding part was
 * written.  If nothing was described at all, "no-conversion" is appended.
 *
 * NOTE(review): the embedded listing numbers skip 1979-1981, so the rest of
 * the flag mask in the `if` at 1978 is not visible here.
 */
1960static VALUE
1961econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1962{
1963 int has_description = 0;
1964
1965 if (NIL_P(mesg))
1966 mesg = rb_str_new(NULL, 0);
1967
1968 if (*sname != '\0' || *dname != '\0') {
1969 if (*sname == '\0')
1970 rb_str_cat2(mesg, dname);
1971 else if (*dname == '\0')
1972 rb_str_cat2(mesg, sname);
1973 else
1974 rb_str_catf(mesg, "%s to %s", sname, dname);
1975 has_description = 1;
1976 }
1977
1978 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
     /* `pre` is the separator: empty before the first name, "," afterwards. */
1982 const char *pre = "";
1983 if (has_description)
1984 rb_str_cat2(mesg, " with ");
1985 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
1986 rb_str_cat2(mesg, pre); pre = ",";
1987 rb_str_cat2(mesg, "universal_newline");
1988 }
1989 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
1990 rb_str_cat2(mesg, pre); pre = ",";
1991 rb_str_cat2(mesg, "crlf_newline");
1992 }
1993 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
1994 rb_str_cat2(mesg, pre); pre = ",";
1995 rb_str_cat2(mesg, "cr_newline");
1996 }
1997 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
1998 rb_str_cat2(mesg, pre); pre = ",";
1999 rb_str_cat2(mesg, "xml_text");
2000 }
2001 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2002 rb_str_cat2(mesg, pre); pre = ",";
2003 rb_str_cat2(mesg, "xml_attr_content");
2004 }
2005 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2006 rb_str_cat2(mesg, pre); pre = ",";
2007 rb_str_cat2(mesg, "xml_attr_quote");
2008 }
2009 has_description = 1;
2010 }
2011 if (!has_description) {
2012 rb_str_cat2(mesg, "no-conversion");
2013 }
2014
2015 return mesg;
2016}
2017
2018VALUE
2019rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2020{
2021 VALUE mesg, exc;
2022 mesg = rb_str_new_cstr("code converter not found (");
2023 econv_description(sname, dname, ecflags, mesg);
2024 rb_str_cat2(mesg, ")");
2025 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2026 return exc;
2027}
2028
/*
 * Builds an exception object describing ec->last_error.
 *
 * Two shapes are visible below: an rb_eInvalidByteSequenceError carrying the
 * error_bytes / readagain_bytes / incomplete_input instance variables, and an
 * rb_eUndefinedConversionError carrying error_char.  Both reach the
 * `set_encs` label, which attaches the source and destination encoding
 * names (and encoding objects when an index is found) before returning the
 * exception.  Qnil is returned when neither shape applies.
 *
 * NOTE(review): many listing lines are missing from this function (2033-2034,
 * 2043, 2046, 2054, 2059, 2070, 2072-2073, 2078, 2080, 2091, 2098, 2100,
 * 2104-2105, 2111-2112, 2117, 2121), including the conditions that select
 * each shape and the trailing arguments of the rb_sprintf() calls.  The
 * visible text is therefore not syntactically complete.
 */
2029static VALUE
2030make_econv_exception(rb_econv_t *ec)
2031{
2032 VALUE mesg, exc;
2035 const char *err = (const char *)ec->last_error.error_bytes_start;
2036 size_t error_len = ec->last_error.error_bytes_len;
2037 VALUE bytes = rb_str_new(err, error_len);
2038 VALUE dumped = rb_str_dump(bytes);
2039 size_t readagain_len = ec->last_error.readagain_len;
2040 VALUE bytes2 = Qnil;
2041 VALUE dumped2;
2042 int idx;
2044 mesg = rb_sprintf("incomplete %s on %s",
2045 StringValueCStr(dumped),
2047 }
2048 else if (readagain_len) {
     /* The read-again bytes are taken from directly after the error bytes. */
2049 bytes2 = rb_str_new(err+error_len, readagain_len);
2050 dumped2 = rb_str_dump(bytes2);
2051 mesg = rb_sprintf("%s followed by %s on %s",
2052 StringValueCStr(dumped),
2053 StringValueCStr(dumped2),
2055 }
2056 else {
2057 mesg = rb_sprintf("%s on %s",
2058 StringValueCStr(dumped),
2060 }
2061
2062 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2063 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2064 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2065 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2066
     /* Shared tail: also the jump target of the undefined-conversion path. */
2067 set_encs:
2068 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2069 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2071 if (0 <= idx)
2074 if (0 <= idx)
2075 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2076 return exc;
2077 }
2079 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2081 VALUE dumped = Qnil;
2082 int idx;
     /* For a UTF-8 source, a well-formed character is shown as "U+XXXX"
      * rather than as dumped bytes. */
2083 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2084 rb_encoding *utf8 = rb_utf8_encoding();
2085 const char *start, *end;
2086 int n;
2087 start = (const char *)ec->last_error.error_bytes_start;
2088 end = start + ec->last_error.error_bytes_len;
2089 n = rb_enc_precise_mbclen(start, end, utf8);
2090 if (MBCLEN_CHARFOUND_P(n) &&
2092 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2093 dumped = rb_sprintf("U+%04X", cc);
2094 }
2095 }
2096 if (dumped == Qnil)
2097 dumped = rb_str_dump(bytes);
2099 ec->source_encoding_name) == 0 &&
2101 ec->destination_encoding_name) == 0) {
2102 mesg = rb_sprintf("%s from %s to %s",
2103 StringValueCStr(dumped),
2106 }
2107 else {
     /* Multi-step conversion: the message also lists the non-decorator
      * steps of the chain. */
2108 int i;
2109 mesg = rb_sprintf("%s to %s in conversion from %s",
2110 StringValueCStr(dumped),
2113 for (i = 0; i < ec->num_trans; i++) {
2114 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2115 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2116 rb_str_catf(mesg, " to %s",
2118 }
2119 }
2120 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2122 if (0 <= idx)
2123 rb_enc_associate_index(bytes, idx);
2124 rb_ivar_set(exc, rb_intern("error_char"), bytes);
2125 goto set_encs;
2126 }
2127 return Qnil;
2128}
2129
2130static void
2131more_output_buffer(
2132 VALUE destination,
2133 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2134 int max_output,
2135 unsigned char **out_start_ptr,
2136 unsigned char **out_pos,
2137 unsigned char **out_stop_ptr)
2138{
2139 size_t len = (*out_pos - *out_start_ptr);
2140 size_t new_len = (len + max_output) * 2;
2141 *out_start_ptr = resize_destination(destination, len, new_len);
2142 *out_pos = *out_start_ptr + len;
2143 *out_stop_ptr = *out_start_ptr + new_len;
2144}
2145
/*
 * Fills in the converter's default replacement string if none is set yet.
 *
 * When ec->replacement_str is already non-NULL nothing is done.  Otherwise,
 * if `ins_enc` is a non-empty name the replacement comes from
 * get_replacement_character(); if it is empty the replacement is "?" with an
 * empty encoding name.  The stored string is marked as not allocated, so it
 * is not freed with the converter.  Every visible path returns 0.
 *
 * NOTE(review): the embedded listing numbers skip 2159, where `ins_enc` is
 * presumably assigned; confirm against the full source.
 */
2146static int
2147make_replacement(rb_econv_t *ec)
2148{
2149 rb_transcoding *tc;
2150 const rb_transcoder *tr;
2151 const unsigned char *replacement;
2152 const char *repl_enc;
2153 const char *ins_enc;
2154 size_t len;
2155
2156 if (ec->replacement_str)
2157 return 0;
2158
2160
2161 tc = ec->last_tc;
2162 if (*ins_enc) {
2163 tr = tc->transcoder;
     /* The return value of rb_enc_find() is discarded here. */
2164 rb_enc_find(tr->dst_encoding);
2165 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2166 }
2167 else {
2168 replacement = (unsigned char *)"?";
2169 len = 1;
2170 repl_enc = "";
2171 }
2172
2173 ec->replacement_str = replacement;
2174 ec->replacement_len = len;
2175 ec->replacement_enc = repl_enc;
2176 ec->replacement_allocated = 0;
2177 return 0;
2178}
2179
/*
 * Installs a caller-supplied replacement string on the converter.
 *
 * The converter always ends up owning a heap copy: either a plain copy of
 * `str` (when `encname2` is empty or equal to `encname`) or the result of
 * converting `str` from `encname` to `encname2`.  A previously owned
 * replacement is freed first.  Returns 0 on success and -1 when the
 * conversion fails.
 *
 * NOTE(review): the listing line with the function name (2181) and line
 * 2188, where `encname2` is presumably assigned, are missing.
 */
2180int
2182 const unsigned char *str, size_t len, const char *encname)
2183{
2184 unsigned char *str2;
2185 size_t len2;
2186 const char *encname2;
2187
2189
2190 if (!*encname2 || encoding_equal(encname, encname2)) {
2191 str2 = xmalloc(len);
2192 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2193 len2 = len;
2194 encname2 = encname;
2195 }
2196 else {
     /* No caller buffer is passed, so the result is always heap-allocated. */
2197 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2198 if (!str2)
2199 return -1;
2200 }
2201
2202 if (ec->replacement_allocated) {
2203 xfree((void *)ec->replacement_str);
2204 }
2205 ec->replacement_allocated = 1;
2206 ec->replacement_str = str2;
2207 ec->replacement_len = len2;
2208 ec->replacement_enc = encname2;
2209 return 0;
2210}
2211
/*
 * Makes sure the converter has a replacement string (make_replacement())
 * and then emits it.  Returns 0 on success, -1 if either step fails.
 *
 * NOTE(review): the embedded listing numbers skip 2220, where `ret` is
 * assigned; the call that emits the replacement is not visible here.
 */
2212static int
2213output_replacement_character(rb_econv_t *ec)
2214{
2215 int ret;
2216
2217 if (make_replacement(ec) == -1)
2218 return -1;
2219
2221 if (ret == -1)
2222 return -1;
2223
2224 return 0;
2225}
2226
2227#if 1
2228#define hash_fallback rb_hash_aref
2229
2230static VALUE
2231proc_fallback(VALUE fallback, VALUE c)
2232{
2233 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2234}
2235
2236static VALUE
2237method_fallback(VALUE fallback, VALUE c)
2238{
2239 return rb_method_call(1, &c, fallback);
2240}
2241
2242static VALUE
2243aref_fallback(VALUE fallback, VALUE c)
2244{
2245 return rb_funcallv_public(fallback, idAREF, 1, &c);
2246}
2247
/*
 * Runs a complete conversion from src_encoding to dst_encoding over
 * [*in_pos, in_stop), writing to [*out_pos, out_stop) and growing the
 * destination through resize_destination when it fills up.
 *
 * Raises the exception built by rb_econv_open_exc() when no converter can be
 * opened.  If `ecopts` is a hash with a :fallback entry, an undefined
 * conversion is first offered to that fallback (Proc, Method, or any object
 * called through `[]`); a non-nil reply is inserted into the output and the
 * conversion resumes.  Raises ArgumentError ("too big fallback string") when
 * that insertion fails.  The converter is closed before returning.
 *
 * NOTE(review): the embedded listing numbers skip 2260, 2274, 2295-2296,
 * 2311 and 2314; the declaration of `ret`, the hash-fallback assignment,
 * part of the rb_enc_str_new() call and the statement that raises `exc` are
 * not visible here.
 */
2248static void
2249transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2250 const unsigned char *in_stop, unsigned char *out_stop,
2251 VALUE destination,
2252 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2253 const char *src_encoding,
2254 const char *dst_encoding,
2255 int ecflags,
2256 VALUE ecopts)
2257{
2258 rb_econv_t *ec;
2259 rb_transcoding *last_tc;
2261 unsigned char *out_start = *out_pos;
2262 int max_output;
2263 VALUE exc;
2264 VALUE fallback = Qnil;
2265 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2266
2267 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2268 if (!ec)
2269 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2270
     /* Pick the adapter matching the kind of :fallback object supplied. */
2271 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2272 fallback = rb_hash_aref(ecopts, sym_fallback);
2273 if (RB_TYPE_P(fallback, T_HASH)) {
2275 }
2276 else if (rb_obj_is_proc(fallback)) {
2277 fallback_func = proc_fallback;
2278 }
2279 else if (rb_obj_is_method(fallback)) {
2280 fallback_func = method_fallback;
2281 }
2282 else {
2283 fallback_func = aref_fallback;
2284 }
2285 }
2286 last_tc = ec->last_tc;
2287 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2288
2289 resume:
2290 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2291
2292 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2293 VALUE rep = rb_enc_str_new(
2294 (const char *)ec->last_error.error_bytes_start,
2297 rep = (*fallback_func)(fallback, rep);
2298 if (rep != Qundef && !NIL_P(rep)) {
2299 StringValue(rep);
     /* `ret` is reused to hold the int status of rb_econv_insert_output(). */
2300 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2301 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2302 if ((int)ret == -1) {
2303 rb_raise(rb_eArgError, "too big fallback string");
2304 }
2305 goto resume;
2306 }
2307 }
2308
2309 if (ret == econv_invalid_byte_sequence ||
2310 ret == econv_incomplete_input ||
     /* The exception is built before the converter is closed because it
      * reads ec->last_error. */
2312 exc = make_econv_exception(ec);
2313 rb_econv_close(ec);
2315 }
2316
2317 if (ret == econv_destination_buffer_full) {
2318 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2319 goto resume;
2320 }
2321
2322 rb_econv_close(ec);
2323 return;
2324}
2325#else
2326/* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * This variant sits in the #else branch of the surrounding "#if 1", so it
 * is not compiled.  It feeds the converter one input byte at a time: a byte
 * is supplied only after the previous call reported
 * econv_source_buffer_empty, and NULL input is passed otherwise.  The loop
 * ends on econv_finished.
 *
 * NOTE(review): the embedded listing numbers skip 2339, 2352, 2373-2375,
 * 2378, 2381 and 2385; the declaration and initial value of `ret` and the
 * case labels of the switch are not visible here.
 */
2327static void
2328transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2329 const unsigned char *in_stop, unsigned char *out_stop,
2330 VALUE destination,
2331 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2332 const char *src_encoding,
2333 const char *dst_encoding,
2334 int ecflags,
2335 VALUE ecopts)
2336{
2337 rb_econv_t *ec;
2338 rb_transcoding *last_tc;
2340 unsigned char *out_start = *out_pos;
2341 const unsigned char *ptr;
2342 int max_output;
2343 VALUE exc;
2344
2345 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2346 if (!ec)
2347 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2348
2349 last_tc = ec->last_tc;
2350 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2351
2353 ptr = *in_pos;
2354 while (ret != econv_finished) {
2355 unsigned char input_byte;
2356 const unsigned char *p = &input_byte;
2357
2358 if (ret == econv_source_buffer_empty) {
2359 if (ptr < in_stop) {
2360 input_byte = *ptr;
2361 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2362 }
2363 else {
     /* Input exhausted: call without ECONV_PARTIAL_INPUT. */
2364 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2365 }
2366 }
2367 else {
2368 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2369 }
     /* Advance the real input cursor by however far `p` moved. */
2370 if (&input_byte != p)
2371 ptr += p - &input_byte;
2372 switch (ret) {
2376 exc = make_econv_exception(ec);
2377 rb_econv_close(ec);
2379 break;
2380
2382 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2383 break;
2384
2386 break;
2387
2388 case econv_finished:
2389 break;
2390 }
2391 }
2392 rb_econv_close(ec);
2393 *in_pos = in_stop;
2394 return;
2395}
2396#endif
2397
2398
2399/*
2400 * String-specific code
2401 */
2402
2403static unsigned char *
2404str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2405{
2406 rb_str_resize(destination, new_len);
2407 return (unsigned char *)RSTRING_PTR(destination);
2408}
2409
/*
 * Translates the conversion options in hash `opt` into ECONV_* flag bits,
 * starting from `ecflags`, and returns the combined flags.
 *
 * Visible handling:
 *   :invalid / :undef  -- only :replace is accepted; anything else non-nil
 *                         raises ArgumentError.
 *   :replace           -- a non-nil value sets ECONV_UNDEF_REPLACE unless
 *                         ECONV_INVALID_REPLACE is already set.
 *   :xml               -- :text or :attr; other values raise ArgumentError.
 *   :newline           -- :universal, :crlf, :cr or :lf; replaces any newline
 *                         decorator bits already in ecflags.
 *   :universal_newline, :crlf_newline, :cr_newline
 *                      -- consulted only when :newline is absent; if any of
 *                         them is present (even as false) the existing
 *                         newline bits are replaced by those that are truthy.
 *
 * NOTE(review): the embedded listing numbers skip 2443, 2446, 2461, 2464 and
 * 2487, so the flag assignments for :xml, :universal and :crlf and for
 * universal_newline are not visible here.
 */
2410static int
2411econv_opts(VALUE opt, int ecflags)
2412{
2413 VALUE v;
2414
2415 v = rb_hash_aref(opt, sym_invalid);
2416 if (NIL_P(v)) {
2417 }
2418 else if (v==sym_replace) {
2419 ecflags |= ECONV_INVALID_REPLACE;
2420 }
2421 else {
2422 rb_raise(rb_eArgError, "unknown value for invalid character option");
2423 }
2424
2425 v = rb_hash_aref(opt, sym_undef);
2426 if (NIL_P(v)) {
2427 }
2428 else if (v==sym_replace) {
2429 ecflags |= ECONV_UNDEF_REPLACE;
2430 }
2431 else {
2432 rb_raise(rb_eArgError, "unknown value for undefined character option");
2433 }
2434
2435 v = rb_hash_aref(opt, sym_replace);
2436 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2437 ecflags |= ECONV_UNDEF_REPLACE;
2438 }
2439
2440 v = rb_hash_aref(opt, sym_xml);
2441 if (!NIL_P(v)) {
2442 if (v==sym_text) {
2444 }
2445 else if (v==sym_attr) {
2447 }
2448 else if (RB_TYPE_P(v, T_SYMBOL)) {
2449 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2450 }
2451 else {
2452 rb_raise(rb_eArgError, "unexpected value for xml option");
2453 }
2454 }
2455
2456#ifdef ENABLE_ECONV_NEWLINE_OPTION
2457 v = rb_hash_aref(opt, sym_newline);
2458 if (!NIL_P(v)) {
     /* An explicit :newline value discards any newline decorator bits
      * already present in ecflags. */
2459 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2460 if (v == sym_universal) {
2462 }
2463 else if (v == sym_crlf) {
2465 }
2466 else if (v == sym_cr) {
2467 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2468 }
2469 else if (v == sym_lf) {
2470 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2471 }
2472 else if (SYMBOL_P(v)) {
2473 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2474 rb_sym2str(v));
2475 }
2476 else {
2477 rb_raise(rb_eArgError, "unexpected value for newline option");
2478 }
2479 }
2480 else
2481#endif
2482 {
     /* setflags collects the decorators whose option is truthy; newlineflag
      * records whether any of the three options was given at all. */
2483 int setflags = 0, newlineflag = 0;
2484
2485 v = rb_hash_aref(opt, sym_universal_newline);
2486 if (RTEST(v))
2488 newlineflag |= !NIL_P(v);
2489
2490 v = rb_hash_aref(opt, sym_crlf_newline);
2491 if (RTEST(v))
2492 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2493 newlineflag |= !NIL_P(v);
2494
2495 v = rb_hash_aref(opt, sym_cr_newline);
2496 if (RTEST(v))
2497 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2498 newlineflag |= !NIL_P(v);
2499
2500 if (newlineflag) {
2501 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2502 ecflags |= setflags;
2503 }
2504 }
2505
2506 return ecflags;
2507}
2508
/*
 * Splits a user option hash into flag bits and a reduced option hash.
 *
 * With a nil `opthash`, *opts is set to nil and `ecflags` is returned
 * unchanged.  Otherwise the flags are computed by econv_opts(), and a new
 * hash holding only the :replace and :fallback entries (when present and
 * accepted) is built, frozen, and stored in *opts; *opts is nil when neither
 * entry was kept.  Raises ArgumentError ("replacement string is broken: ...")
 * for a rejected :replace value.
 *
 * NOTE(review): the embedded listing numbers skip 2524, 2528, 2530, 2537 and
 * 2539, so the check that rejects a replacement string and the expression
 * that validates :fallback (including the declaration of `h`) are not fully
 * visible here.
 */
2509int
2510rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2511{
2512 VALUE newhash = Qnil;
2513 VALUE v;
2514
2515 if (NIL_P(opthash)) {
2516 *opts = Qnil;
2517 return ecflags;
2518 }
2519 ecflags = econv_opts(opthash, ecflags);
2520
2521 v = rb_hash_aref(opthash, sym_replace);
2522 if (!NIL_P(v)) {
2523 StringValue(v);
2525 VALUE dumped = rb_str_dump(v);
2526 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2527 StringValueCStr(dumped),
2529 }
2531 newhash = rb_hash_new();
2532 rb_hash_aset(newhash, sym_replace, v);
2533 }
2534
2535 v = rb_hash_aref(opthash, sym_fallback);
2536 if (!NIL_P(v)) {
2538 if (NIL_P(h)
2540 : (v = h, 1)) {
     /* The result hash is created lazily, only once an entry is kept. */
2541 if (NIL_P(newhash))
2542 newhash = rb_hash_new();
2543 rb_hash_aset(newhash, sym_fallback, v);
2544 }
2545 }
2546
2547 if (!NIL_P(newhash))
2548 rb_hash_freeze(newhash);
2549 *opts = newhash;
2550
2551 return ecflags;
2552}
2553
2554int
2556{
2557 return rb_econv_prepare_options(opthash, opts, 0);
2558}
2559
2560rb_econv_t *
2561rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2562{
2563 rb_econv_t *ec;
2564 VALUE replacement;
2565
2566 if (NIL_P(opthash)) {
2567 replacement = Qnil;
2568 }
2569 else {
2570 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2571 rb_bug("rb_econv_open_opts called with invalid opthash");
2572 replacement = rb_hash_aref(opthash, sym_replace);
2573 }
2574
2575 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2576 if (!ec)
2577 return ec;
2578
2579 if (!NIL_P(replacement)) {
2580 int ret;
2581 rb_encoding *enc = rb_enc_get(replacement);
2582
2583 ret = rb_econv_set_replacement(ec,
2584 (const unsigned char *)RSTRING_PTR(replacement),
2585 RSTRING_LEN(replacement),
2586 rb_enc_name(enc));
2587 if (ret == -1) {
2588 rb_econv_close(ec);
2589 return NULL;
2590 }
2591 }
2592 return ec;
2593}
2594
2595static int
2596enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2597{
2598 rb_encoding *enc;
2599 const char *n;
2600 int encidx;
2601 VALUE encval;
2602
2603 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2604 !(enc = rb_enc_from_index(encidx))) {
2605 enc = NULL;
2606 encidx = 0;
2607 n = StringValueCStr(*arg);
2608 }
2609 else {
2610 n = rb_enc_name(enc);
2611 }
2612
2613 *name_p = n;
2614 *enc_p = enc;
2615
2616 return encidx;
2617}
2618
2619static int
2620str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2621 const char **sname_p, rb_encoding **senc_p,
2622 const char **dname_p, rb_encoding **denc_p)
2623{
2624 rb_encoding *senc, *denc;
2625 const char *sname, *dname;
2626 int sencidx, dencidx;
2627
2628 dencidx = enc_arg(arg1, &dname, &denc);
2629
2630 if (NIL_P(*arg2)) {
2631 sencidx = rb_enc_get_index(str);
2632 senc = rb_enc_from_index(sencidx);
2633 sname = rb_enc_name(senc);
2634 }
2635 else {
2636 sencidx = enc_arg(arg2, &sname, &senc);
2637 }
2638
2639 *sname_p = sname;
2640 *senc_p = senc;
2641 *dname_p = dname;
2642 *denc_p = denc;
2643 return dencidx;
2644}
2645
2646static int
2647str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2648{
2649 VALUE dest;
2650 VALUE str = *self;
2651 VALUE arg1, arg2;
2652 long blen, slen;
2653 unsigned char *buf, *bp, *sp;
2654 const unsigned char *fromp;
2655 rb_encoding *senc, *denc;
2656 const char *sname, *dname;
2657 int dencidx;
2658 int explicitly_invalid_replace = TRUE;
2659
2660 rb_check_arity(argc, 0, 2);
2661
2662 if (argc == 0) {
2663 arg1 = rb_enc_default_internal();
2664 if (NIL_P(arg1)) {
2665 if (!ecflags) return -1;
2666 arg1 = rb_obj_encoding(str);
2667 }
2668 if (!(ecflags & ECONV_INVALID_MASK)) {
2669 explicitly_invalid_replace = FALSE;
2670 }
2672 }
2673 else {
2674 arg1 = argv[0];
2675 }
2676 arg2 = argc<=1 ? Qnil : argv[1];
2677 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2678
2679 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2683 if (senc && senc == denc) {
2684 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2685 VALUE rep = Qnil;
2686 if (!NIL_P(ecopts)) {
2687 rep = rb_hash_aref(ecopts, sym_replace);
2688 }
2689 dest = rb_enc_str_scrub(senc, str, rep);
2690 if (NIL_P(dest)) dest = str;
2691 *self = dest;
2692 return dencidx;
2693 }
2694 return NIL_P(arg2) ? -1 : dencidx;
2695 }
2696 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2698 return dencidx;
2699 }
2700 }
2701 if (encoding_equal(sname, dname)) {
2702 return NIL_P(arg2) ? -1 : dencidx;
2703 }
2704 }
2705 else {
2706 if (encoding_equal(sname, dname)) {
2707 sname = "";
2708 dname = "";
2709 }
2710 }
2711
2712 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2713 slen = RSTRING_LEN(str);
2714 blen = slen + 30; /* len + margin */
2715 dest = rb_str_tmp_new(blen);
2716 bp = (unsigned char *)RSTRING_PTR(dest);
2717
2718 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2719 if (fromp != sp+slen) {
2720 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2721 }
2722 buf = (unsigned char *)RSTRING_PTR(dest);
2723 *bp = '\0';
2724 rb_str_set_len(dest, bp - buf);
2725
2726 /* set encoding */
2727 if (!denc) {
2728 dencidx = rb_define_dummy_encoding(dname);
2729 RB_GC_GUARD(arg1);
2730 RB_GC_GUARD(arg2);
2731 }
2732 *self = dest;
2733
2734 return dencidx;
2735}
2736
2737static int
2738str_transcode(int argc, VALUE *argv, VALUE *self)
2739{
2740 VALUE opt;
2741 int ecflags = 0;
2742 VALUE ecopts = Qnil;
2743
2744 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2745 if (!NIL_P(opt)) {
2746 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2747 }
2748 return str_transcode0(argc, argv, self, ecflags, ecopts);
2749}
2750
2751static inline VALUE
2752str_encode_associate(VALUE str, int encidx)
2753{
2754 int cr = 0;
2755
2756 rb_enc_associate_index(str, encidx);
2757
2758 /* transcoded string never be broken. */
2761 }
2762 else {
2764 }
2766 return str;
2767}
2768
2769/*
2770 * call-seq:
2771 * str.encode!(encoding [, options] ) -> str
2772 * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2773 *
2774 * The first form transcodes the contents of <i>str</i> from
2775 * str.encoding to +encoding+.
2776 * The second form transcodes the contents of <i>str</i> from
2777 * src_encoding to dst_encoding.
2778 * The options Hash gives details for conversion. See String#encode
2779 * for details.
2780 * Returns the string even if no changes were made.
2781 */
2782
2783static VALUE
2784str_encode_bang(int argc, VALUE *argv, VALUE str)
2785{
2786 VALUE newstr;
2787 int encidx;
2788
2790
2791 newstr = str;
2792 encidx = str_transcode(argc, argv, &newstr);
2793
2794 if (encidx < 0) return str;
2795 if (newstr == str) {
2796 rb_enc_associate_index(str, encidx);
2797 return str;
2798 }
2799 rb_str_shared_replace(str, newstr);
2800 return str_encode_associate(str, encidx);
2801}
2802
2803static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2804
2805/*
2806 * call-seq:
2807 * str.encode(encoding [, options] ) -> str
2808 * str.encode(dst_encoding, src_encoding [, options] ) -> str
2809 * str.encode([options]) -> str
2810 *
2811 * The first form returns a copy of +str+ transcoded
2812 * to encoding +encoding+.
2813 * The second form returns a copy of +str+ transcoded
2814 * from src_encoding to dst_encoding.
2815 * The last form returns a copy of +str+ transcoded to
2816 * <tt>Encoding.default_internal</tt>.
2817 *
2818 * By default, the first and second form raise
2819 * Encoding::UndefinedConversionError for characters that are
2820 * undefined in the destination encoding, and
2821 * Encoding::InvalidByteSequenceError for invalid byte sequences
2822 * in the source encoding. The last form by default does not raise
2823 * exceptions but uses replacement strings.
2824 *
2825 * The +options+ Hash gives details for conversion and can have the following
2826 * keys:
2827 *
2828 * :invalid ::
2829 * If the value is +:replace+, #encode replaces invalid byte sequences in
2830 * +str+ with the replacement character. The default is to raise the
2831 * Encoding::InvalidByteSequenceError exception
2832 * :undef ::
2833 * If the value is +:replace+, #encode replaces characters which are
2834 * undefined in the destination encoding with the replacement character.
2835 * The default is to raise the Encoding::UndefinedConversionError.
2836 * :replace ::
2837 * Sets the replacement string to the given value. The default replacement
2838 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2839 * :fallback ::
2840 * Sets the replacement string by the given object for undefined
2841 * character. The object should be a Hash, a Proc, a Method, or an
2842 * object which has [] method.
2843 * Its key is an undefined character encoded in the source encoding
2844 * of current transcoder. Its value can be any encoding until it
2845 * can be converted into the destination encoding of the transcoder.
2846 * :xml ::
2847 * The value must be +:text+ or +:attr+.
2848 * If the value is +:text+ #encode replaces undefined characters with their
2849 * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2850 * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2851 * If the value is +:attr+, #encode also quotes the replacement result
2852 * (using '"'), and replaces '"' with "&quot;".
2853 * :cr_newline ::
2854 * Replaces LF ("\n") with CR ("\r") if value is true.
2855 * :crlf_newline ::
2856 * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2857 * :universal_newline ::
2858 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2859 */
2860
2861static VALUE
2862str_encode(int argc, VALUE *argv, VALUE str)
2863{
2864 VALUE newstr = str;
2865 int encidx = str_transcode(argc, argv, &newstr);
2866 return encoded_dup(newstr, str, encidx);
2867}
2868
2869VALUE
2870rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2871{
2872 int argc = 1;
2873 VALUE *argv = &to;
2874 VALUE newstr = str;
2875 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2876 return encoded_dup(newstr, str, encidx);
2877}
2878
2879static VALUE
2880encoded_dup(VALUE newstr, VALUE str, int encidx)
2881{
2882 if (encidx < 0) return rb_str_dup(str);
2883 if (newstr == str) {
2884 newstr = rb_str_dup(str);
2885 rb_enc_associate_index(newstr, encidx);
2886 return newstr;
2887 }
2888 else {
2890 }
2891 return str_encode_associate(newstr, encidx);
2892}
2893
2894/*
2895 * Document-class: Encoding::Converter
2896 *
2897 * Encoding conversion class.
2898 */
2899static void
2900econv_free(void *ptr)
2901{
2902 rb_econv_t *ec = ptr;
2903 rb_econv_close(ec);
2904}
2905
2906static size_t
2907econv_memsize(const void *ptr)
2908{
2909 return sizeof(rb_econv_t);
2910}
2911
2912static const rb_data_type_t econv_data_type = {
2913 "econv",
2914 {NULL, econv_free, econv_memsize,},
2916};
2917
2918static VALUE
2919econv_s_allocate(VALUE klass)
2920{
2921 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2922}
2923
2924static rb_encoding *
2925make_dummy_encoding(const char *name)
2926{
2927 rb_encoding *enc;
2928 int idx;
2930 enc = rb_enc_from_index(idx);
2931 return enc;
2932}
2933
2934static rb_encoding *
2935make_encoding(const char *name)
2936{
2937 rb_encoding *enc;
2938 enc = rb_enc_find(name);
2939 if (!enc)
2940 enc = make_dummy_encoding(name);
2941 return enc;
2942}
2943
2944static VALUE
2945make_encobj(const char *name)
2946{
2947 return rb_enc_from_encoding(make_encoding(name));
2948}
2949
2950/*
2951 * call-seq:
2952 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2953 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2954 *
2955 * Returns the corresponding ASCII compatible encoding.
2956 *
2957 * Returns nil if the argument is an ASCII compatible encoding.
2958 *
2959 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2960 * can represent exactly the same characters as the given ASCII incompatible encoding.
2961 * So, no conversion undefined error occurs when converting between the two encodings.
2962 *
2963 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2964 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2965 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2966 *
2967 */
2968static VALUE
2969econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
2970{
2971 const char *arg_name, *result_name;
2972 rb_encoding *arg_enc, *result_enc;
2973
2974 enc_arg(&arg, &arg_name, &arg_enc);
2975
2976 result_name = rb_econv_asciicompat_encoding(arg_name);
2977
2978 if (result_name == NULL)
2979 return Qnil;
2980
2981 result_enc = make_encoding(result_name);
2982
2983 return rb_enc_from_encoding(result_enc);
2984}
2985
2986static void
2987econv_args(int argc, VALUE *argv,
2988 VALUE *snamev_p, VALUE *dnamev_p,
2989 const char **sname_p, const char **dname_p,
2990 rb_encoding **senc_p, rb_encoding **denc_p,
2991 int *ecflags_p,
2992 VALUE *ecopts_p)
2993{
2994 VALUE opt, flags_v, ecopts;
2995 int sidx, didx;
2996 const char *sname, *dname;
2997 rb_encoding *senc, *denc;
2998 int ecflags;
2999
3000 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3001
3002 if (!NIL_P(flags_v)) {
3003 if (!NIL_P(opt)) {
3004 rb_error_arity(argc + 1, 2, 3);
3005 }
3006 ecflags = NUM2INT(rb_to_int(flags_v));
3007 ecopts = Qnil;
3008 }
3009 else if (!NIL_P(opt)) {
3010 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3011 }
3012 else {
3013 ecflags = 0;
3014 ecopts = Qnil;
3015 }
3016
3017 senc = NULL;
3018 sidx = rb_to_encoding_index(*snamev_p);
3019 if (0 <= sidx) {
3020 senc = rb_enc_from_index(sidx);
3021 }
3022 else {
3023 StringValue(*snamev_p);
3024 }
3025
3026 denc = NULL;
3027 didx = rb_to_encoding_index(*dnamev_p);
3028 if (0 <= didx) {
3029 denc = rb_enc_from_index(didx);
3030 }
3031 else {
3032 StringValue(*dnamev_p);
3033 }
3034
3035 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3036 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3037
3038 *sname_p = sname;
3039 *dname_p = dname;
3040 *senc_p = senc;
3041 *denc_p = denc;
3042 *ecflags_p = ecflags;
3043 *ecopts_p = ecopts;
3044}
3045
3046static int
3047decorate_convpath(VALUE convpath, int ecflags)
3048{
3049 int num_decorators;
3050 const char *decorators[MAX_ECFLAGS_DECORATORS];
3051 int i;
3052 int n, len;
3053
3054 num_decorators = decorator_names(ecflags, decorators);
3055 if (num_decorators == -1)
3056 return -1;
3057
3058 len = n = RARRAY_LENINT(convpath);
3059 if (n != 0) {
3060 VALUE pair = RARRAY_AREF(convpath, n-1);
3061 if (RB_TYPE_P(pair, T_ARRAY)) {
3062 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3063 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3064 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3065 const rb_transcoder *tr = load_transcoder_entry(entry);
3066 if (!tr)
3067 return -1;
3068 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3069 tr->asciicompat_type == asciicompat_encoder) {
3070 n--;
3071 rb_ary_store(convpath, len + num_decorators - 1, pair);
3072 }
3073 }
3074 else {
3075 rb_ary_store(convpath, len + num_decorators - 1, pair);
3076 }
3077 }
3078
3079 for (i = 0; i < num_decorators; i++)
3080 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3081
3082 return 0;
3083}
3084
3085static void
3086search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3087{
3088 VALUE *ary_p = arg;
3089 VALUE v;
3090
3091 if (*ary_p == Qnil) {
3092 *ary_p = rb_ary_new();
3093 }
3094
3095 if (DECORATOR_P(sname, dname)) {
3096 v = rb_str_new_cstr(dname);
3097 }
3098 else {
3099 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3100 }
3101 rb_ary_store(*ary_p, depth, v);
3102}
3103
3104/*
3105 * call-seq:
3106 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3107 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3108 *
3109 * Returns a conversion path.
3110 *
3111 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3112 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3113 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3114 *
3115 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3116 * or
3117 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3118 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3119 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3120 * # "universal_newline"]
3121 *
3122 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3123 * or
3124 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3125 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3126 * # "universal_newline",
3127 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3128 */
3129static VALUE
3130econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3131{
3132 VALUE snamev, dnamev;
3133 const char *sname, *dname;
3134 rb_encoding *senc, *denc;
3135 int ecflags;
3136 VALUE ecopts;
3137 VALUE convpath;
3138
3139 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3140
3141 convpath = Qnil;
3142 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3143
3144 if (NIL_P(convpath)) {
3145 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3146 RB_GC_GUARD(snamev);
3147 RB_GC_GUARD(dnamev);
3149 }
3150
3151 if (decorate_convpath(convpath, ecflags) == -1) {
3152 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3153 RB_GC_GUARD(snamev);
3154 RB_GC_GUARD(dnamev);
3156 }
3157
3158 return convpath;
3159}
3160
3161/*
3162 * Check the existence of a conversion path.
3163 * Returns the number of converters in the conversion path.
3164 * result: >=0:success -1:failure
3165 */
3166int
3167rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3168{
3169 VALUE convpath = Qnil;
3170 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3171 &convpath);
3172 return RTEST(convpath);
3173}
3174
3178 int ret;
3179};
3180
3181static void
3182rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3183{
3185 int ret;
3186
3187 if (a->ret == -1)
3188 return;
3189
3190 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3191
3192 a->ret = ret;
3193 return;
3194}
3195
3196static rb_econv_t *
3197rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3198 const char **sname_p, const char **dname_p,
3199 rb_encoding **senc_p, rb_encoding**denc_p)
3200{
3201 rb_econv_t *ec;
3202 long i;
3203 int ret, first=1;
3204 VALUE elt;
3205 rb_encoding *senc = 0, *denc = 0;
3206 const char *sname, *dname;
3207
3208 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3209 DATA_PTR(self) = ec;
3210
3211 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3212 VALUE snamev, dnamev;
3213 VALUE pair;
3214 elt = rb_ary_entry(convpath, i);
3215 if (!NIL_P(pair = rb_check_array_type(elt))) {
3216 if (RARRAY_LEN(pair) != 2)
3217 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3218 snamev = rb_ary_entry(pair, 0);
3219 enc_arg(&snamev, &sname, &senc);
3220 dnamev = rb_ary_entry(pair, 1);
3221 enc_arg(&dnamev, &dname, &denc);
3222 }
3223 else {
3224 sname = "";
3225 dname = StringValueCStr(elt);
3226 }
3227 if (DECORATOR_P(sname, dname)) {
3228 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3229 if (ret == -1) {
3230 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3231 RB_GC_GUARD(snamev);
3232 RB_GC_GUARD(dnamev);
3234 }
3235 }
3236 else {
3237 int j = ec->num_trans;
3239 arg.ec = ec;
3240 arg.index = ec->num_trans;
3241 arg.ret = 0;
3242 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3243 if (ret == -1 || arg.ret == -1) {
3244 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3245 RB_GC_GUARD(snamev);
3246 RB_GC_GUARD(dnamev);
3248 }
3249 if (first) {
3250 first = 0;
3251 *senc_p = senc;
3252 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3253 }
3254 *denc_p = denc;
3255 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3256 }
3257 }
3258
3259 if (first) {
3260 *senc_p = NULL;
3261 *denc_p = NULL;
3262 *sname_p = "";
3263 *dname_p = "";
3264 }
3265
3266 ec->source_encoding_name = *sname_p;
3267 ec->destination_encoding_name = *dname_p;
3268
3269 return ec;
3270}
3271
3272/*
3273 * call-seq:
3274 * Encoding::Converter.new(source_encoding, destination_encoding)
3275 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3276 * Encoding::Converter.new(convpath)
3277 *
3278 * possible options elements:
3279 * hash form:
3280 * :invalid => nil # raise error on invalid byte sequence (default)
3281 * :invalid => :replace # replace invalid byte sequence
3282 * :undef => nil # raise error on undefined conversion (default)
3283 * :undef => :replace # replace undefined conversion
3284 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3285 * :newline => :universal # decorator for converting CRLF and CR to LF
3286 * :newline => :crlf # decorator for converting LF to CRLF
3287 * :newline => :cr # decorator for converting LF to CR
3288 * :universal_newline => true # decorator for converting CRLF and CR to LF
3289 * :crlf_newline => true # decorator for converting LF to CRLF
3290 * :cr_newline => true # decorator for converting LF to CR
3291 * :xml => :text # escape as XML CharData.
3292 * :xml => :attr # escape as XML AttValue
3293 * integer form:
3294 * Encoding::Converter::INVALID_REPLACE
3295 * Encoding::Converter::UNDEF_REPLACE
3296 * Encoding::Converter::UNDEF_HEX_CHARREF
3297 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3298 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3299 * Encoding::Converter::CR_NEWLINE_DECORATOR
3300 * Encoding::Converter::XML_TEXT_DECORATOR
3301 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3302 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3303 *
3304 * Encoding::Converter.new creates an instance of Encoding::Converter.
3305 *
3306 * Source_encoding and destination_encoding should be a string or
3307 * Encoding object.
3308 *
3309 * opt should be nil, a hash or an integer.
3310 *
3311 * convpath should be an array.
3312 * convpath may contain
3313 * - two-element arrays which contain encodings or encoding names, or
3314 * - strings representing decorator names.
3315 *
3316 * Encoding::Converter.new optionally takes an option.
3317 * The option should be a hash or an integer.
3318 * The option hash can contain :invalid => nil, etc.
3319 * The option integer should be logical-or of constants such as
3320 * Encoding::Converter::INVALID_REPLACE, etc.
3321 *
3322 * [:invalid => nil]
3323 * Raise error on invalid byte sequence. This is a default behavior.
3324 * [:invalid => :replace]
3325 * Replace invalid byte sequence by replacement string.
3326 * [:undef => nil]
3327 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3328 * This is a default behavior.
3329 * [:undef => :replace]
3330 * Replace undefined character in destination_encoding with replacement string.
3331 * [:replace => string]
3332 * Specify the replacement string.
3333 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3334 * [:universal_newline => true]
3335 * Convert CRLF and CR to LF.
3336 * [:crlf_newline => true]
3337 * Convert LF to CRLF.
3338 * [:cr_newline => true]
3339 * Convert LF to CR.
3340 * [:xml => :text]
3341 * Escape as XML CharData.
3342 * This form can be used as an HTML 4.0 #PCDATA.
3343 * - '&' -> '&amp;'
3344 * - '<' -> '&lt;'
3345 * - '>' -> '&gt;'
3346 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3347 * [:xml => :attr]
3348 * Escape as XML AttValue.
3349 * The converted result is quoted as "...".
3350 * This form can be used as an HTML 4.0 attribute value.
3351 * - '&' -> '&amp;'
3352 * - '<' -> '&lt;'
3353 * - '>' -> '&gt;'
3354 * - '"' -> '&quot;'
3355 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3356 *
3357 * Examples:
3358 * # UTF-16BE to UTF-8
3359 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3360 *
3361 * # Usually, decorators such as newline conversion are inserted last.
3362 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3363 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3364 * # "universal_newline"]
3365 *
3366 * # But, if the last encoding is ASCII incompatible,
3367 * # decorators are inserted before the last conversion.
3368 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3369 * p ec.convpath #=> ["crlf_newline",
3370 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3371 *
3372 * # Conversion path can be specified directly.
3373 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3374 * p ec.convpath #=> ["universal_newline",
3375 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3376 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3377 */
3378static VALUE
3379econv_init(int argc, VALUE *argv, VALUE self)
3380{
3381 VALUE ecopts;
3382 VALUE snamev, dnamev;
3383 const char *sname, *dname;
3384 rb_encoding *senc, *denc;
3385 rb_econv_t *ec;
3386 int ecflags;
3387 VALUE convpath;
3388
3389 if (rb_check_typeddata(self, &econv_data_type)) {
3390 rb_raise(rb_eTypeError, "already initialized");
3391 }
3392
3393 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3394 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3395 ecflags = 0;
3396 ecopts = Qnil;
3397 }
3398 else {
3399 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3400 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3401 }
3402
3403 if (!ec) {
3404 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3405 RB_GC_GUARD(snamev);
3406 RB_GC_GUARD(dnamev);
3408 }
3409
3410 if (!DECORATOR_P(sname, dname)) {
3411 if (!senc)
3412 senc = make_dummy_encoding(sname);
3413 if (!denc)
3414 denc = make_dummy_encoding(dname);
3415 RB_GC_GUARD(snamev);
3416 RB_GC_GUARD(dnamev);
3417 }
3418
3419 ec->source_encoding = senc;
3420 ec->destination_encoding = denc;
3421
3422 DATA_PTR(self) = ec;
3423
3424 return self;
3425}
3426
3427/*
3428 * call-seq:
3429 * ec.inspect -> string
3430 *
3431 * Returns a printable version of <i>ec</i>
3432 *
3433 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3434 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3435 *
3436 */
3437static VALUE
3438econv_inspect(VALUE self)
3439{
3440 const char *cname = rb_obj_classname(self);
3441 rb_econv_t *ec;
3442
3443 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3444 if (!ec)
3445 return rb_sprintf("#<%s: uninitialized>", cname);
3446 else {
3447 const char *sname = ec->source_encoding_name;
3448 const char *dname = ec->destination_encoding_name;
3449 VALUE str;
3450 str = rb_sprintf("#<%s: ", cname);
3451 econv_description(sname, dname, ec->flags, str);
3452 rb_str_cat2(str, ">");
3453 return str;
3454 }
3455}
3456
3457static rb_econv_t *
3458check_econv(VALUE self)
3459{
3460 rb_econv_t *ec;
3461
3462 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3463 if (!ec) {
3464 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3465 }
3466 return ec;
3467}
3468
3469/*
3470 * call-seq:
3471 * ec.source_encoding -> encoding
3472 *
3473 * Returns the source encoding as an Encoding object.
3474 */
3475static VALUE
3476econv_source_encoding(VALUE self)
3477{
3478 rb_econv_t *ec = check_econv(self);
3479 if (!ec->source_encoding)
3480 return Qnil;
3482}
3483
3484/*
3485 * call-seq:
3486 * ec.destination_encoding -> encoding
3487 *
3488 * Returns the destination encoding as an Encoding object.
3489 */
3490static VALUE
3491econv_destination_encoding(VALUE self)
3492{
3493 rb_econv_t *ec = check_econv(self);
3495 return Qnil;
3497}
3498
3499/*
3500 * call-seq:
3501 * ec.convpath -> ary
3502 *
3503 * Returns the conversion path of ec.
3504 *
3505 * The result is an array of conversions.
3506 *
3507 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3508 * p ec.convpath
3509 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3510 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3511 * # "crlf_newline"]
3512 *
3513 * Each element of the array is a pair of encodings or a string.
3514 * A pair means an encoding conversion.
3515 * A string means a decorator.
3516 *
3517 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3518 * a converter from ISO-8859-1 to UTF-8.
3519 * "crlf_newline" means newline converter from LF to CRLF.
3520 */
3521static VALUE
3522econv_convpath(VALUE self)
3523{
3524 rb_econv_t *ec = check_econv(self);
3525 VALUE result;
3526 int i;
3527
3528 result = rb_ary_new();
3529 for (i = 0; i < ec->num_trans; i++) {
3530 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3531 VALUE v;
3532 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3533 v = rb_str_new_cstr(tr->dst_encoding);
3534 else
3535 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3536 rb_ary_push(result, v);
3537 }
3538 return result;
3539}
3540
3541/*
3542 * call-seq:
3543 * ec == other -> true or false
3544 */
3545static VALUE
3546econv_equal(VALUE self, VALUE other)
3547{
3548 rb_econv_t *ec1 = check_econv(self);
3549 rb_econv_t *ec2;
3550 int i;
3551
3552 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3553 return Qnil;
3554 }
3555 ec2 = DATA_PTR(other);
3556 if (!ec2) return Qfalse;
3557 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3559 return Qfalse;
3562 return Qfalse;
3563 if (ec1->flags != ec2->flags) return Qfalse;
3564 if (ec1->replacement_enc != ec2->replacement_enc &&
3566 return Qfalse;
3567 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3568 if (ec1->replacement_str != ec2->replacement_str &&
3570 return Qfalse;
3571
3572 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3573 for (i = 0; i < ec1->num_trans; i++) {
3574 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3575 return Qfalse;
3576 }
3577 return Qtrue;
3578}
3579
3580static VALUE
3581econv_result_to_symbol(rb_econv_result_t res)
3582{
3583 switch (res) {
3584 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3585 case econv_incomplete_input: return sym_incomplete_input;
3586 case econv_undefined_conversion: return sym_undefined_conversion;
3587 case econv_destination_buffer_full: return sym_destination_buffer_full;
3588 case econv_source_buffer_empty: return sym_source_buffer_empty;
3589 case econv_finished: return sym_finished;
3590 case econv_after_output: return sym_after_output;
3591 default: return INT2NUM(res); /* should not be reached */
3592 }
3593}
3594
3595/*
3596 * call-seq:
3597 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3598 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3599 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3600 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3601 *
3602 * possible opt elements:
3603 * hash form:
3604 * :partial_input => true # source buffer may be part of larger source
3605 * :after_output => true # stop conversion after output before input
3606 * integer form:
3607 * Encoding::Converter::PARTIAL_INPUT
3608 * Encoding::Converter::AFTER_OUTPUT
3609 *
3610 * possible results:
3611 * :invalid_byte_sequence
3612 * :incomplete_input
3613 * :undefined_conversion
3614 * :after_output
3615 * :destination_buffer_full
3616 * :source_buffer_empty
3617 * :finished
3618 *
3619 * primitive_convert converts source_buffer into destination_buffer.
3620 *
3621 * source_buffer should be a string or nil.
3622 * nil means an empty string.
3623 *
3624 * destination_buffer should be a string.
3625 *
3626 * destination_byteoffset should be an integer or nil.
3627 * nil means the end of destination_buffer.
3628 * If it is omitted, nil is assumed.
3629 *
3630 * destination_bytesize should be an integer or nil.
3631 * nil means unlimited.
3632 * If it is omitted, nil is assumed.
3633 *
3634 * opt should be nil, a hash or an integer.
3635 * nil means no flags.
3636 * If it is omitted, nil is assumed.
3637 *
3638 * primitive_convert converts the content of source_buffer from the beginning
3639 * and stores the result into destination_buffer.
3640 *
3641 * destination_byteoffset and destination_bytesize specify the region which
3642 * the converted result is stored.
3643 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3644 * If destination_byteoffset is nil,
3645 * destination_buffer.bytesize is used for appending the result.
3646 * destination_bytesize specifies maximum number of bytes.
3647 * If destination_bytesize is nil,
3648 * destination size is unlimited.
3649 * After conversion, destination_buffer is resized to
3650 * destination_byteoffset + actually produced number of bytes.
3651 * Also destination_buffer's encoding is set to destination_encoding.
3652 *
3653 * primitive_convert drops the converted part of source_buffer.
3654 * the dropped part is converted in destination_buffer or
3655 * buffered in Encoding::Converter object.
3656 *
3657 * primitive_convert stops conversion when one of the following conditions is met.
3658 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3659 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3660 * - unexpected end of source buffer (:incomplete_input)
3661 * this occur only when :partial_input is not specified.
3662 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3663 * - character not representable in output encoding (:undefined_conversion)
3664 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3665 * - after some output is generated, before input is done (:after_output)
3666 * this occur only when :after_output is specified.
3667 * - destination buffer is full (:destination_buffer_full)
3668 * this occur only when destination_bytesize is non-nil.
3669 * - source buffer is empty (:source_buffer_empty)
3670 * this occur only when :partial_input is specified.
3671 * - conversion is finished (:finished)
3672 *
3673 * example:
3674 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3675 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3676 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3677 *
3678 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3679 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3680 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3681 * ret = ec.primitive_convert(src, dst="", nil, 1)
3682 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3683 * ret = ec.primitive_convert(src, dst="", nil, 1)
3684 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3685 * ret = ec.primitive_convert(src, dst="", nil, 1)
3686 * p [ret, src, dst] #=> [:finished, "", "i"]
3687 *
3688 */
3689static VALUE
3690econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3691{
3692 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3693 rb_econv_t *ec = check_econv(self);
3695 const unsigned char *ip, *is;
3696 unsigned char *op, *os;
3697 long output_byteoffset, output_bytesize;
3698 unsigned long output_byteend;
3699 int flags;
3700
3701 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3702
3703 if (NIL_P(output_byteoffset_v))
3704 output_byteoffset = 0; /* dummy */
3705 else
3706 output_byteoffset = NUM2LONG(output_byteoffset_v);
3707
3708 if (NIL_P(output_bytesize_v))
3709 output_bytesize = 0; /* dummy */
3710 else
3711 output_bytesize = NUM2LONG(output_bytesize_v);
3712
3713 if (!NIL_P(flags_v)) {
3714 if (!NIL_P(opt)) {
3715 rb_error_arity(argc + 1, 2, 5);
3716 }
3717 flags = NUM2INT(rb_to_int(flags_v));
3718 }
3719 else if (!NIL_P(opt)) {
3720 VALUE v;
3721 flags = 0;
3722 v = rb_hash_aref(opt, sym_partial_input);
3723 if (RTEST(v))
3724 flags |= ECONV_PARTIAL_INPUT;
3725 v = rb_hash_aref(opt, sym_after_output);
3726 if (RTEST(v))
3727 flags |= ECONV_AFTER_OUTPUT;
3728 }
3729 else {
3730 flags = 0;
3731 }
3732
3733 StringValue(output);
3734 if (!NIL_P(input))
3736 rb_str_modify(output);
3737
3738 if (NIL_P(output_bytesize_v)) {
3739 output_bytesize = RSTRING_EMBED_LEN_MAX;
3740 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3741 output_bytesize = RSTRING_LEN(input);
3742 }
3743
3744 retry:
3745
3746 if (NIL_P(output_byteoffset_v))
3747 output_byteoffset = RSTRING_LEN(output);
3748
3749 if (output_byteoffset < 0)
3750 rb_raise(rb_eArgError, "negative output_byteoffset");
3751
3752 if (RSTRING_LEN(output) < output_byteoffset)
3753 rb_raise(rb_eArgError, "output_byteoffset too big");
3754
3755 if (output_bytesize < 0)
3756 rb_raise(rb_eArgError, "negative output_bytesize");
3757
3758 output_byteend = (unsigned long)output_byteoffset +
3759 (unsigned long)output_bytesize;
3760
3761 if (output_byteend < (unsigned long)output_byteoffset ||
3762 LONG_MAX < output_byteend)
3763 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3764
3765 if (rb_str_capacity(output) < output_byteend)
3766 rb_str_resize(output, output_byteend);
3767
3768 if (NIL_P(input)) {
3769 ip = is = NULL;
3770 }
3771 else {
3772 ip = (const unsigned char *)RSTRING_PTR(input);
3773 is = ip + RSTRING_LEN(input);
3774 }
3775
3776 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3777 os = op + output_bytesize;
3778
3779 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3780 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3781 if (!NIL_P(input)) {
3782 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3783 }
3784
3785 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3786 if (LONG_MAX / 2 < output_bytesize)
3787 rb_raise(rb_eArgError, "too long conversion result");
3788 output_bytesize *= 2;
3789 output_byteoffset_v = Qnil;
3790 goto retry;
3791 }
3792
3793 if (ec->destination_encoding) {
3795 }
3796
3797 return econv_result_to_symbol(res);
3798}
3799
3800/*
3801 * call-seq:
3802 * ec.convert(source_string) -> destination_string
3803 *
3804 * Convert source_string and return destination_string.
3805 *
3806 * source_string is assumed as a part of source.
3807 * i.e. :partial_input=>true is specified internally.
3808 * finish method should be used last.
3809 *
3810 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3811 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3812 * puts ec.finish.dump #=> ""
3813 *
3814 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3815 * puts ec.convert("\xA4").dump #=> ""
3816 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3817 * puts ec.finish.dump #=> ""
3818 *
3819 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3820 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3821 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3822 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3823 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3824 *
3825 * If a conversion error occurs,
3826 * Encoding::UndefinedConversionError or
3827 * Encoding::InvalidByteSequenceError is raised.
3828 * Encoding::Converter#convert doesn't supply methods to recover or restart
3829 * from these exceptions.
3830 * When you want to handle these conversion errors,
3831 * use Encoding::Converter#primitive_convert.
3832 *
3833 */
3834static VALUE
3835econv_convert(VALUE self, VALUE source_string)
3836{
3837 VALUE ret, dst;
3838 VALUE av[5];
3839 int ac;
3840 rb_econv_t *ec = check_econv(self);
3841
3842 StringValue(source_string);
3843
3844 dst = rb_str_new(NULL, 0);
3845
3846 av[0] = rb_str_dup(source_string);
3847 av[1] = dst;
3848 av[2] = Qnil;
3849 av[3] = Qnil;
3851 ac = 5;
3852
3853 ret = econv_primitive_convert(ac, av, self);
3854
3855 if (ret == sym_invalid_byte_sequence ||
3856 ret == sym_undefined_conversion ||
3857 ret == sym_incomplete_input) {
3858 VALUE exc = make_econv_exception(ec);
3860 }
3861
3862 if (ret == sym_finished) {
3863 rb_raise(rb_eArgError, "converter already finished");
3864 }
3865
3866 if (ret != sym_source_buffer_empty) {
3867 rb_bug("unexpected result of econv_primitive_convert");
3868 }
3869
3870 return dst;
3871}
3872
3873/*
3874 * call-seq:
3875 * ec.finish -> string
3876 *
3877 * Finishes the converter.
3878 * It returns the last part of the converted string.
3879 *
3880 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3881 * p ec.convert("\u3042") #=> "\e$B$\""
3882 * p ec.finish #=> "\e(B"
3883 */
3884static VALUE
3885econv_finish(VALUE self)
3886{
3887 VALUE ret, dst;
3888 VALUE av[5];
3889 int ac;
3890 rb_econv_t *ec = check_econv(self);
3891
3892 dst = rb_str_new(NULL, 0);
3893
3894 av[0] = Qnil;
3895 av[1] = dst;
3896 av[2] = Qnil;
3897 av[3] = Qnil;
3898 av[4] = INT2FIX(0);
3899 ac = 5;
3900
3901 ret = econv_primitive_convert(ac, av, self);
3902
3903 if (ret == sym_invalid_byte_sequence ||
3904 ret == sym_undefined_conversion ||
3905 ret == sym_incomplete_input) {
3906 VALUE exc = make_econv_exception(ec);
3908 }
3909
3910 if (ret != sym_finished) {
3911 rb_bug("unexpected result of econv_primitive_convert");
3912 }
3913
3914 return dst;
3915}
3916
3917/*
3918 * call-seq:
3919 * ec.primitive_errinfo -> array
3920 *
3921 * primitive_errinfo returns important information regarding the last error
3922 * as a 5-element array:
3923 *
3924 * [result, enc1, enc2, error_bytes, readagain_bytes]
3925 *
3926 * result is the last result of primitive_convert.
3927 *
3928 * Other elements are only meaningful when result is
3929 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3930 *
3931 * enc1 and enc2 indicate a conversion step as a pair of strings.
3932 * For example, a converter from EUC-JP to ISO-8859-1 converts
3933 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3934 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3935 *
3936 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3937 * error_bytes is discarded portion.
3938 * readagain_bytes is buffered portion which is read again on next conversion.
3939 *
3940 * Example:
3941 *
3942 * # \xff is invalid as EUC-JP.
3943 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3944 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3945 * p ec.primitive_errinfo
3946 * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3947 *
3948 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3949 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3950 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3951 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3952 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3953 * p ec.primitive_errinfo
3954 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3955 *
3956 * # partial character is invalid
3957 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3958 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3959 * p ec.primitive_errinfo
3960 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3961 *
3962 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3963 * # partial characters.
3964 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3965 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3966 * p ec.primitive_errinfo
3967 * #=> [:source_buffer_empty, nil, nil, nil, nil]
3968 *
3969 * # \xd8\x00\x00@ is invalid as UTF-16BE because
3970 * # no low surrogate after high surrogate (\xd8\x00).
3971 * # It is detected by 3rd byte (\00) which is part of next character.
3972 * # So the high surrogate (\xd8\x00) is discarded and
3973 * # the 3rd byte is read again later.
3974 * # Since the byte is buffered in ec, it is dropped from src.
3975 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3976 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3977 * p ec.primitive_errinfo
3978 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3979 * p src
3980 * #=> "@"
3981 *
3982 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3983 * # The problem is detected by 4th byte.
3984 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3985 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3986 * p ec.primitive_errinfo
3987 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3988 * p src
3989 * #=> ""
3990 *
3991 */
3992static VALUE
3993econv_primitive_errinfo(VALUE self)
3994{
3995 rb_econv_t *ec = check_econv(self);
3996
3997 VALUE ary;
3998
3999 ary = rb_ary_new2(5);
4000
4001 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4002 rb_ary_store(ary, 4, Qnil);
4003
4006
4009
4013 }
4014
4015 return ary;
4016}
4017
4018/*
4019 * call-seq:
4020 * ec.insert_output(string) -> nil
4021 *
4022 * Inserts string into the encoding converter.
4023 * The string will be converted to the destination encoding and
4024 * output on later conversions.
4025 *
4026 * If the destination encoding is stateful,
4027 * string is converted according to the state and the state is updated.
4028 *
4029 * This method should be used only when a conversion error occurs.
4030 *
4031 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4032 * src = "HIRAGANA LETTER A is \u{3042}."
4033 * dst = ""
4034 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4035 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4036 * ec.insert_output("<err>")
4037 * p ec.primitive_convert(src, dst) #=> :finished
4038 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4039 *
4040 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4041 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4042 * dst = ""
4043 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4044 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4045 * ec.insert_output "?" # state change required to output "?".
4046 * p ec.primitive_convert(src, dst) #=> :finished
4047 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4048 *
4049 */
4050static VALUE
4051econv_insert_output(VALUE self, VALUE string)
4052{
4053 const char *insert_enc;
4054
4055 int ret;
4056
4057 rb_econv_t *ec = check_econv(self);
4058
4059 StringValue(string);
4061 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4062
4063 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4064 if (ret == -1) {
4065 rb_raise(rb_eArgError, "too big string");
4066 }
4067
4068 return Qnil;
4069}
4070
4071/*
4072 * call-seq:
4073 * ec.putback -> string
4074 * ec.putback(max_numbytes) -> string
4075 *
4076 * Put back the bytes which will be converted.
4077 *
4078 * The bytes are caused by invalid_byte_sequence error.
4079 * When invalid_byte_sequence error, some bytes are discarded and
4080 * some bytes are buffered to be converted later.
4081 * The latter bytes can be put back.
4082 * It can be observed by
4083 * Encoding::InvalidByteSequenceError#readagain_bytes and
4084 * Encoding::Converter#primitive_errinfo.
4085 *
4086 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4087 * src = "\x00\xd8\x61\x00"
4088 * dst = ""
4089 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4090 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4091 * p ec.putback #=> "a\x00"
4092 * p ec.putback #=> "" # no more bytes to put back
4093 *
4094 */
4095static VALUE
4096econv_putback(int argc, VALUE *argv, VALUE self)
4097{
4098 rb_econv_t *ec = check_econv(self);
4099 int n;
4100 int putbackable;
4101 VALUE str, max;
4102
4103 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4105 }
4106 else {
4107 n = NUM2INT(max);
4108 putbackable = rb_econv_putbackable(ec);
4109 if (putbackable < n)
4110 n = putbackable;
4111 }
4112
4113 str = rb_str_new(NULL, n);
4114 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4115
4116 if (ec->source_encoding) {
4118 }
4119
4120 return str;
4121}
4122
4123/*
4124 * call-seq:
4125 * ec.last_error -> exception or nil
4126 *
4127 * Returns an exception object for the last conversion.
4128 * Returns nil if the last conversion did not produce an error.
4129 *
4130 * "error" means that
4131 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4132 * Encoding::Converter#convert and
4133 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4134 * Encoding::Converter#primitive_convert.
4135 *
4136 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4137 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4138 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4139 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4140 * p ec.last_error #=> nil
4141 *
4142 */
4143static VALUE
4144econv_last_error(VALUE self)
4145{
4146 rb_econv_t *ec = check_econv(self);
4147 VALUE exc;
4148
4149 exc = make_econv_exception(ec);
4150 if (NIL_P(exc))
4151 return Qnil;
4152 return exc;
4153}
4154
4155/*
4156 * call-seq:
4157 * ec.replacement -> string
4158 *
4159 * Returns the replacement string.
4160 *
4161 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4162 * p ec.replacement #=> "?"
4163 *
4164 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4165 * p ec.replacement #=> "\uFFFD"
4166 */
4167static VALUE
4168econv_get_replacement(VALUE self)
4169{
4170 rb_econv_t *ec = check_econv(self);
4171 int ret;
4172 rb_encoding *enc;
4173
4174 ret = make_replacement(ec);
4175 if (ret == -1) {
4176 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4177 }
4178
4180 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4181}
4182
4183/*
4184 * call-seq:
4185 * ec.replacement = string
4186 *
4187 * Sets the replacement string.
4188 *
4189 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4190 * ec.replacement = "<undef>"
4191 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4192 */
4193static VALUE
4194econv_set_replacement(VALUE self, VALUE arg)
4195{
4196 rb_econv_t *ec = check_econv(self);
4197 VALUE string = arg;
4198 int ret;
4199 rb_encoding *enc;
4200
4201 StringValue(string);
4202 enc = rb_enc_get(string);
4203
4205 (const unsigned char *)RSTRING_PTR(string),
4206 RSTRING_LEN(string),
4207 rb_enc_name(enc));
4208
4209 if (ret == -1) {
4210 /* xxx: rb_eInvalidByteSequenceError? */
4211 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4212 }
4213
4214 return arg;
4215}
4216
4217VALUE
4219{
4220 return make_econv_exception(ec);
4221}
4222
4223void
4225{
4226 VALUE exc;
4227
4228 exc = make_econv_exception(ec);
4229 if (NIL_P(exc))
4230 return;
4232}
4233
4234/*
4235 * call-seq:
4236 * ecerr.source_encoding_name -> string
4237 *
4238 * Returns the source encoding name as a string.
4239 */
4240static VALUE
4241ecerr_source_encoding_name(VALUE self)
4242{
4243 return rb_attr_get(self, rb_intern("source_encoding_name"));
4244}
4245
4246/*
4247 * call-seq:
4248 * ecerr.source_encoding -> encoding
4249 *
4250 * Returns the source encoding as an encoding object.
4251 *
4252 * Note that the result may not be equal to the source encoding of
4253 * the encoding converter if the conversion has multiple steps.
4254 *
4255 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4256 * begin
4257 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4258 * rescue Encoding::UndefinedConversionError
4259 * p $!.source_encoding #=> #<Encoding:UTF-8>
4260 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4261 * p $!.source_encoding_name #=> "UTF-8"
4262 * p $!.destination_encoding_name #=> "EUC-JP"
4263 * end
4264 *
4265 */
4266static VALUE
4267ecerr_source_encoding(VALUE self)
4268{
4269 return rb_attr_get(self, rb_intern("source_encoding"));
4270}
4271
4272/*
4273 * call-seq:
4274 * ecerr.destination_encoding_name -> string
4275 *
4276 * Returns the destination encoding name as a string.
4277 */
4278static VALUE
4279ecerr_destination_encoding_name(VALUE self)
4280{
4281 return rb_attr_get(self, rb_intern("destination_encoding_name"));
4282}
4283
4284/*
4285 * call-seq:
4286 * ecerr.destination_encoding -> encoding
4287 *
4288 * Returns the destination encoding as an encoding object.
4289 */
4290static VALUE
4291ecerr_destination_encoding(VALUE self)
4292{
4293 return rb_attr_get(self, rb_intern("destination_encoding"));
4294}
4295
4296/*
4297 * call-seq:
4298 * ecerr.error_char -> string
4299 *
4300 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4301 *
4302 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4303 * begin
4304 * ec.convert("\xa0")
4305 * rescue Encoding::UndefinedConversionError
4306 * puts $!.error_char.dump #=> "\xC2\xA0"
4307 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4308 * end
4309 *
4310 */
4311static VALUE
4312ecerr_error_char(VALUE self)
4313{
4314 return rb_attr_get(self, rb_intern("error_char"));
4315}
4316
4317/*
4318 * call-seq:
4319 * ecerr.error_bytes -> string
4320 *
4321 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4322 *
4323 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4324 * begin
4325 * ec.convert("abc\xA1\xFFdef")
4326 * rescue Encoding::InvalidByteSequenceError
4327 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4328 * puts $!.error_bytes.dump #=> "\xA1"
4329 * puts $!.readagain_bytes.dump #=> "\xFF"
4330 * end
4331 */
4332static VALUE
4333ecerr_error_bytes(VALUE self)
4334{
4335 return rb_attr_get(self, rb_intern("error_bytes"));
4336}
4337
4338/*
4339 * call-seq:
4340 * ecerr.readagain_bytes -> string
4341 *
4342 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4343 */
4344static VALUE
4345ecerr_readagain_bytes(VALUE self)
4346{
4347 return rb_attr_get(self, rb_intern("readagain_bytes"));
4348}
4349
4350/*
4351 * call-seq:
4352 * ecerr.incomplete_input? -> true or false
4353 *
4354 * Returns true if the invalid byte sequence error is caused by
4355 * premature end of string.
4356 *
4357 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4358 *
4359 * begin
4360 * ec.convert("abc\xA1z")
4361 * rescue Encoding::InvalidByteSequenceError
4362 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4363 * p $!.incomplete_input? #=> false
4364 * end
4365 *
4366 * begin
4367 * ec.convert("abc\xA1")
4368 * ec.finish
4369 * rescue Encoding::InvalidByteSequenceError
4370 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4371 * p $!.incomplete_input? #=> true
4372 * end
4373 */
4374static VALUE
4375ecerr_incomplete_input(VALUE self)
4376{
4377 return rb_attr_get(self, rb_intern("incomplete_input"));
4378}
4379
4380/*
4381 * Document-class: Encoding::UndefinedConversionError
4382 *
4383 * Raised by Encoding and String methods when a transcoding operation
4384 * fails.
4385 */
4386
4387/*
4388 * Document-class: Encoding::InvalidByteSequenceError
4389 *
4390 * Raised by Encoding and String methods when the string being
4391 * transcoded contains a byte invalid for either the source or
4392 * target encoding.
4393 */
4394
4395/*
4396 * Document-class: Encoding::ConverterNotFoundError
4397 *
4398 * Raised by transcoding methods when a named encoding does not
4399 * correspond with a known converter.
4400 */
4401
4402#undef rb_intern
4403void
4405{
4406 transcoder_table = st_init_strcasetable();
4407
4408 sym_invalid = ID2SYM(rb_intern("invalid"));
4409 sym_undef = ID2SYM(rb_intern("undef"));
4410 sym_replace = ID2SYM(rb_intern("replace"));
4411 sym_fallback = ID2SYM(rb_intern("fallback"));
4412 sym_xml = ID2SYM(rb_intern("xml"));
4413 sym_text = ID2SYM(rb_intern("text"));
4414 sym_attr = ID2SYM(rb_intern("attr"));
4415
4416 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4417 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4418 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4419 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4420 sym_finished = ID2SYM(rb_intern("finished"));
4421 sym_after_output = ID2SYM(rb_intern("after_output"));
4422 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4423 sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4424 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4425 sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4426 sym_partial_input = ID2SYM(rb_intern("partial_input"));
4427
4428#ifdef ENABLE_ECONV_NEWLINE_OPTION
4429 sym_newline = ID2SYM(rb_intern("newline"));
4430 sym_universal = ID2SYM(rb_intern("universal"));
4431 sym_crlf = ID2SYM(rb_intern("crlf"));
4432 sym_cr = ID2SYM(rb_intern("cr"));
4433 sym_lf = ID2SYM(rb_intern("lf"));
4434#endif
4435
4436 InitVM(transcode);
4437}
4438
4439void
4441{
4442 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4443 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4444 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4445
4446 rb_define_method(rb_cString, "encode", str_encode, -1);
4447 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4448
4451 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4452 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4453 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4454 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4455 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4456 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4457 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4458 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4459 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4460 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4461 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4462 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4463 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4464 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4465 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4466 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4467 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4468
4469 /* Document-const: INVALID_MASK
4470 *
4471 * Mask for invalid byte sequences
4472 */
4474
4475 /* Document-const: INVALID_REPLACE
4476 *
4477 * Replace invalid byte sequences
4478 */
4480
4481 /* Document-const: UNDEF_MASK
4482 *
4483 * Mask for a valid character in the source encoding but no related
4484 * character(s) in destination encoding.
4485 */
4487
4488 /* Document-const: UNDEF_REPLACE
4489 *
4490 * Replace byte sequences that are undefined in the destination encoding.
4491 */
4493
4494 /* Document-const: UNDEF_HEX_CHARREF
4495 *
4496 * Replace byte sequences that are undefined in the destination encoding
4497 * with an XML hexadecimal character reference. This is valid for XML
4498 * conversion.
4499 */
4501
4502 /* Document-const: PARTIAL_INPUT
4503 *
4504 * Indicates the source may be part of a larger string. See
4505 * primitive_convert for an example.
4506 */
4508
4509 /* Document-const: AFTER_OUTPUT
4510 *
4511 * Stop converting after some output is complete but before all of the
4512 * input was consumed. See primitive_convert for an example.
4513 */
4515
4516 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4517 *
4518 * Decorator for converting CRLF and CR to LF
4519 */
4521
4522 /* Document-const: CRLF_NEWLINE_DECORATOR
4523 *
4524 * Decorator for converting LF to CRLF
4525 */
4527
4528 /* Document-const: CR_NEWLINE_DECORATOR
4529 *
4530 * Decorator for converting LF to CR
4531 */
4533
4534 /* Document-const: XML_TEXT_DECORATOR
4535 *
4536 * Escape as XML CharData
4537 */
4539
4540 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4541 *
4542 * Escape as XML AttValue
4543 */
4545
4546 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4547 *
4548 * Escape as XML AttValue
4549 */
4551
4552 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4553 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4554 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4555 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4556 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4557
4558 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4559 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4560 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4561 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4562 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4563 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4564 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4565
4566 Init_newline();
4567}
#define fail()
struct RIMemo * ptr
Definition: debug.c:65
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1032
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:779
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:197
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:866
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1328
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:609
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:872
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:728
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:462
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1521
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1004
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:245
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:116
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:838
int rb_enc_find_index(const char *name)
Definition: encoding.c:693
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:408
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:416
#define ENC_CODERANGE_7BIT
Definition: encoding.h:104
#define ENC_CODERANGE_VALID
Definition: encoding.h:105
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:402
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:406
#define ECONV_INVALID_MASK
Definition: encoding.h:393
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:403
rb_econv_result_t
Definition: encoding.h:297
@ econv_incomplete_input
Definition: encoding.h:304
@ econv_finished
Definition: encoding.h:302
@ econv_undefined_conversion
Definition: encoding.h:299
@ econv_after_output
Definition: encoding.h:303
@ econv_source_buffer_empty
Definition: encoding.h:301
@ econv_destination_buffer_full
Definition: encoding.h:300
@ econv_invalid_byte_sequence
Definition: encoding.h:298
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:396
int rb_enc_str_coderange(VALUE)
Definition: string.c:657
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:405
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:404
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:796
#define rb_enc_name(enc)
Definition: encoding.h:177
#define ECONV_INVALID_REPLACE
Definition: encoding.h:394
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:208
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:192
#define rb_enc_asciicompat(enc)
Definition: encoding.h:245
struct rb_econv_t rb_econv_t
Definition: encoding.h:307
#define ECONV_UNDEF_MASK
Definition: encoding.h:395
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:415
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:392
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:106
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:567
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:191
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:397
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:399
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:110
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
VALUE rb_define_class_under(VALUE, const char *, VALUE)
Defines a class under the namespace of outer.
Definition: class.c:711
VALUE rb_cData
Data class.
Definition: ruby.h:2020
VALUE rb_cString
Definition: ruby.h:2046
@ RSTRING_EMBED_LEN_MAX
Definition: ruby.h:982
VALUE rb_cEncoding
Definition: encoding.c:46
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2671
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:668
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:874
void rb_bug(const char *fmt,...)
Definition: error.c:636
VALUE rb_eTypeError
Definition: error.c:924
VALUE rb_eRuntimeError
Definition: error.c:922
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:891
VALUE rb_exc_new_str(VALUE, VALUE)
Definition: error.c:974
VALUE rb_eArgError
Definition: error.c:925
VALUE rb_eEncodingError
Definition: error.c:930
VALUE rb_obj_class(VALUE)
Equivalent to Object#class in Ruby.
Definition: object.c:217
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3021
unsigned int input
Definition: nkf.c:4325
const char * name
Definition: nkf.c:208
#define RARRAY_LEN(a)
#define rb_str_new2
#define MEMCPY(p1, p2, type, n)
void Init_newline(void)
#define NULL
int memcmp(const void *, const void *, size_t)
Definition: memcmp.c:7
#define dp(v)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2709
use StringValue() instead")))
#define RSTRING_LEN(str)
#define RTEST(v)
#define ALLOCA_N(type, n)
#define bp()
unsigned long st_data_t
size_t strlen(const char *)
int strcmp(const char *, const char *)
VALUE rb_assoc_new(VALUE, VALUE)
Definition: array.c:896
VALUE rb_hash_aref(VALUE, VALUE)
Definition: hash.c:2037
#define RARRAY_LENINT(ary)
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:1391
VALUE rb_require_string(VALUE)
Definition: load.c:1145
#define xfree
#define Qundef
const struct rb_call_cache * cc
#define rb_str_cat2
#define RSTRING_END(str)
#define rb_check_frozen(obj)
#define SIZE_MAX
#define RSTRING_PTR(str)
#define xrealloc
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:1459
int snprintf(char *__restrict__, size_t, const char *__restrict__,...) __attribute__((__format__(__printf__
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:152
#define rb_str_new(str, len)
#define NIL_P(v)
VALUE rb_method_call(int, const VALUE *, VALUE)
Definition: proc.c:2273
#define ID2SYM(x)
#define LONG_MAX
#define REALLOC_N(var, type, n)
const char size_t n
void rb_str_set_len(VALUE, long)
Definition: string.c:2692
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:2207
unsigned long VALUE
VALUE rb_ary_push(VALUE, VALUE)
Definition: array.c:1195
#define rb_ary_new4
__inline__ const void *__restrict__ src
VALUE rb_sym2str(VALUE)
Definition: symbol.c:784
void rb_str_modify(VALUE)
Definition: string.c:2114
VALUE rb_str_buf_new(long)
Definition: string.c:1315
#define xmalloc
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
uint32_t i
#define char
__inline__ const void *__restrict__ size_t len
#define OBJ_FROZEN(x)
const char * rb_obj_classname(VALUE)
Definition: variable.c:289
#define ALLOC_N(type, n)
#define OBJ_FREEZE(x)
#define INT2NUM(x)
#define T_HASH
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2891
#define long
#define NUM2INT(x)
void rb_define_singleton_method(VALUE, const char *, VALUE(*)(), int)
#define RB_GC_GUARD(v)
#define RUBY_TYPED_FREE_IMMEDIATELY
VALUE rb_hash_freeze(VALUE)
Definition: hash.c:87
#define TypedData_Get_Struct(obj, type, data_type, sval)
#define PRIsVALUE
void * memset(void *, int, size_t)
int VALUE v
VALUE rb_ary_new(void)
Definition: array.c:723
#define rb_scan_args(argc, argvp, fmt,...)
#define rb_exc_new3
VALUE rb_str_tmp_new(long)
Definition: string.c:1343
#define rb_intern(str)
#define INT_MAX
#define TypedData_Wrap_Struct(klass, data_type, sval)
VALUE rb_str_catf(VALUE, const char *,...) __attribute__((format(printf
#define TRUE
#define FALSE
unsigned int size
#define Qtrue
#define MEMMOVE(p1, p2, type, n)
long unsigned int size_t
struct rb_call_cache buf
VALUE rb_str_dump(VALUE)
Definition: string.c:6042
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:1203
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1084
#define Qnil
#define Qfalse
#define DATA_PTR(dta)
#define T_ARRAY
void * memcpy(void *__restrict__, const void *__restrict__, size_t)
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:4573
VALUE rb_check_hash_type(VALUE)
Definition: hash.c:1852
#define RB_TYPE_P(obj, type)
#define INT2FIX(i)
VALUE rb_check_array_type(VALUE)
Definition: array.c:909
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:966
#define ALLOC(type)
#define T_SYMBOL
#define PRIdPTRDIFF
const VALUE * argv
#define SYMBOL_P(x)
_ssize_t ssize_t
__inline__ int
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1300
VALUE rb_hash_aset(VALUE, VALUE, VALUE)
Definition: hash.c:2852
#define rb_check_arity
VALUE rb_str_dup(VALUE)
Definition: string.c:1516
VALUE rb_sprintf(const char *,...) __attribute__((format(printf
#define InitVM(ext)
size_t st_index_t h
const rb_iseq_t const VALUE exc
#define RBASIC_SET_CLASS(obj, cls)
#define NUM2LONG(x)
void rb_define_method(VALUE, const char *, VALUE(*)(), int)
#define rb_ary_new2
#define RARRAY_AREF(a, i)
VALUE rb_hash_new(void)
Definition: hash.c:1523
#define rb_str_new_cstr(str)
struct iseq_catch_table_entry entries[]
void rb_ary_store(VALUE, long, VALUE)
Definition: array.c:1079
VALUE rb_funcallv_public(VALUE, ID, int, const VALUE *)
Calls a method.
Definition: vm_eval.c:980
VALUE rb_ary_entry(VALUE, long)
Definition: array.c:1512
#define StringValueCStr(v)
#define f
void st_free_table(st_table *tab)
Definition: st.c:709
void st_add_direct(st_table *tab, st_data_t key, st_data_t value)
Definition: st.c:1251
int st_lookup(st_table *tab, st_data_t key, st_data_t *value)
Definition: st.c:1101
int st_foreach(st_table *tab, st_foreach_callback_func *func, st_data_t arg)
Definition: st.c:1717
st_table * st_init_strcasetable(void)
Definition: st.c:683
size_t rb_str_capacity(VALUE str)
Definition: string.c:712
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:10255
const char * ascii_compat_name
Definition: transcode.c:1747
const char * ascii_incompat_name
Definition: transcode.c:1748
unsigned char * out_data_start
Definition: transcode.c:105
struct rb_transcoding * tc
Definition: transcode.c:103
unsigned char * out_buf_start
Definition: transcode.c:104
rb_econv_result_t last_result
Definition: transcode.c:108
unsigned char * out_buf_end
Definition: transcode.c:107
unsigned char * out_data_end
Definition: transcode.c:106
rb_encoding * destination_encoding
Definition: transcode.c:147
unsigned char * in_buf_start
Definition: transcode.c:122
size_t error_bytes_len
Definition: transcode.c:140
const char * source_encoding_name
Definition: transcode.c:115
size_t readagain_len
Definition: transcode.c:141
struct rb_econv_t::@232 last_error
unsigned char * in_buf_end
Definition: transcode.c:125
size_t replacement_len
Definition: transcode.c:119
struct rb_transcoding * error_tc
Definition: transcode.c:136
int num_trans
Definition: transcode.c:129
unsigned char * in_data_start
Definition: transcode.c:123
rb_encoding * source_encoding
Definition: transcode.c:146
rb_econv_elem_t * elems
Definition: transcode.c:126
int started
Definition: transcode.c:113
const char * replacement_enc
Definition: transcode.c:120
const char * source_encoding
Definition: transcode.c:137
int replacement_allocated
Definition: transcode.c:127
const char * destination_encoding
Definition: transcode.c:138
rb_econv_result_t result
Definition: transcode.c:135
const unsigned char * replacement_str
Definition: transcode.c:118
struct rb_transcoding * last_tc
Definition: transcode.c:131
unsigned char * in_data_end
Definition: transcode.c:124
int num_allocated
Definition: transcode.c:128
const unsigned char * error_bytes_start
Definition: transcode.c:139
const char * destination_encoding_name
Definition: transcode.c:116
int num_finished
Definition: transcode.c:130
const char * dst_encoding
const char * src_encoding
rb_transcoder_asciicompat_type_t asciicompat_type
unsigned int output_index
Definition: transcode.c:62
ssize_t recognized_len
Definition: transcode.c:64
unsigned char next_byte
Definition: transcode.c:61
unsigned int next_table
Definition: transcode.c:59
int resume_position
Definition: transcode.c:58
union rb_transcoding::@230 readbuf
unsigned char ary[8]
Definition: transcode.c:67
ssize_t writebuf_len
Definition: transcode.c:72
VALUE next_info
Definition: transcode.c:60
const rb_transcoder * transcoder
Definition: transcode.c:54
union rb_transcoding::@231 writebuf
unsigned char * ptr
Definition: transcode.c:68
ssize_t readagain_len
Definition: transcode.c:65
union rb_transcoding::rb_transcoding_state_t state
ssize_t writebuf_off
Definition: transcode.c:71
st_table * visited
Definition: transcode.c:249
search_path_queue_t * queue
Definition: transcode.c:250
const char * base_enc
Definition: transcode.c:252
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:251
const char * enc
Definition: transcode.c:245
struct search_path_queue_tag * next
Definition: transcode.c:244
Definition: string.c:6989
transcoder_entry_t ** entries
Definition: transcode.c:955
int num_additional
Definition: transcode.c:956
Definition: transcode.c:156
const char * sname
Definition: transcode.c:157
const rb_transcoder * transcoder
Definition: transcode.c:160
const char * dname
Definition: transcode.c:158
const char * lib
Definition: transcode.c:159
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:88
#define BL_ACTION(byte)
#define writebuf_len
#define hash_fallback
Definition: transcode.c:2228
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1052
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2019
#define BL_MIN_BYTE
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1486
#define next_info
#define TRANSCODING_STATE(tc)
Definition: transcode.c:97
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1725
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3167
#define SUSPEND_AFTER_OUTPUT(num)
#define SUSPEND_OBUF(num)
VALUE rb_cEncodingConverter
Definition: transcode.c:25
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1848
#define next_table
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:233
#define BYTE_ADDR(index)
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2561
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:92
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1703
#define DECORATOR_P(sname, dname)
Definition: transcode.c:154
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1570
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1860
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1839
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1908
#define SUSPEND(ret, num)
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1925
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1891
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2870
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4218
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4224
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2555
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1854
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1429
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:84
void Init_transcode(void)
Definition: transcode.c:4404
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1011
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2510
struct rb_transcoding rb_transcoding
void InitVM_transcode(void)
Definition: transcode.c:4440
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1685
struct search_path_queue_tag search_path_queue_t
#define encoding_equal(enc1, enc2)
Definition: transcode.c:241
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:205
#define next_byte
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1736
#define writebuf_off
VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
Definition: transcode.c:1796
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2181
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1769
#define BL_MAX_BYTE
#define FUNio
#define FOURbt
#define FUNso
#define FUNsio
#define STR1
#define getBT3(a)
#define getBT2(a)
#define ZERObt
#define getBT1(a)
#define getGB4bt2(a)
#define getBT0(a)
#define getGB4bt1(a)
#define TWObt
#define FUNsi
#define STR1_LENGTH(byte_addr)
#define ONEbt
#define UNDEF
#define THREEbt
#define NOMAP
#define getGB4bt0(a)
@ asciicompat_encoder
@ asciicompat_decoder
#define INVALID
#define FUNii
#define STR1_BYTEINDEX(w)
#define getGB4bt3(a)
#define GB4bt
char ary[sizeof(double) > sizeof(void *) ? sizeof(double) :sizeof(void *)]
Definition: transcode.c:80
VALUE(* fallback_func)(VALUE obj, VALUE name)
Definition: variable.c:127
MJIT_STATIC void rb_error_arity(int argc, int min, int max)