summaryrefslogtreecommitdiff
path: root/transcode_data.h
blob: 6081aec81dcf13a88d99d912d1fa42583339938c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/**********************************************************************

  transcode_data.h -

  $Author$
  created at: Mon 10 Dec 2007 14:01:47 JST 2007

  Copyright (C) 2007 Martin Duerst

**********************************************************************/

#include "ruby/ruby.h"

#ifndef RUBY_TRANSCODE_DATA_H
#define RUBY_TRANSCODE_DATA_H 1

/*
 * Conversion between a word index and its info-word form: the index is
 * shifted left by WORDINDEX_SHIFT_BITS, leaving the low bits clear, and
 * shifted back right to recover it.
 *
 * A byte-lookup entry is addressed as a two-element sequence:
 * element 0 is its "base", element 1 is its "info".
 */
#define WORDINDEX_SHIFT_BITS 2
#define WORDINDEX2INFO(index)    ((index) << WORDINDEX_SHIFT_BITS)
#define INFO2WORDINDEX(word)     ((word) >> WORDINDEX_SHIFT_BITS)
#define BYTE_LOOKUP_BASE(lookup) ((lookup)[0])
#define BYTE_LOOKUP_INFO(lookup) ((lookup)[1])

#ifndef PType
/* data file needs to treat this as a pointer, to remove warnings */
#define PType (unsigned int)
#endif

/*
 * Action tags stored in the low bits of an info word.
 * None of the values is a multiple of 4, so a tag can never be equal to
 * a value produced by WORDINDEX2INFO() (which always has its two low
 * bits clear).
 * ONEbt..FOURbt carry no PType cast because they are only OR-ed into
 * the payload words built by o1()..o4() below.
 */
#define NOMAP	(PType 0x01)	/* single byte direct map */
#define ONEbt	(0x02)		/* one byte payload */
#define TWObt	(0x03)		/* two bytes payload */
#define THREEbt	(0x05)		/* three bytes payload */
#define FOURbt	(0x06)		/* four bytes payload, UTF-8 only, macros start at getBT0 */
#define INVALID	(PType 0x07)	/* invalid byte sequence */
#define UNDEF	(PType 0x09)	/* legal but undefined */
#define ZERObt	(PType 0x0A)	/* zero bytes of payload, i.e. remove */
#define FUNii	(PType 0x0B)	/* function from info to info */
#define FUNsi	(PType 0x0D)	/* function from start to info */
#define FUNio	(PType 0x0E)	/* function from info to output */
#define FUNso	(PType 0x0F)	/* function from start to output */

/*
 * Pack output bytes into a 32-bit info word:
 *   bits  8..15  b1
 *   bits 16..23  b2
 *   bits 24..31  b3
 *   bits  5..7   low three bits of b0 (o4 only)
 *   low bits     the ONEbt/TWObt/THREEbt/FOURbt tag
 * Each argument is first narrowed to unsigned char, so plain (possibly
 * signed) char values are accepted.
 *
 * b3 is additionally widened to unsigned int before being shifted by 24:
 * an unsigned char promotes to signed int, and shifting a value >= 0x80
 * into the sign bit of an int is undefined behaviour.
 */
#define o1(b1)		(PType((((unsigned char)(b1))<<8)|ONEbt))
#define o2(b1,b2)	(PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
#define o3(b1,b2,b3)	(PType(((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned int)(unsigned char)(b3))<<24)|THREEbt)&0xffffffffU))
#define o4(b0,b1,b2,b3)	(PType(((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned int)(unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt)&0xffffffffU))

/* Extract the bytes packed by o1()..o4() from an info word. */
#define getBT1(a)	(((a)>> 8)&0xFF)
#define getBT2(a)	(((a)>>16)&0xFF)
#define getBT3(a)	(((a)>>24)&0xFF)
/* Only three bits of b0 are stored; the high nibble is restored as 0xF0. */
#define getBT0(a)	((((a)>> 5)&0x07)|0xF0)   /* for UTF-8 only!!! */

/* Two payload bytes packed as in o2(), but tagged FUNii instead of TWObt. */
#define o2FUNii(b1,b2)	(PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|FUNii))

/* do we need these??? maybe not, can be done with simple tables */
/* NOTE(review): all three macros are defined with an empty expansion
   and are not referenced anywhere else in this header. */
#define ONETRAIL       /* legal but undefined if one more trailing UTF-8 */
#define TWOTRAIL       /* legal but undefined if two more trailing UTF-8 */
#define THREETRAIL     /* legal but undefined if three more trailing UTF-8 */

/* Classifies a transcoder by which side of the conversion is stateful. */
typedef enum {
  stateless_converter = 0,  /* stateless -> stateless */
  stateful_decoder    = 1,  /* stateful  -> stateless */
  stateful_encoder    = 2   /* stateless -> stateful  */
  /* stateful -> stateful is intentionally omitted. */
} rb_transcoder_stateful_type_t;

/* forward declaration; the full struct is defined after rb_transcoding,
   which only needs a pointer to it */
typedef struct rb_transcoder rb_transcoder;

/* dynamic structure, one per conversion (similar to iconv_t) */
/* may carry conversion state (e.g. for iso-2022-jp) */
typedef struct rb_transcoding {
    /* static descriptor of the encoding pair used by this conversion */
    const rb_transcoder *transcoder;

    int flags;

    /* NOTE(review): the next four fields look like the saved position of
       a suspended conversion so it can be resumed later -- confirm
       against the code that drives the transcoder */
    int resume_position;
    unsigned int next_table;
    VALUE next_info;
    unsigned char next_byte;

    int recognized_len; /* already interpreted */
    int readagain_len; /* not yet interpreted */
    /* input buffer; use TRANSCODING_READBUF() to pick the active member */
    union {
        unsigned char ary[8]; /* max_input <= sizeof(ary) */
        unsigned char *ptr; /* length: max_input */
    } readbuf; /* recognized_len + readagain_len used */

    int writebuf_off;
    int writebuf_len;
    /* output buffer; use TRANSCODING_WRITEBUF() to pick the active member */
    union {
        unsigned char ary[8]; /* max_output <= sizeof(ary) */
        unsigned char *ptr; /* length: max_output */
    } writebuf;

    unsigned char stateful[256]; /* opaque data for stateful encoding */
} rb_transcoding;
/*
 * Select the active member of the readbuf / writebuf union of the
 * rb_transcoding pointed to by `tc`: the inline array when the
 * transcoder's max_input / max_output fits in it, otherwise the
 * separately provided pointer.
 *
 * sizeof is cast to int so the comparison with the (signed int)
 * max_input / max_output is a signed one; without the cast a negative
 * value would be converted to a huge unsigned number and select the
 * pointer member.
 *
 * Both macros evaluate `tc` more than once.
 */
#define TRANSCODING_READBUF(tc) \
    ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
     (tc)->readbuf.ary : \
     (tc)->readbuf.ptr)
#define TRANSCODING_WRITEBUF(tc) \
    ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
     (tc)->writebuf.ary : \
     (tc)->writebuf.ptr)

/* static structure, one per supported encoding pair */
struct rb_transcoder {
    /* the encoding pair this transcoder converts between */
    const char *from_encoding;
    const char *to_encoding;
    /* NOTE(review): conv_tree_start through word_size presumably describe
       the generated lookup tables (byte table, word table and the entry
       point into them) -- confirm against the generated data files */
    unsigned int conv_tree_start;
    const unsigned char *byte_array;
    unsigned int byte_array_length;
    const unsigned int *word_array;
    unsigned int word_array_length;
    int word_size;
    int input_unit_length;
    /* compared with the inline buffer size by TRANSCODING_READBUF() /
       TRANSCODING_WRITEBUF(); also the length of the ptr buffers in
       rb_transcoding */
    int max_input;
    int max_output;
    rb_transcoder_stateful_type_t stateful_type;
    /* callbacks matching the FUNii / FUNsi / FUNio / FUNso action tags */
    VALUE (*func_ii)(rb_transcoding*, VALUE); /* info  -> info   */
    VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info   */
    /* NOTE(review): the last parameter is const although the comment
       calls it the output -- confirm the intended direction */
    int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info  -> output */
    int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
    int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
    int (*resetsize_func)(rb_transcoding*); /* -> len */
    int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
};

/* NOTE(review): presumably announces that library `lib` provides the
   enc1 -> enc2 transcoder -- confirm against the definition. */
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);

/* Takes a pointer to a const transcoder descriptor; the descriptor is
   not modified through this pointer. */
void rb_register_transcoder(const rb_transcoder *tr);

#endif /* RUBY_TRANSCODE_DATA_H */