4273
|
1 /*
|
|
2 UTF-to-VLQ
|
|
3 Public domain
|
|
4 */
|
|
5
|
|
6 #include <stdio.h>
|
|
7 #include <stdlib.h>
|
|
8
|
|
9 #ifdef _WIN32
|
|
10 #include <fcntl.h>
|
|
11 #endif
|
|
12
|
|
/* Scalar aliases used by every reader and writer in this file. */
typedef unsigned char byte;
typedef unsigned long long ULL;

/* A reader yields the next decoded value; a writer emits one value. */
typedef ULL (*in_func_t)(void);
typedef void (*out_func_t)(ULL);

/* Input and output format letters; main() copies them from argv[1][0] and argv[1][1]. */
char in_mode, out_mode;

/* One flag per 7-bit character; main() sets options[c] to 1 for each flag letter given. */
int options[128];

/* Byte <-> value table used by read_translate() and write_translate(). */
ULL translation[256];

/* Named views of individual flags (see their use in main()). */
#define conv_lf  options['L']  /* input CR or CR LF is emitted as LF */
#define conv_cr  options['c']  /* a CR is emitted in front of every LF */
#define bom_in   options['b']  /* first input value must be 0xFEFF and is dropped */
#define bom_out  options['B']  /* 0xFEFF is written before any other output */
#define trans_le options['t']  /* multi-byte translation entries are little-endian */
|
|
29
|
|
30 byte getb(void) {
|
|
31 int x=fgetc(stdin);
|
|
32 if(x==EOF) exit(0);
|
|
33 return x;
|
|
34 }
|
|
35
|
|
36 inline ULL sign_extend(ULL x,int y) {
|
|
37 return x|((x&(1LL<<y))?-1LL<<y:0);
|
|
38 }
|
|
39
|
|
40 ULL read_8bit_raw(void) {
|
|
41 return getb();
|
|
42 }
|
|
43
|
|
44 ULL read_16bit_le_raw(void) {
|
|
45 ULL x=getb();
|
|
46 return x|(getb()<<8);
|
|
47 }
|
|
48
|
|
49 ULL read_16bit_be_raw(void) {
|
|
50 ULL x=getb()<<8;
|
|
51 return x|getb();
|
|
52 }
|
|
53
|
|
54 ULL read_32bit_le_raw(void) {
|
|
55 ULL x=getb();
|
|
56 x|=getb()<<8;
|
|
57 x|=getb()<<16;
|
|
58 return x|(getb()<<24);
|
|
59 }
|
|
60
|
|
61 ULL read_32bit_be_raw(void) {
|
|
62 ULL x=getb()<<24;
|
|
63 x|=getb()<<16;
|
|
64 x|=getb()<<8;
|
|
65 return x|getb();
|
|
66 }
|
|
67
|
|
68 ULL read_64bit_le_raw(void) {
|
|
69 ULL x=getb();
|
|
70 x|=getb()<<8;
|
|
71 x|=getb()<<16;
|
|
72 x|=((ULL)getb())<<24;
|
|
73 x|=((ULL)getb())<<32;
|
|
74 x|=((ULL)getb())<<40;
|
|
75 x|=((ULL)getb())<<48;
|
|
76 x|=((ULL)getb())<<56;
|
|
77 return x;
|
|
78 }
|
|
79
|
|
80 ULL read_64bit_be_raw(void) {
|
|
81 ULL x=((ULL)getb())<<56;
|
|
82 x|=((ULL)getb())<<48;
|
|
83 x|=((ULL)getb())<<40;
|
|
84 x|=((ULL)getb())<<32;
|
|
85 x|=((ULL)getb())<<24;
|
|
86 x|=getb()<<16;
|
|
87 x|=getb()<<8;
|
|
88 return x;
|
|
89 }
|
|
90
|
|
91 ULL read_utf8(void) {
|
|
92 ULL x=getb();
|
|
93 if((x&0xE0)==0xC0) {
|
|
94 x=((x&0x1F)<<6)|(getb()&0x3F);
|
|
95 } else if((x&0xF0)==0xE0) {
|
|
96 x=((x&0x0F)<<6)|(getb()&0x3F);
|
|
97 x=(x<<6)|(getb()&0x3F);
|
|
98 } else if((x&0xF8)==0xF0) {
|
|
99 x=((x&0x07)<<6)|(getb()&0x3F);
|
|
100 x=(x<<6)|(getb()&0x3F);
|
|
101 x=(x<<6)|(getb()&0x3F);
|
|
102 } else if((x&0xFC)==0xF8) {
|
|
103 x=((x&0x03)<<6)|(getb()&0x3F);
|
|
104 x=(x<<6)|(getb()&0x3F);
|
|
105 x=(x<<6)|(getb()&0x3F);
|
|
106 x=(x<<6)|(getb()&0x3F);
|
|
107 } else if((x&0xFE)==0xFC) {
|
|
108 x=((x&0x01)<<6)|(getb()&0x3F);
|
|
109 x=(x<<6)|(getb()&0x3F);
|
|
110 x=(x<<6)|(getb()&0x3F);
|
|
111 x=(x<<6)|(getb()&0x3F);
|
|
112 x=(x<<6)|(getb()&0x3F);
|
|
113 } else if(x==0xFE || x==0xFF) {
|
|
114 x=((x&0x01)<<6)|(getb()&0x3F);
|
|
115 x=(x<<6)|(getb()&0x3F);
|
|
116 x=(x<<6)|(getb()&0x3F);
|
|
117 x=(x<<6)|(getb()&0x3F);
|
|
118 x=(x<<6)|(getb()&0x3F);
|
|
119 x=(x<<6)|(getb()&0x3F);
|
|
120 }
|
|
121 return x;
|
|
122 }
|
|
123
|
|
124 ULL read_vlq8(void) {
|
|
125 byte x=getb();
|
|
126 ULL r=0;
|
|
127 while(x&0x80) {
|
|
128 r=(r<<7)|(x&0x7F);
|
|
129 x=getb();
|
|
130 }
|
|
131 return r|x;
|
|
132 }
|
|
133
|
|
134 ULL read_leb128(void) {
|
|
135 byte x=getb();
|
|
136 int i=0;
|
|
137 ULL r=0;
|
|
138 while(x&0x80) {
|
|
139 r|=(x&0x7F)<<(7*(i++));
|
|
140 x=getb();
|
|
141 }
|
|
142 return r|(x<<(7*i));
|
|
143 }
|
|
144
|
|
145 ULL read_utf16_le(void) {
|
|
146 ULL r=getb()<<16;
|
|
147 r|=getb();
|
|
148 if(r>=0xD800 && r<0xDC00) {
|
|
149 int x=getb()<<16;
|
|
150 x|=getb();
|
|
151 return (((r&0x3FF)<<10)|(x&0x3FF))+0x10000ULL;
|
|
152 } else {
|
|
153 return r;
|
|
154 }
|
|
155 }
|
|
156
|
|
157 ULL read_utf16_be(void) {
|
|
158 ULL r=getb();
|
|
159 r|=getb()<<16;
|
|
160 if(r>=0xD800 && r<0xDC00) {
|
|
161 int x=getb();
|
|
162 x|=getb()<<16;
|
|
163 return (((r&0x3FF)<<10)|(x&0x3FF))+0x10000ULL;
|
|
164 } else {
|
|
165 return r;
|
|
166 }
|
|
167 }
|
|
168
|
|
169 ULL read_translate(void) {
|
|
170 return translation[getb()];
|
|
171 }
|
|
172
|
|
173 ULL read_messagepack(void) {
|
|
174 byte x;
|
|
175 ULL s;
|
|
176 float f;
|
|
177 double d;
|
|
178 for(;;) {
|
|
179 x=getb();
|
|
180 switch(x) {
|
|
181 case 0x00 ... 0x7F: return x;
|
|
182 case 0x80 ... 0x9F: continue;
|
|
183 case 0xA0 ... 0xBF:
|
|
184 while(x-->0xA0) putchar(getb());
|
|
185 continue;
|
|
186 case 0xC0: continue;
|
|
187 case 0xC2: return 0;
|
|
188 case 0xC3: return -1LL;
|
|
189 case 0xCA:
|
|
190 *(short*)&f=read_16bit_be_raw(); // Not completely portable
|
|
191 return (ULL)f;
|
|
192 case 0xCB:
|
|
193 *(int*)&d=read_32bit_be_raw(); // Not completely portable
|
|
194 return (ULL)f;
|
|
195 case 0xCC: return read_8bit_raw();
|
|
196 case 0xCD: return read_16bit_be_raw();
|
|
197 case 0xCE: return read_32bit_be_raw();
|
|
198 case 0xCF: return read_64bit_be_raw();
|
|
199 case 0xD0: return sign_extend(read_8bit_raw(),7);
|
|
200 case 0xD1: return sign_extend(read_16bit_be_raw(),15);
|
|
201 case 0xD2: return sign_extend(read_32bit_be_raw(),31);
|
|
202 case 0xD3: return read_64bit_be_raw();
|
|
203 case 0xDA:
|
|
204 s=read_16bit_be_raw();
|
|
205 while(s--) putchar(getb());
|
|
206 continue;
|
|
207 case 0xDB:
|
|
208 s=read_32bit_be_raw();
|
|
209 while(s--) putchar(getb());
|
|
210 continue;
|
|
211 case 0xDC: read_16bit_be_raw(); continue;
|
|
212 case 0xDD: read_32bit_be_raw(); continue;
|
|
213 case 0xDE: read_16bit_be_raw(); continue;
|
|
214 case 0xDF: read_32bit_be_raw(); continue;
|
|
215 case 0xE0 ... 0xFF: return x|~31LL;
|
|
216 default: exit(1);
|
|
217 }
|
|
218 }
|
|
219 }
|
|
220
|
|
221 ULL read_hex(void) {
|
|
222 char a,b;
|
|
223 do a=getb(); while(a<=' ');
|
|
224 do b=getb(); while(b<=' ');
|
|
225 return (((a&15)+(a>='A'?9:0))<<4)|((b&15)+(b>='A'?9:0));
|
|
226 }
|
|
227
|
|
228 void write_8bit_raw(ULL x) {
|
|
229 putchar(x);
|
|
230 }
|
|
231
|
|
232 void write_16bit_le_raw(ULL x) {
|
|
233 putchar(x&255);
|
|
234 putchar(x>>8);
|
|
235 }
|
|
236
|
|
237 void write_16bit_be_raw(ULL x) {
|
|
238 putchar(x>>8);
|
|
239 putchar(x&255);
|
|
240 }
|
|
241
|
|
242 void write_32bit_le_raw(ULL x) {
|
|
243 putchar(x&255);
|
|
244 putchar(x>>8);
|
|
245 putchar(x>>16);
|
|
246 putchar(x>>24);
|
|
247 }
|
|
248
|
|
249 void write_32bit_be_raw(ULL x) {
|
|
250 putchar(x>>24);
|
|
251 putchar(x>>16);
|
|
252 putchar(x>>8);
|
|
253 putchar(x&255);
|
|
254 }
|
|
255
|
|
256 void write_64bit_le_raw(ULL x) {
|
|
257 putchar(x&255);
|
|
258 putchar(x>>8);
|
|
259 putchar(x>>16);
|
|
260 putchar(x>>24);
|
|
261 putchar(x>>32);
|
|
262 putchar(x>>40);
|
|
263 putchar(x>>48);
|
|
264 putchar(x>>56);
|
|
265 }
|
|
266
|
|
267 void write_64bit_be_raw(ULL x) {
|
|
268 putchar(x>>56);
|
|
269 putchar(x>>48);
|
|
270 putchar(x>>40);
|
|
271 putchar(x>>32);
|
|
272 putchar(x>>24);
|
|
273 putchar(x>>16);
|
|
274 putchar(x>>8);
|
|
275 putchar(x&255);
|
|
276 }
|
|
277
|
|
278 void write_utf8(ULL x) {
|
|
279 if(out_mode=='0' && !x) {
|
|
280 putchar(0xC0);
|
|
281 putchar(0x80);
|
|
282 } else if(x<0x80ULL) {
|
|
283 putchar(x);
|
|
284 } else if(x<0x800ULL) {
|
|
285 putchar(0xC0|(x>>6));
|
|
286 putchar(0x80|(x)&0xBF);
|
|
287 } else if(x<0x10000ULL) {
|
|
288 putchar(0xE0|(x>>12));
|
|
289 putchar(0x80|(x>>6)&0xBF);
|
|
290 putchar(0x80|(x)&0xBF);
|
|
291 } else if(x<0x200000ULL) {
|
|
292 putchar(0xF0|(x>>18));
|
|
293 putchar(0x80|(x>>12)&0xBF);
|
|
294 putchar(0x80|(x>>6)&0xBF);
|
|
295 putchar(0x80|(x)&0xBF);
|
|
296 } else if(x<0x4000000ULL) {
|
|
297 putchar(0xF8|(x>>24));
|
|
298 putchar(0x80|(x>>18)&0xBF);
|
|
299 putchar(0x80|(x>>12)&0xBF);
|
|
300 putchar(0x80|(x>>6)&0xBF);
|
|
301 putchar(0x80|(x)&0xBF);
|
|
302 } else if(x<0x80000000ULL) {
|
|
303 putchar(0xFC|(x>>30));
|
|
304 putchar(0x80|(x>>24)&0xBF);
|
|
305 putchar(0x80|(x>>18)&0xBF);
|
|
306 putchar(0x80|(x>>12)&0xBF);
|
|
307 putchar(0x80|(x>>6)&0xBF);
|
|
308 putchar(0x80|(x)&0xBF);
|
|
309 } else if(x<0x1000000000ULL) {
|
|
310 putchar(0xFE|(x>>36));
|
|
311 putchar(0x80|(x>>30)&0xBF);
|
|
312 putchar(0x80|(x>>24)&0xBF);
|
|
313 putchar(0x80|(x>>18)&0xBF);
|
|
314 putchar(0x80|(x>>12)&0xBF);
|
|
315 putchar(0x80|(x>>6)&0xBF);
|
|
316 putchar(0x80|(x)&0xBF);
|
|
317 } else {
|
|
318 exit(1);
|
|
319 }
|
|
320 }
|
|
321
|
|
322 void write_vlq8(ULL x) {
|
|
323 int i;
|
|
324 for(i=63;i;i-=7) if(x&-(1LL<<i)) putchar(0x80|(x>>i)&0xFF);
|
|
325 putchar(x&0x7F);
|
|
326 }
|
|
327
|
|
328 void write_leb128(ULL x) {
|
|
329 while(x&~0x7FULL) {
|
|
330 putchar(0x80|x&0xFF);
|
|
331 x>>=7;
|
|
332 }
|
|
333 putchar(x);
|
|
334 }
|
|
335
|
|
336 void write_utf16_le(ULL x) {
|
|
337 if(x>0x10FFFFULL) exit(1);
|
|
338 if(x&0x1F0000ULL) {
|
|
339 x-=0x10000ULL;
|
|
340 write_16bit_le_raw((x>>10)|0xD800);
|
|
341 write_16bit_le_raw((x&0x3FF)|0xDC00);
|
|
342 } else {
|
|
343 write_16bit_le_raw(x);
|
|
344 }
|
|
345 }
|
|
346
|
|
347 void write_utf16_be(ULL x) {
|
|
348 if(x>0x10FFFFULL) exit(1);
|
|
349 if(x&0x1F0000ULL) {
|
|
350 x-=0x10000ULL;
|
|
351 write_16bit_be_raw((x>>10)|0xD800);
|
|
352 write_16bit_be_raw((x&0x3FF)|0xDC00);
|
|
353 } else {
|
|
354 write_16bit_be_raw(x);
|
|
355 }
|
|
356 }
|
|
357
|
|
358 void write_translate(ULL x) {
|
|
359 int i;
|
|
360 for(i=0;i<256;i++) if(translation[i]==x) putchar(i);
|
|
361 }
|
|
362
|
|
363 void write_hex(ULL x) {
|
|
364 printf("%02X",(int)x);
|
|
365 }
|
|
366
|
|
367 const in_func_t in_func[128]={
|
|
368 ['8']=read_8bit_raw,
|
|
369 ['w']=read_16bit_le_raw,
|
|
370 ['W']=read_16bit_be_raw,
|
|
371 ['d']=read_32bit_le_raw,
|
|
372 ['D']=read_32bit_be_raw,
|
|
373 ['q']=read_64bit_le_raw,
|
|
374 ['Q']=read_64bit_be_raw,
|
|
375 ['1']=read_utf8,
|
|
376 ['0']=read_utf8,
|
|
377 ['V']=read_vlq8,
|
|
378 ['v']=read_leb128,
|
|
379 ['u']=read_utf16_le,
|
|
380 ['U']=read_utf16_be,
|
|
381 ['T']=read_translate,
|
|
382 ['M']=read_messagepack,
|
|
383 ['4']=read_hex,
|
|
384 [0]=0
|
|
385 };
|
|
386
|
|
387 const out_func_t out_func[128]={
|
|
388 ['8']=write_8bit_raw,
|
|
389 ['w']=write_16bit_le_raw,
|
|
390 ['W']=write_16bit_be_raw,
|
|
391 ['d']=write_32bit_le_raw,
|
|
392 ['D']=write_32bit_be_raw,
|
|
393 ['q']=write_64bit_le_raw,
|
|
394 ['Q']=write_64bit_be_raw,
|
|
395 ['1']=write_utf8,
|
|
396 ['0']=write_utf8,
|
|
397 ['V']=write_vlq8,
|
|
398 ['v']=write_leb128,
|
|
399 ['u']=write_utf16_le,
|
|
400 ['U']=write_utf16_be,
|
|
401 ['T']=write_translate,
|
|
402 ['4']=write_hex,
|
|
403 [0]=0
|
|
404 };
|
|
405
|
|
406 int main(int argc,char**argv) {
|
|
407 int b;
|
|
408 int is_lf=0;
|
|
409 ULL x;
|
|
410 #ifdef _WIN32
|
|
411 _setmode(_fileno(stdin),_O_BINARY);
|
|
412 _setmode(_fileno(stdout),_O_BINARY);
|
|
413 #endif
|
|
414 if(argc<2 || !argv[1][0] || !in_func[argv[1][0]] || !out_func[argv[1][1]]) return 1;
|
|
415 in_mode=argv[1][0];
|
|
416 out_mode=argv[1][1];
|
|
417 for(b=2;argv[1][b];b++) options[argv[1][b]&127]=1;
|
|
418 if(argc>2) {
|
|
419 FILE*fp=fopen(argv[2],"rb");
|
|
420 int i;
|
|
421 if(!fp) return 1;
|
|
422 fseek(fp,0,SEEK_END);
|
|
423 b=ftell(fp)>>8;
|
|
424 rewind(fp);
|
|
425 for(i=0;i<255;i++) {
|
|
426 translation[i]=fgetc(fp);
|
|
427 if(b>1) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<8)):((translation[i]<<8)|fgetc(fp));
|
|
428 if(b>2) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<16)):((translation[i]<<8)|fgetc(fp));
|
|
429 if(b>3) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<24)):((translation[i]<<8)|fgetc(fp));
|
|
430 }
|
|
431 fclose(fp);
|
|
432 }
|
|
433 if(bom_out) out_func[out_mode&127](0xFEFF);
|
|
434 while(!feof(stdin)) {
|
|
435 x=in_func[in_mode&127]();
|
|
436 if(bom_in && x!=0xFEFF) return 1;
|
|
437 if(is_lf && x==10) {
|
|
438 is_lf=0;
|
|
439 continue;
|
|
440 }
|
|
441 if(is_lf=(conv_lf && x==13)) x=10;
|
|
442 if(conv_cr && x==10) out_func[out_mode&127](13);
|
|
443 if(!bom_in) out_func[out_mode&127](x);
|
|
444 bom_in=0;
|
|
445 }
|
|
446 return 0;
|
|
447 }
|
|
448
|