Mercurial > repo
comparison interps/c-intercal/src/clc-cset.c @ 996:859f9b4339e6
<Gregor> tar xf egobot.tar.xz
author | HackBot |
---|---|
date | Sun, 09 Dec 2012 19:30:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
995:6883f5911eb7 | 996:859f9b4339e6 |
---|---|
1 /***************************************************************************** | |
2 | |
3 NAME | |
4 clc-cset.c -- CLC-INTERCAL character set support for C-INTERCAL | |
5 | |
6 LICENSE TERMS | |
7 Copyright (C) 2007 Alex Smith | |
8 | |
9 This program is free software; you can redistribute it and/or modify | |
10 it under the terms of the GNU General Public License as published by | |
11 the Free Software Foundation; either version 2 of the License, or | |
12 (at your option) any later version. | |
13 | |
14 This program is distributed in the hope that it will be useful, | |
15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 GNU General Public License for more details. | |
18 | |
19 You should have received a copy of the GNU General Public License | |
20 along with this program; if not, write to the Free Software | |
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
22 | |
23 ***************************************************************************/ | |
24 | |
25 /* | |
26 The input is read from files such as latin1.bin and ebcdic.bin. | |
27 These contain three lines of text data that specify the length of | |
28 the character set (number of characters in each shift state), the | |
29 number of shift states, and the bit order of the input. The bit | |
30 order can either be 8 characters long (msb down to lsb) or 16 (the | |
31 first byte in a pair, followed by the second); each bit of the input | |
32 is transferred to the corresponding bit in a binary table that | |
33 follows (a for the lsb, b for the second least significant bit, up | |
34 to l for the 12th least significant bit; at most 12 significant bits | |
35 are allowed), or x for a don't care on the bit. The table that | |
36 follows is the relevant character codes in Latin-1 (which is used as | |
37 the interconversion language); if shift states are used, they're | |
38 represented by character codes 1, 2, 3, etc. Invalid characters are | |
39 represented by character code 0, and changed to nulls on output. The | |
40 order of the bytes is 'Latin-1 for this set's char 0 in shift state | |
41 1', 'Latin-1 for this set's char 0 in shift state 2', ..., 'Latin-1 | |
42 for this set's char 1 in shift state 1', and so on. | |
43 | |
44 Note that as the character set feature is designed to mirror | |
45 CLC-INTERCAL's, I've sprinkled a bit of idiomatic Perl throughout | |
46 the code. This is quite rare, though, as it has to be also legal in C. | |
47 */ | |
48 | |
49 #include <stdio.h> | |
50 #include <string.h> | |
51 #include <stdlib.h> | |
52 #include "uncommon.h" | |
53 #define NCSETRECENT 8 | |
54 | |
55 /* Sometimes we want to link the character set files to the program | |
56 * rather than reading them from disk; in this case, these extern | |
57 * variables will be set non-null by object files invented | |
58 * specifically for the purpose. */ | |
59 extern /*@null@*/ const char* ick_clc_cset_atari; | |
60 extern /*@null@*/ const char* ick_clc_cset_baudot; | |
61 extern /*@null@*/ const char* ick_clc_cset_ebcdic; | |
62 extern /*@null@*/ const char* ick_clc_cset_latin1; | |
63 | |
/* Read cursor into the linked-in character set currently being parsed;
 * null while no hardcoded set has been selected. */
static /*@null@*/ const char* ick_clc_cset_ptr=0;

/* Fake that we're reading hardcoded characters from a file. This
 * method of doing it is obviously not thread-safe, because the read
 * position lives in the file-scope cursor above.
 *
 * ignored: present only so the signature matches fgetc(); never used.
 *
 * Returns the next byte as a value in 0..255 and advances the cursor,
 * or EOF if no hardcoded set is selected. The byte goes through
 * unsigned char, as fgetc() does, so that bytes of 0x80 and above are
 * not returned as negative numbers (0xFF would otherwise be
 * indistinguishable from EOF where plain char is signed). There is no
 * end-of-data detection: the caller must know how many bytes to read. */
static int ick_clc_cset_hardcoderead(FILE* ignored)
{
  /*@-noeffect@*/
  (void) ignored;
  /*@=noeffect@*/
  if(!ick_clc_cset_ptr) return EOF;
  return (int)(unsigned char)*ick_clc_cset_ptr++;
}
75 | |
/* One character set as loaded from a .bin file or from linked-in data. */
struct cset
{
  /* Latin-1 code for each (character, shift state) pair; allows up to
   * 12 bits of data+shifts. */
  unsigned char set[4096];
  /* Number of characters in each shift state. */
  unsigned short setlen;
  /* Number of shift states. */
  int shifts;
  /* Name the set was loaded under; 8.3 filenames are enforced, so this
   * is at most 8 characters and a NUL. */
  char setname[9];
  /* One letter per input bit; all 16 entries may be in use, so this is
   * not necessarily NUL-terminated. */
  char bitorder[16];
  /* Bytes per encoded character; 0 marks a set that is not loaded. */
  int nbytes;
};
85 | |
86 /* In particular, this initialises the setnames to the null string, | |
87 * and clears nbytes. Both of these are used to determine whether a | |
88 * cset is valid or not. */ | |
89 /*@-initallelements@*/ /*@-type@*/ | |
90 static struct cset ick_cset_recent[NCSETRECENT]={{{0},0,0,{0},{0},0}}; | |
91 /*@=initallelements@*/ /*@=type@*/ | |
92 static int ick_csetow=0; /* which cset to overwrite ick_next */ | |
93 | |
94 /* For help finding files */ | |
95 /*@observer@*/ extern char* ick_globalargv0; | |
96 /*@observer@*/ extern const char* ick_datadir; | |
97 | |
98 /*@-mustfreefresh@*/ | |
99 /* because Splint doesn't understand how findandfopen works */ | |
100 static void ick_clc_cset_load(/*@unique@*/ struct cset* cs, /*@unique@*/ const char* fname) | |
101 { | |
102 FILE* in; | |
103 char buf[13]; /* enough for an 8.3 filename */ | |
104 int i,j,c; | |
105 int (*ipf)(FILE*); | |
106 /* Avoid buffer-overflow attacks. */ | |
107 if(strlen(fname)>8) return; | |
108 /* If ick_clc_cset_atari is non-null, then don't read from disk. */ | |
109 if(ick_clc_cset_atari) | |
110 { | |
111 /* If the character sets have been hardcoded, only accept | |
112 * hardcoded chararacter sets. */ | |
113 ick_clc_cset_ptr=0; | |
114 if(!strcmp(fname,"atari")) ick_clc_cset_ptr=ick_clc_cset_atari; | |
115 if(!strcmp(fname,"baudot")) ick_clc_cset_ptr=ick_clc_cset_baudot; | |
116 if(!strcmp(fname,"ebcdic")) ick_clc_cset_ptr=ick_clc_cset_ebcdic; | |
117 if(!strcmp(fname,"latin1")) ick_clc_cset_ptr=ick_clc_cset_latin1; | |
118 if(!ick_clc_cset_ptr) return; /* not a hardcoded charset */ | |
119 in=(FILE*)0; | |
120 ipf=ick_clc_cset_hardcoderead; | |
121 } | |
122 else | |
123 { | |
124 /* We already checked above that this isn't a buffer overflow. */ | |
125 /*@-bufferoverflowhigh@*/ | |
126 sprintf(buf,"%s.bin",fname); | |
127 /*@=bufferoverflowhigh@*/ | |
128 if(!(in=ick_findandfopen(buf,ick_datadir,"rb",ick_globalargv0))) return; | |
129 ipf=fgetc; | |
130 } | |
131 /* First row: setlen */ | |
132 cs->setlen=0; | |
133 do | |
134 { | |
135 /* The input is definitely in ASCII, even if the C program isn't, | |
136 which is why numeric codes are used. */ | |
137 /* Here, ipf allows NULL input iff in is actually NULL; this situation | |
138 is impossible to explain with an annotation, so instead just disable | |
139 the warning. */ | |
140 /*@-nullpass@*/ | |
141 c=ipf(in); | |
142 /*@=nullpass@*/ | |
143 if(c==EOF) {if(in) (void)fclose(in); return;} | |
144 if(c<48||c>57) break; | |
145 cs->setlen*=10; | |
146 cs->setlen+=c-48; | |
147 } while(1); | |
148 if(c!=10) {if(in) (void)fclose(in); return;} | |
149 /* Second row: shifts. This can be from 1 to 9. */ | |
150 /*@-nullpass@*/ | |
151 c=ipf(in); | |
152 /*@=nullpass@*/ | |
153 if(c<49||c>57) {if(in) (void)fclose(in); return;} | |
154 cs->shifts=c-48; | |
155 /*@-nullpass@*/ | |
156 if(ipf(in)!=10) {if(in) (void)fclose(in); return;} | |
157 /*@=nullpass@*/ | |
158 /* Third row: byte order. */ | |
159 i=0; | |
160 /*@-nullpass@*/ | |
161 while(((c=ipf(in)))>96&&i<16) cs->bitorder[i++]=(char)c; | |
162 /*@=nullpass@*/ | |
163 /* Sanity check; that it is a whole number of bytes, that the input | |
164 * format is correct, and that there are at most 4096 bytes of data | |
165 * total. */ | |
166 if(c!=10||i%8||!i||cs->setlen*cs->shifts>4096) return; | |
167 /* i/8 is now the number of bytes, but don't set that yet in case | |
168 * there's an error later. */ | |
169 /* Rest of file: the bytes themselves. */ | |
170 j=0; | |
171 /*@-nullpass@*/ | |
172 while(j<cs->setlen*cs->shifts) | |
173 if((cs->set[j++]=(unsigned char)(c=ipf(in))),c==EOF && in != NULL) | |
174 {if(in) (void)fclose(in); return;} | |
175 /*@=nullpass@*/ | |
176 if(in) (void) fclose(in); | |
177 /* Now set the name and number of bytes, indicating a successful | |
178 * load. */ | |
179 cs->nbytes=i/8; | |
180 strcpy(cs->setname,fname); | |
181 } | |
182 /*@=mustfreefresh@*/ | |
183 | |
184 /* Helper function for fixing bit order in output. */ | |
185 static void ick_bitencout(char** pop, const struct cset* co, | |
186 unsigned short val, int padstyle) | |
187 { | |
188 unsigned short outword=0; | |
189 int i=co->nbytes*8; | |
190 /*@-shiftnegative@*/ /* i can't go above it's initial value here */ | |
191 while(i--) | |
192 if(co->bitorder[i]>'l') | |
193 { | |
194 if((padstyle==1&&(i==1||i==9) && !(outword&(1<<(co->nbytes*8-i)))) || | |
195 (padstyle==2&&(rand()%2||!outword))) | |
196 outword |= 1<<(co->nbytes*8-i-1); | |
197 } | |
198 /* Copy the appropriate bit from val to outword. */ | |
199 else outword |= (unsigned short)((val>>(co->bitorder[i]-'a'))&1) | |
200 << (co->nbytes*8-i-1); | |
201 /*@=shiftnegative@*/ | |
202 if(co->nbytes==2) *(*pop)++=(char)(outword/256); | |
203 *(*pop)++=(char)(outword%256); | |
204 } | |
205 | |
206 /* padstyle is 0 to pad with zeros, 1 to pad to make the output | |
207 * printable characters, or 2 to pad with garbage, avoiding 0s. | |
208 * Return value is the number of characters in the output string, | |
209 * which may contain embedded NULs if the input contained invalid | |
210 * characters. Returns -1 on error. The caller is responsible for | |
211 * making sure that out is big enough, but as a check, no more than | |
212 * outsize-1 characters and a NUL will be written to out. The code is | |
213 * conservative about this; to be safe, make outsize six times as long | |
214 * as the in is (including in's terminal NUL), plus 6. */ | |
215 int ick_clc_cset_convert(const char* in, /*@partial@*/ char* out, const char* incset, | |
216 const char* outcset, int padstyle, size_t outsize, | |
217 /*@null@*/ FILE* errsto) | |
218 { | |
219 int ic=-1, oc=-1; | |
220 int i; | |
221 int ssi, sso; | |
222 unsigned short tus, csi; | |
223 const char* ip; | |
224 char* op; | |
225 struct cset *csri, *csro; | |
226 int noconvwarn=0; | |
227 int substwarn=0; | |
228 /* First, see if we have a recently-used version of incset or outcset. */ | |
229 i=NCSETRECENT; | |
230 while(i--) | |
231 { | |
232 (void)(strcmp(incset,ick_cset_recent[i].setname) || (ic=i)); | |
233 (void)(strcmp(outcset,ick_cset_recent[i].setname) || (oc=i)); | |
234 } | |
235 /* Find a blank entry to load on top of. */ | |
236 if(ic==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes) ic=i; | |
237 if(oc==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes&&i!=ic) oc=i; | |
238 /* Failing that, find any entry to load on top of. */ | |
239 (void)(ic==-1 && (ick_cset_recent[ic=ick_csetow++].nbytes=0)); | |
240 if(ick_csetow==ic) ick_csetow++; | |
241 ick_csetow%=NCSETRECENT; | |
242 (void)(oc==-1 && (ick_cset_recent[oc=ick_csetow++].nbytes=0)); | |
243 ick_csetow%=NCSETRECENT; | |
244 /* If the character set hasn't been loaded, load it now. */ | |
245 ick_cset_recent[ic].nbytes || (ick_clc_cset_load(ick_cset_recent+ic,incset),0); | |
246 ick_cset_recent[oc].nbytes || (ick_clc_cset_load(ick_cset_recent+oc,outcset),0); | |
247 csri=ick_cset_recent+ic; | |
248 csro=ick_cset_recent+oc; | |
249 /* If a character set failed to load, bail out. */ | |
250 if(!csri->nbytes) | |
251 { | |
252 if(errsto) fprintf(errsto,"Error: Nonexistent input character set.\n"); | |
253 return -1; | |
254 } | |
255 if(!csro->nbytes) | |
256 { | |
257 if(errsto) fprintf(errsto,"Error: Nonexistent output character set.\n"); | |
258 return -1; | |
259 } | |
260 /* There is no initial shift state. */ | |
261 ssi=sso=0; | |
262 csri->shifts==1 && (ssi=1); | |
263 csro->shifts==1 && (sso=1); | |
264 ip=in; op=out; | |
265 while(*ip != '\0' && (size_t)(op-out)<outsize-6) | |
266 { | |
267 tus=(unsigned short)(unsigned char)*ip++; | |
268 if(csri->nbytes==2) | |
269 { | |
270 tus*=256; | |
271 tus+=(unsigned short)(unsigned char)*ip++; | |
272 } | |
273 i=csri->nbytes*8; | |
274 csi=0; | |
275 while(i--) | |
276 { | |
277 if(csri->bitorder[i]>'l') continue; | |
278 /* Copy the appropriate bit from tus to csi. */ | |
279 /*@-shiftnegative@*/ | |
280 csi |= (unsigned short)((tus>>(csri->nbytes*8-i-1))&1) | |
281 << (csri->bitorder[i]-'a'); | |
282 /*@=shiftnegative@*/ | |
283 } | |
284 if(csi>csri->setlen) | |
285 { | |
286 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */ | |
287 if(!noconvwarn && errsto != NULL) | |
288 fprintf(errsto,"Warning: some characters could not be translated," | |
289 " they were replaced with NUL.\n"); | |
290 noconvwarn=1; | |
291 } | |
292 else | |
293 { | |
294 /* The more interesting case. */ | |
295 csi*=csri->shifts; | |
296 if(!ssi) | |
297 { | |
298 /* We're at the start of a shift-stated string, but not | |
299 actually in any shift state. There is no general solution | |
300 here, so use one that works for Baudot: starting in each | |
301 state in turn, choose the option that takes the longest | |
302 until it ends up not changing shift state, then perform one | |
303 shift from that option. */ | |
304 int sstesting, ssbestsf, ssrecord, j, k; | |
305 sstesting=csri->shifts+1; ssbestsf=ssrecord=0; | |
306 while(--sstesting) | |
307 { | |
308 k=sstesting; j=0; | |
309 while(csri->set[csi+i-1] != (unsigned char)0 && | |
310 (int)csri->set[csi+i-1]!=k && | |
311 (int)csri->set[csi+i-1]<=csri->shifts) | |
312 {k=(int)csri->set[csi+i-1]; j++;} | |
313 if(ssbestsf<j) {ssbestsf=sstesting; ssrecord=j;} | |
314 } | |
315 ssi=ssbestsf; | |
316 } | |
317 csi+=ssi-1; | |
318 tus=(unsigned short)csri->set[csi]; /* we now have the Latin-1 conversion! */ | |
319 if(tus>=1&&tus<=(unsigned short)csri->shifts&&csri->shifts>1) | |
320 { | |
321 /* That wasn't a character, but a shift command. */ | |
322 ssi=(int)tus; | |
323 continue; | |
324 } | |
325 /* Look for the character in the output's character | |
326 * set. Preferably we want something in the current shift | |
327 * state, but failing that, any character will do. */ | |
328 spacenowtab: | |
329 i=csro->shifts*csro->setlen; | |
330 csi=10000; | |
331 while(i--) | |
332 (void)((unsigned short)csro->set[i]==tus && | |
333 (csi==10000 || (int)csi%csro->shifts!=sso-1) && | |
334 (csi=(unsigned short)i)); | |
335 if(csi==10000&&tus==9 /* latin-1 tab */) | |
336 { | |
337 if(!substwarn && errsto != NULL) | |
338 fprintf(errsto,"Warning: no tab in output character set," | |
339 " space was used instead.\n"); | |
340 substwarn=1; | |
341 tus=32; /* latin-1 space */ | |
342 goto spacenowtab; | |
343 } | |
344 if(csi==10000) | |
345 { | |
346 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */ | |
347 if(!noconvwarn && errsto != NULL) | |
348 fprintf(errsto,"Warning: some characters could not be translated," | |
349 " they were replaced with NUL.\n"); | |
350 noconvwarn=1; | |
351 } | |
352 else if((int)(csi%csro->shifts)==(int)sso-1) | |
353 /* in the right shift state already */ | |
354 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle); | |
355 else | |
356 { | |
357 int tempi; | |
358 /* Generate shift codes. If sso isn't 0, generate from where | |
359 * we are at the moment; if it is 0, generate worse-case | |
360 * shifts by assuming we're in a shift state that can't shift | |
361 * to the state we want directly, if possible. */ | |
362 if(!sso) | |
363 { | |
364 int j=csro->shifts+1; | |
365 while(--j>0) | |
366 { | |
367 if(j-1==(int)(csi%csro->shifts)) continue; | |
368 i=(int)csro->setlen; | |
369 while(i--) | |
370 if((int)csro->set[i*csro->shifts+j-1]==csi%csro->shifts+1) | |
371 {j=-j; break; /* there is one in this set */}; | |
372 j=-j; | |
373 if(j<0) break; | |
374 } | |
375 /* Pick the worst-case if we found one, or otherwise just | |
376 * any state we aren't in at the moment. */ | |
377 sso=(j<0?-j:(int)(csi%csro->shifts)); | |
378 if(!sso) sso=csro->shifts; | |
379 } | |
380 /* Look for the shift code, if there is one. */ | |
381 i=(int)csro->setlen; | |
382 while(i--) | |
383 if((int)csro->set[i*csro->shifts+sso-1]==csi%csro->shifts+1) break; | |
384 tempi=i*csro->shifts+sso-1; | |
385 if(i==-1) | |
386 { | |
387 int intershift=-1; | |
388 /* That didn't work. Look for the shift code in some shift | |
389 * state other than the one we're aiming for. */ | |
390 retry: | |
391 i=csro->setlen*csro->shifts; | |
392 while(i--) | |
393 if((int)csro->set[i]==csi%csro->shifts+1&& | |
394 i%csro->shifts!=(int)(csi%csro->shifts)&& | |
395 i%csro->shifts+1!=intershift) break; | |
396 if(i==-1) return -1; /* no way to get into the right state */ | |
397 intershift=i%csro->shifts+1; | |
398 tempi=i; | |
399 i=(int)csro->setlen; | |
400 while(i--) | |
401 if((int)csro->set[i*csro->shifts+sso-1]==intershift) break; | |
402 if(i==-1) goto retry; /* try once more */ | |
403 ick_bitencout(&op,csro,(unsigned short)i,padstyle); | |
404 /* sso=intershift here but we're going to overwrite it | |
405 * immediately anyway, so no point in the assignment */ | |
406 } | |
407 ick_bitencout(&op,csro,(unsigned short)(tempi/csro->shifts),padstyle); | |
408 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle); | |
409 sso=csi%csro->shifts+1; | |
410 } | |
411 } | |
412 } | |
413 *op='\0'; | |
414 return op-out; | |
415 } |