comparison interps/c-intercal/src/clc-cset.c @ 996:859f9b4339e6

<Gregor> tar xf egobot.tar.xz
author HackBot
date Sun, 09 Dec 2012 19:30:08 +0000
parents
children
comparison
equal deleted inserted replaced
995:6883f5911eb7 996:859f9b4339e6
1 /*****************************************************************************
2
3 NAME
4 clc-cset.c -- CLC-INTERCAL character set support for C-INTERCAL
5
6 LICENSE TERMS
7 Copyright (C) 2007 Alex Smith
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22
23 ***************************************************************************/
24
25 /*
26 The input is read from files such as latin1.bin and ebcdic.bin.
27 These contain three lines of text data that specify the length of
28 the character set (number of characters in each shift state), the
29 number of shift states, and the bit order of the input. The bit
30 order can either be 8 characters long (msb down to lsb) or 16 (the
31 first byte in a pair, followed by the second); each bit of the input
32 is transferred to the corresponding bit in a binary table that
33 follows (a for the lsb, b for the second least significant bit, up
34 to l for the 12th least significant bit; at most 12 significant bits
35 are allowed), or x for a don't care on the bit. The table that
36 follows is the relevant character codes in Latin-1 (which is used as
37 the interconversion language); if shift states are used, they're
38 represented by character codes 1, 2, 3, etc.. Invalid characters are
39 represented by character code 0, and changed to nulls on output. The
40 order of the bytes is 'Latin-1 for this set's char 0 in shift state
41 1', 'Latin-1 for this set's char 0 in shift state 2', ..., 'Latin-1
42 for this set's char 1 in shift state 1', and so on.
43
44 Note that as the character set feature is designed to mirror
45 CLC-INTERCAL's, I've sprinkled a bit of idiomatic Perl throughout
46 the code. This is quite rare, though, as it has to be also legal in C.
47 */
48
49 #include <stdio.h>
50 #include <string.h>
51 #include <stdlib.h>
52 #include "uncommon.h"
53 #define NCSETRECENT 8
54
55 /* Sometimes we want to link the character set files to the program
56 * rather than reading them from disk; in this case, these extern
57 * variables will be set non-null by object files invented
58 * specifically for the purpose. */
59 extern /*@null@*/ const char* ick_clc_cset_atari;
60 extern /*@null@*/ const char* ick_clc_cset_baudot;
61 extern /*@null@*/ const char* ick_clc_cset_ebcdic;
62 extern /*@null@*/ const char* ick_clc_cset_latin1;
63
/* Read cursor into whichever hardcoded character set is currently
 * being parsed; null when no hardcoded set has been selected. */
static /*@null@*/ const char* ick_clc_cset_ptr=0;

/* fgetc-shaped reader for hardcoded character sets: hands back the
 * char under ick_clc_cset_ptr and moves the cursor on by one. The
 * FILE* argument exists only so the signature matches fgetc. Because
 * the cursor is a file-level static, this is obviously not
 * thread-safe. */
static int ick_clc_cset_hardcoderead(FILE* ignored)
{
  const char* here=ick_clc_cset_ptr;
  /*@-noeffect@*/
  (void) ignored;
  /*@=noeffect@*/
  ick_clc_cset_ptr=here+1;
  return (int)*here;
}
75
/* One loaded character set. The member order is significant: the
 * cache of recent sets is initialised positionally. */
struct cset
{
  /* Conversion table, up to 12 bits of data+shifts (2^12 entries). */
  unsigned char set[1<<12];
  /* Number of characters in each shift state. */
  unsigned short setlen;
  /* Number of shift states. */
  int shifts;
  /* Name of the set; 8.3 filenames are enforced, so 8 chars + NUL. */
  char setname[9];
  /* Bit-order letters, one per input bit (8 or 16 are used). */
  char bitorder[16];
  /* Bytes per character; set last by the loader, so 0 means the
   * entry does not hold a successfully loaded set. */
  int nbytes;
};
85
86 /* Table of recently used character sets, searched by name and filled
87 * in by ick_clc_cset_convert. The initialiser zeroes every entry; in
88 * particular, this initialises the setnames to the null string, and
 * clears nbytes. Both of these are used to determine whether a cset
 * is valid or not. */
89 /*@-initallelements@*/ /*@-type@*/
90 static struct cset ick_cset_recent[NCSETRECENT]={{{0},0,0,{0},{0},0}};
91 /*@=initallelements@*/ /*@=type@*/
92 static int ick_csetow=0; /* which cset to overwrite next (wraps modulo NCSETRECENT) */
93
94 /* For help finding files */
95 /*@observer@*/ extern char* ick_globalargv0;
96 /*@observer@*/ extern const char* ick_datadir;
97
98 /*@-mustfreefresh@*/
99 /* because Splint doesn't understand how findandfopen works */
100 static void ick_clc_cset_load(/*@unique@*/ struct cset* cs, /*@unique@*/ const char* fname)
101 {
102 FILE* in;
103 char buf[13]; /* enough for an 8.3 filename */
104 int i,j,c;
105 int (*ipf)(FILE*);
106 /* Avoid buffer-overflow attacks. */
107 if(strlen(fname)>8) return;
108 /* If ick_clc_cset_atari is non-null, then don't read from disk. */
109 if(ick_clc_cset_atari)
110 {
111 /* If the character sets have been hardcoded, only accept
112 * hardcoded chararacter sets. */
113 ick_clc_cset_ptr=0;
114 if(!strcmp(fname,"atari")) ick_clc_cset_ptr=ick_clc_cset_atari;
115 if(!strcmp(fname,"baudot")) ick_clc_cset_ptr=ick_clc_cset_baudot;
116 if(!strcmp(fname,"ebcdic")) ick_clc_cset_ptr=ick_clc_cset_ebcdic;
117 if(!strcmp(fname,"latin1")) ick_clc_cset_ptr=ick_clc_cset_latin1;
118 if(!ick_clc_cset_ptr) return; /* not a hardcoded charset */
119 in=(FILE*)0;
120 ipf=ick_clc_cset_hardcoderead;
121 }
122 else
123 {
124 /* We already checked above that this isn't a buffer overflow. */
125 /*@-bufferoverflowhigh@*/
126 sprintf(buf,"%s.bin",fname);
127 /*@=bufferoverflowhigh@*/
128 if(!(in=ick_findandfopen(buf,ick_datadir,"rb",ick_globalargv0))) return;
129 ipf=fgetc;
130 }
131 /* First row: setlen */
132 cs->setlen=0;
133 do
134 {
135 /* The input is definitely in ASCII, even if the C program isn't,
136 which is why numeric codes are used. */
137 /* Here, ipf allows NULL input iff in is actually NULL; this situation
138 is impossible to explain with an annotation, so instead just disable
139 the warning. */
140 /*@-nullpass@*/
141 c=ipf(in);
142 /*@=nullpass@*/
143 if(c==EOF) {if(in) (void)fclose(in); return;}
144 if(c<48||c>57) break;
145 cs->setlen*=10;
146 cs->setlen+=c-48;
147 } while(1);
148 if(c!=10) {if(in) (void)fclose(in); return;}
149 /* Second row: shifts. This can be from 1 to 9. */
150 /*@-nullpass@*/
151 c=ipf(in);
152 /*@=nullpass@*/
153 if(c<49||c>57) {if(in) (void)fclose(in); return;}
154 cs->shifts=c-48;
155 /*@-nullpass@*/
156 if(ipf(in)!=10) {if(in) (void)fclose(in); return;}
157 /*@=nullpass@*/
158 /* Third row: byte order. */
159 i=0;
160 /*@-nullpass@*/
161 while(((c=ipf(in)))>96&&i<16) cs->bitorder[i++]=(char)c;
162 /*@=nullpass@*/
163 /* Sanity check; that it is a whole number of bytes, that the input
164 * format is correct, and that there are at most 4096 bytes of data
165 * total. */
166 if(c!=10||i%8||!i||cs->setlen*cs->shifts>4096) return;
167 /* i/8 is now the number of bytes, but don't set that yet in case
168 * there's an error later. */
169 /* Rest of file: the bytes themselves. */
170 j=0;
171 /*@-nullpass@*/
172 while(j<cs->setlen*cs->shifts)
173 if((cs->set[j++]=(unsigned char)(c=ipf(in))),c==EOF && in != NULL)
174 {if(in) (void)fclose(in); return;}
175 /*@=nullpass@*/
176 if(in) (void) fclose(in);
177 /* Now set the name and number of bytes, indicating a successful
178 * load. */
179 cs->nbytes=i/8;
180 strcpy(cs->setname,fname);
181 }
182 /*@=mustfreefresh@*/
183
184 /* Helper function for fixing bit order in output. */
185 static void ick_bitencout(char** pop, const struct cset* co,
186 unsigned short val, int padstyle)
187 {
188 unsigned short outword=0;
189 int i=co->nbytes*8;
190 /*@-shiftnegative@*/ /* i can't go above it's initial value here */
191 while(i--)
192 if(co->bitorder[i]>'l')
193 {
194 if((padstyle==1&&(i==1||i==9) && !(outword&(1<<(co->nbytes*8-i)))) ||
195 (padstyle==2&&(rand()%2||!outword)))
196 outword |= 1<<(co->nbytes*8-i-1);
197 }
198 /* Copy the appropriate bit from val to outword. */
199 else outword |= (unsigned short)((val>>(co->bitorder[i]-'a'))&1)
200 << (co->nbytes*8-i-1);
201 /*@=shiftnegative@*/
202 if(co->nbytes==2) *(*pop)++=(char)(outword/256);
203 *(*pop)++=(char)(outword%256);
204 }
205
206 /* padstyle is 0 to pad with zeros, 1 to pad to make the output
207 * printable characters, or 2 to pad with garbage, avoiding 0s.
208 * Return value is the number of characters in the output string,
209 * which may contain embedded NULs if the input contained invalid
210 * characters. Returns -1 on error. The caller is responsible for
211 * making sure that out is big enough, but as a check, no more than
212 * outsize-1 characters and a NUL will be written to out. The code is
213 * conservative about this; to be safe, make outsize six times as long
214 * as the in is (including in's terminal NUL), plus 6. */
215 int ick_clc_cset_convert(const char* in, /*@partial@*/ char* out, const char* incset,
216 const char* outcset, int padstyle, size_t outsize,
217 /*@null@*/ FILE* errsto)
218 {
219 int ic=-1, oc=-1;
220 int i;
221 int ssi, sso;
222 unsigned short tus, csi;
223 const char* ip;
224 char* op;
225 struct cset *csri, *csro;
226 int noconvwarn=0;
227 int substwarn=0;
228 /* First, see if we have a recently-used version of incset or outcset. */
229 i=NCSETRECENT;
230 while(i--)
231 {
232 (void)(strcmp(incset,ick_cset_recent[i].setname) || (ic=i));
233 (void)(strcmp(outcset,ick_cset_recent[i].setname) || (oc=i));
234 }
235 /* Find a blank entry to load on top of. */
236 if(ic==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes) ic=i;
237 if(oc==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes&&i!=ic) oc=i;
238 /* Failing that, find any entry to load on top of. */
239 (void)(ic==-1 && (ick_cset_recent[ic=ick_csetow++].nbytes=0));
240 if(ick_csetow==ic) ick_csetow++;
241 ick_csetow%=NCSETRECENT;
242 (void)(oc==-1 && (ick_cset_recent[oc=ick_csetow++].nbytes=0));
243 ick_csetow%=NCSETRECENT;
244 /* If the character set hasn't been loaded, load it now. */
245 ick_cset_recent[ic].nbytes || (ick_clc_cset_load(ick_cset_recent+ic,incset),0);
246 ick_cset_recent[oc].nbytes || (ick_clc_cset_load(ick_cset_recent+oc,outcset),0);
247 csri=ick_cset_recent+ic;
248 csro=ick_cset_recent+oc;
249 /* If a character set failed to load, bail out. */
250 if(!csri->nbytes)
251 {
252 if(errsto) fprintf(errsto,"Error: Nonexistent input character set.\n");
253 return -1;
254 }
255 if(!csro->nbytes)
256 {
257 if(errsto) fprintf(errsto,"Error: Nonexistent output character set.\n");
258 return -1;
259 }
260 /* There is no initial shift state. */
261 ssi=sso=0;
262 csri->shifts==1 && (ssi=1);
263 csro->shifts==1 && (sso=1);
264 ip=in; op=out;
265 while(*ip != '\0' && (size_t)(op-out)<outsize-6)
266 {
267 tus=(unsigned short)(unsigned char)*ip++;
268 if(csri->nbytes==2)
269 {
270 tus*=256;
271 tus+=(unsigned short)(unsigned char)*ip++;
272 }
273 i=csri->nbytes*8;
274 csi=0;
275 while(i--)
276 {
277 if(csri->bitorder[i]>'l') continue;
278 /* Copy the appropriate bit from tus to csi. */
279 /*@-shiftnegative@*/
280 csi |= (unsigned short)((tus>>(csri->nbytes*8-i-1))&1)
281 << (csri->bitorder[i]-'a');
282 /*@=shiftnegative@*/
283 }
284 if(csi>csri->setlen)
285 {
286 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */
287 if(!noconvwarn && errsto != NULL)
288 fprintf(errsto,"Warning: some characters could not be translated,"
289 " they were replaced with NUL.\n");
290 noconvwarn=1;
291 }
292 else
293 {
294 /* The more interesting case. */
295 csi*=csri->shifts;
296 if(!ssi)
297 {
298 /* We're at the start of a shift-stated string, but not
299 actually in any shift state. There is no general solution
300 here, so use one that works for Baudot: starting in each
301 state in turn, choose the option that takes the longest
302 until it ends up not changing shift state, then perform one
303 shift from that option. */
304 int sstesting, ssbestsf, ssrecord, j, k;
305 sstesting=csri->shifts+1; ssbestsf=ssrecord=0;
306 while(--sstesting)
307 {
308 k=sstesting; j=0;
309 while(csri->set[csi+i-1] != (unsigned char)0 &&
310 (int)csri->set[csi+i-1]!=k &&
311 (int)csri->set[csi+i-1]<=csri->shifts)
312 {k=(int)csri->set[csi+i-1]; j++;}
313 if(ssbestsf<j) {ssbestsf=sstesting; ssrecord=j;}
314 }
315 ssi=ssbestsf;
316 }
317 csi+=ssi-1;
318 tus=(unsigned short)csri->set[csi]; /* we now have the Latin-1 conversion! */
319 if(tus>=1&&tus<=(unsigned short)csri->shifts&&csri->shifts>1)
320 {
321 /* That wasn't a character, but a shift command. */
322 ssi=(int)tus;
323 continue;
324 }
325 /* Look for the character in the output's character
326 * set. Preferably we want something in the current shift
327 * state, but failing that, any character will do. */
328 spacenowtab:
329 i=csro->shifts*csro->setlen;
330 csi=10000;
331 while(i--)
332 (void)((unsigned short)csro->set[i]==tus &&
333 (csi==10000 || (int)csi%csro->shifts!=sso-1) &&
334 (csi=(unsigned short)i));
335 if(csi==10000&&tus==9 /* latin-1 tab */)
336 {
337 if(!substwarn && errsto != NULL)
338 fprintf(errsto,"Warning: no tab in output character set,"
339 " space was used instead.\n");
340 substwarn=1;
341 tus=32; /* latin-1 space */
342 goto spacenowtab;
343 }
344 if(csi==10000)
345 {
346 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */
347 if(!noconvwarn && errsto != NULL)
348 fprintf(errsto,"Warning: some characters could not be translated,"
349 " they were replaced with NUL.\n");
350 noconvwarn=1;
351 }
352 else if((int)(csi%csro->shifts)==(int)sso-1)
353 /* in the right shift state already */
354 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle);
355 else
356 {
357 int tempi;
358 /* Generate shift codes. If sso isn't 0, generate from where
359 * we are at the moment; if it is 0, generate worse-case
360 * shifts by assuming we're in a shift state that can't shift
361 * to the state we want directly, if possible. */
362 if(!sso)
363 {
364 int j=csro->shifts+1;
365 while(--j>0)
366 {
367 if(j-1==(int)(csi%csro->shifts)) continue;
368 i=(int)csro->setlen;
369 while(i--)
370 if((int)csro->set[i*csro->shifts+j-1]==csi%csro->shifts+1)
371 {j=-j; break; /* there is one in this set */};
372 j=-j;
373 if(j<0) break;
374 }
375 /* Pick the worst-case if we found one, or otherwise just
376 * any state we aren't in at the moment. */
377 sso=(j<0?-j:(int)(csi%csro->shifts));
378 if(!sso) sso=csro->shifts;
379 }
380 /* Look for the shift code, if there is one. */
381 i=(int)csro->setlen;
382 while(i--)
383 if((int)csro->set[i*csro->shifts+sso-1]==csi%csro->shifts+1) break;
384 tempi=i*csro->shifts+sso-1;
385 if(i==-1)
386 {
387 int intershift=-1;
388 /* That didn't work. Look for the shift code in some shift
389 * state other than the one we're aiming for. */
390 retry:
391 i=csro->setlen*csro->shifts;
392 while(i--)
393 if((int)csro->set[i]==csi%csro->shifts+1&&
394 i%csro->shifts!=(int)(csi%csro->shifts)&&
395 i%csro->shifts+1!=intershift) break;
396 if(i==-1) return -1; /* no way to get into the right state */
397 intershift=i%csro->shifts+1;
398 tempi=i;
399 i=(int)csro->setlen;
400 while(i--)
401 if((int)csro->set[i*csro->shifts+sso-1]==intershift) break;
402 if(i==-1) goto retry; /* try once more */
403 ick_bitencout(&op,csro,(unsigned short)i,padstyle);
404 /* sso=intershift here but we're going to overwrite it
405 * immediately anyway, so no point in the assignment */
406 }
407 ick_bitencout(&op,csro,(unsigned short)(tempi/csro->shifts),padstyle);
408 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle);
409 sso=csi%csro->shifts+1;
410 }
411 }
412 }
413 *op='\0';
414 return op-out;
415 }