996
|
1 /*****************************************************************************
|
|
2
|
|
3 NAME
|
|
4 clc-cset.c -- CLC-INTERCAL character set support for C-INTERCAL
|
|
5
|
|
6 LICENSE TERMS
|
|
7 Copyright (C) 2007 Alex Smith
|
|
8
|
|
9 This program is free software; you can redistribute it and/or modify
|
|
10 it under the terms of the GNU General Public License as published by
|
|
11 the Free Software Foundation; either version 2 of the License, or
|
|
12 (at your option) any later version.
|
|
13
|
|
14 This program is distributed in the hope that it will be useful,
|
|
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17 GNU General Public License for more details.
|
|
18
|
|
19 You should have received a copy of the GNU General Public License
|
|
20 along with this program; if not, write to the Free Software
|
|
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
22
|
|
23 ***************************************************************************/
|
|
24
|
|
25 /*
|
|
26 The input is read from files such as latin1.bin and ebcdic.bin.
|
|
27 These contain three lines of text data that specify the length of
|
|
28 the character set (number of characters in each shift state), the
|
|
29 number of shift states, and the bit order of the input. The bit
|
|
30 order can either be 8 characters long (msb down to lsb) or 16 (the
|
|
31 first byte in a pair, followed by the second); each bit of the input
|
|
32 is transferred to the corresponding bit in a binary table that
|
|
33 follows (a for the lsb, b for the second least significant bit, up
|
|
34 to l for the 12th least significant bit; at most 12 significant bits
|
|
35 are allowed), or x for a don't care on the bit. The table that
|
|
36 follows is the relevant character codes in Latin-1 (which is used as
|
|
37 the interconversion language); if shift states are used, they're
|
|
38 represented by character codes 1, 2, 3, etc. Invalid characters are
|
|
39 represented by character code 0, and changed to nulls on output. The
|
|
40 order of the bytes is 'Latin-1 for this set's char 0 in shift state
|
|
41 1', 'Latin-1 for this set's char 0 in shift state 2', ..., 'Latin-1
|
|
42 for this set's char 1 in shift state 1', and so on.
|
|
43
|
|
44 Note that as the character set feature is designed to mirror
|
|
45 CLC-INTERCAL's, I've sprinkled a bit of idiomatic Perl throughout
|
|
46 the code. This is quite rare, though, as it has to be also legal in C.
|
|
47 */
|
|
48
|
|
49 #include <stdio.h>
|
|
50 #include <string.h>
|
|
51 #include <stdlib.h>
|
|
52 #include "uncommon.h"
|
|
53 #define NCSETRECENT 8
|
|
54
|
|
/* Sometimes we want to link the character set files to the program
 * rather than reading them from disk; in this case, these extern
 * variables will be set non-null by object files invented
 * specifically for the purpose. Each one, when non-null, points at
 * the same bytes that the corresponding .bin file would contain. */
extern /*@null@*/ const char* ick_clc_cset_atari;
extern /*@null@*/ const char* ick_clc_cset_baudot;
extern /*@null@*/ const char* ick_clc_cset_ebcdic;
extern /*@null@*/ const char* ick_clc_cset_latin1;
|
|
63
|
|
/* Read cursor into the hardcoded character set currently being parsed;
 * null when no hardcoded set has been selected. */
static /*@null@*/ const char* ick_clc_cset_ptr=0;

/* Fake that we're reading hardcoded characters from a file. This
 * method of doing it is obviously not thread-safe.
 *
 * The FILE* argument exists only so that this function has the same
 * signature as fgetc and can be called through the same function
 * pointer; it is never used. Returns the byte under ick_clc_cset_ptr
 * and advances the cursor. Like fgetc, the byte is returned as an
 * unsigned char converted to int, so that bytes >= 0x80 do not come
 * back negative (and 0xFF cannot be confused with EOF) on platforms
 * where plain char is signed. There is no end-of-data detection; the
 * caller must know how many bytes to consume, and ick_clc_cset_ptr
 * must be non-null. */
static int ick_clc_cset_hardcoderead(FILE* ignored)
{
  /*@-noeffect@*/
  (void) ignored;
  /*@=noeffect@*/
  return (int)(unsigned char)*ick_clc_cset_ptr++;
}
|
|
75
|
|
/* One loaded character set. The field order matters: the cache of
 * recently used sets is initialised positionally. */
struct cset
{
  /* Latin-1 (or shift-command) value for every character in every
   * shift state, laid out as set[character*shifts + (shiftstate-1)]. */
  unsigned char set[4096]; /* allow up to 12 bits of data+shifts */
  unsigned short setlen;   /* number of characters per shift state */
  int shifts;              /* number of shift states, 1 to 9 */
  char setname[9];         /* 8.3 filenames are enforced! */
  /* One letter per input bit, msb first: 'a'..'l' name the bit of the
   * character index that the input bit feeds; anything above 'l'
   * (normally 'x') marks a don't-care bit. */
  char bitorder[16];
  int nbytes;              /* bytes per encoded character; 0 = not loaded */
};
|
|
85
|
|
86 /* In particular, this initialises the setnames to the null string,
|
|
87 * and clears nbytes. Both of these are used to determine whether a
|
|
88 * cset is valid or not. */
|
|
89 /*@-initallelements@*/ /*@-type@*/
|
|
90 static struct cset ick_cset_recent[NCSETRECENT]={{{0},0,0,{0},{0},0}};
|
|
91 /*@=initallelements@*/ /*@=type@*/
|
|
92 static int ick_csetow=0; /* which cset to overwrite ick_next */
|
|
93
|
|
/* For help finding files: both are handed to ick_findandfopen when a
 * character set has to be read from disk. */
/*@observer@*/ extern char* ick_globalargv0;
/*@observer@*/ extern const char* ick_datadir;
|
|
97
|
|
98 /*@-mustfreefresh@*/
|
|
99 /* because Splint doesn't understand how findandfopen works */
|
|
100 static void ick_clc_cset_load(/*@unique@*/ struct cset* cs, /*@unique@*/ const char* fname)
|
|
101 {
|
|
102 FILE* in;
|
|
103 char buf[13]; /* enough for an 8.3 filename */
|
|
104 int i,j,c;
|
|
105 int (*ipf)(FILE*);
|
|
106 /* Avoid buffer-overflow attacks. */
|
|
107 if(strlen(fname)>8) return;
|
|
108 /* If ick_clc_cset_atari is non-null, then don't read from disk. */
|
|
109 if(ick_clc_cset_atari)
|
|
110 {
|
|
111 /* If the character sets have been hardcoded, only accept
|
|
112 * hardcoded chararacter sets. */
|
|
113 ick_clc_cset_ptr=0;
|
|
114 if(!strcmp(fname,"atari")) ick_clc_cset_ptr=ick_clc_cset_atari;
|
|
115 if(!strcmp(fname,"baudot")) ick_clc_cset_ptr=ick_clc_cset_baudot;
|
|
116 if(!strcmp(fname,"ebcdic")) ick_clc_cset_ptr=ick_clc_cset_ebcdic;
|
|
117 if(!strcmp(fname,"latin1")) ick_clc_cset_ptr=ick_clc_cset_latin1;
|
|
118 if(!ick_clc_cset_ptr) return; /* not a hardcoded charset */
|
|
119 in=(FILE*)0;
|
|
120 ipf=ick_clc_cset_hardcoderead;
|
|
121 }
|
|
122 else
|
|
123 {
|
|
124 /* We already checked above that this isn't a buffer overflow. */
|
|
125 /*@-bufferoverflowhigh@*/
|
|
126 sprintf(buf,"%s.bin",fname);
|
|
127 /*@=bufferoverflowhigh@*/
|
|
128 if(!(in=ick_findandfopen(buf,ick_datadir,"rb",ick_globalargv0))) return;
|
|
129 ipf=fgetc;
|
|
130 }
|
|
131 /* First row: setlen */
|
|
132 cs->setlen=0;
|
|
133 do
|
|
134 {
|
|
135 /* The input is definitely in ASCII, even if the C program isn't,
|
|
136 which is why numeric codes are used. */
|
|
137 /* Here, ipf allows NULL input iff in is actually NULL; this situation
|
|
138 is impossible to explain with an annotation, so instead just disable
|
|
139 the warning. */
|
|
140 /*@-nullpass@*/
|
|
141 c=ipf(in);
|
|
142 /*@=nullpass@*/
|
|
143 if(c==EOF) {if(in) (void)fclose(in); return;}
|
|
144 if(c<48||c>57) break;
|
|
145 cs->setlen*=10;
|
|
146 cs->setlen+=c-48;
|
|
147 } while(1);
|
|
148 if(c!=10) {if(in) (void)fclose(in); return;}
|
|
149 /* Second row: shifts. This can be from 1 to 9. */
|
|
150 /*@-nullpass@*/
|
|
151 c=ipf(in);
|
|
152 /*@=nullpass@*/
|
|
153 if(c<49||c>57) {if(in) (void)fclose(in); return;}
|
|
154 cs->shifts=c-48;
|
|
155 /*@-nullpass@*/
|
|
156 if(ipf(in)!=10) {if(in) (void)fclose(in); return;}
|
|
157 /*@=nullpass@*/
|
|
158 /* Third row: byte order. */
|
|
159 i=0;
|
|
160 /*@-nullpass@*/
|
|
161 while(((c=ipf(in)))>96&&i<16) cs->bitorder[i++]=(char)c;
|
|
162 /*@=nullpass@*/
|
|
163 /* Sanity check; that it is a whole number of bytes, that the input
|
|
164 * format is correct, and that there are at most 4096 bytes of data
|
|
165 * total. */
|
|
166 if(c!=10||i%8||!i||cs->setlen*cs->shifts>4096) return;
|
|
167 /* i/8 is now the number of bytes, but don't set that yet in case
|
|
168 * there's an error later. */
|
|
169 /* Rest of file: the bytes themselves. */
|
|
170 j=0;
|
|
171 /*@-nullpass@*/
|
|
172 while(j<cs->setlen*cs->shifts)
|
|
173 if((cs->set[j++]=(unsigned char)(c=ipf(in))),c==EOF && in != NULL)
|
|
174 {if(in) (void)fclose(in); return;}
|
|
175 /*@=nullpass@*/
|
|
176 if(in) (void) fclose(in);
|
|
177 /* Now set the name and number of bytes, indicating a successful
|
|
178 * load. */
|
|
179 cs->nbytes=i/8;
|
|
180 strcpy(cs->setname,fname);
|
|
181 }
|
|
182 /*@=mustfreefresh@*/
|
|
183
|
|
184 /* Helper function for fixing bit order in output. */
|
|
185 static void ick_bitencout(char** pop, const struct cset* co,
|
|
186 unsigned short val, int padstyle)
|
|
187 {
|
|
188 unsigned short outword=0;
|
|
189 int i=co->nbytes*8;
|
|
190 /*@-shiftnegative@*/ /* i can't go above it's initial value here */
|
|
191 while(i--)
|
|
192 if(co->bitorder[i]>'l')
|
|
193 {
|
|
194 if((padstyle==1&&(i==1||i==9) && !(outword&(1<<(co->nbytes*8-i)))) ||
|
|
195 (padstyle==2&&(rand()%2||!outword)))
|
|
196 outword |= 1<<(co->nbytes*8-i-1);
|
|
197 }
|
|
198 /* Copy the appropriate bit from val to outword. */
|
|
199 else outword |= (unsigned short)((val>>(co->bitorder[i]-'a'))&1)
|
|
200 << (co->nbytes*8-i-1);
|
|
201 /*@=shiftnegative@*/
|
|
202 if(co->nbytes==2) *(*pop)++=(char)(outword/256);
|
|
203 *(*pop)++=(char)(outword%256);
|
|
204 }
|
|
205
|
|
206 /* padstyle is 0 to pad with zeros, 1 to pad to make the output
|
|
207 * printable characters, or 2 to pad with garbage, avoiding 0s.
|
|
208 * Return value is the number of characters in the output string,
|
|
209 * which may contain embedded NULs if the input contained invalid
|
|
210 * characters. Returns -1 on error. The caller is responsible for
|
|
211 * making sure that out is big enough, but as a check, no more than
|
|
212 * outsize-1 characters and a NUL will be written to out. The code is
|
|
213 * conservative about this; to be safe, make outsize six times as long
|
|
214 * as the in is (including in's terminal NUL), plus 6. */
|
|
215 int ick_clc_cset_convert(const char* in, /*@partial@*/ char* out, const char* incset,
|
|
216 const char* outcset, int padstyle, size_t outsize,
|
|
217 /*@null@*/ FILE* errsto)
|
|
218 {
|
|
219 int ic=-1, oc=-1;
|
|
220 int i;
|
|
221 int ssi, sso;
|
|
222 unsigned short tus, csi;
|
|
223 const char* ip;
|
|
224 char* op;
|
|
225 struct cset *csri, *csro;
|
|
226 int noconvwarn=0;
|
|
227 int substwarn=0;
|
|
228 /* First, see if we have a recently-used version of incset or outcset. */
|
|
229 i=NCSETRECENT;
|
|
230 while(i--)
|
|
231 {
|
|
232 (void)(strcmp(incset,ick_cset_recent[i].setname) || (ic=i));
|
|
233 (void)(strcmp(outcset,ick_cset_recent[i].setname) || (oc=i));
|
|
234 }
|
|
235 /* Find a blank entry to load on top of. */
|
|
236 if(ic==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes) ic=i;
|
|
237 if(oc==-1) for(i=NCSETRECENT;i--;) if(!ick_cset_recent[i].nbytes&&i!=ic) oc=i;
|
|
238 /* Failing that, find any entry to load on top of. */
|
|
239 (void)(ic==-1 && (ick_cset_recent[ic=ick_csetow++].nbytes=0));
|
|
240 if(ick_csetow==ic) ick_csetow++;
|
|
241 ick_csetow%=NCSETRECENT;
|
|
242 (void)(oc==-1 && (ick_cset_recent[oc=ick_csetow++].nbytes=0));
|
|
243 ick_csetow%=NCSETRECENT;
|
|
244 /* If the character set hasn't been loaded, load it now. */
|
|
245 ick_cset_recent[ic].nbytes || (ick_clc_cset_load(ick_cset_recent+ic,incset),0);
|
|
246 ick_cset_recent[oc].nbytes || (ick_clc_cset_load(ick_cset_recent+oc,outcset),0);
|
|
247 csri=ick_cset_recent+ic;
|
|
248 csro=ick_cset_recent+oc;
|
|
249 /* If a character set failed to load, bail out. */
|
|
250 if(!csri->nbytes)
|
|
251 {
|
|
252 if(errsto) fprintf(errsto,"Error: Nonexistent input character set.\n");
|
|
253 return -1;
|
|
254 }
|
|
255 if(!csro->nbytes)
|
|
256 {
|
|
257 if(errsto) fprintf(errsto,"Error: Nonexistent output character set.\n");
|
|
258 return -1;
|
|
259 }
|
|
260 /* There is no initial shift state. */
|
|
261 ssi=sso=0;
|
|
262 csri->shifts==1 && (ssi=1);
|
|
263 csro->shifts==1 && (sso=1);
|
|
264 ip=in; op=out;
|
|
265 while(*ip != '\0' && (size_t)(op-out)<outsize-6)
|
|
266 {
|
|
267 tus=(unsigned short)(unsigned char)*ip++;
|
|
268 if(csri->nbytes==2)
|
|
269 {
|
|
270 tus*=256;
|
|
271 tus+=(unsigned short)(unsigned char)*ip++;
|
|
272 }
|
|
273 i=csri->nbytes*8;
|
|
274 csi=0;
|
|
275 while(i--)
|
|
276 {
|
|
277 if(csri->bitorder[i]>'l') continue;
|
|
278 /* Copy the appropriate bit from tus to csi. */
|
|
279 /*@-shiftnegative@*/
|
|
280 csi |= (unsigned short)((tus>>(csri->nbytes*8-i-1))&1)
|
|
281 << (csri->bitorder[i]-'a');
|
|
282 /*@=shiftnegative@*/
|
|
283 }
|
|
284 if(csi>csri->setlen)
|
|
285 {
|
|
286 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */
|
|
287 if(!noconvwarn && errsto != NULL)
|
|
288 fprintf(errsto,"Warning: some characters could not be translated,"
|
|
289 " they were replaced with NUL.\n");
|
|
290 noconvwarn=1;
|
|
291 }
|
|
292 else
|
|
293 {
|
|
294 /* The more interesting case. */
|
|
295 csi*=csri->shifts;
|
|
296 if(!ssi)
|
|
297 {
|
|
298 /* We're at the start of a shift-stated string, but not
|
|
299 actually in any shift state. There is no general solution
|
|
300 here, so use one that works for Baudot: starting in each
|
|
301 state in turn, choose the option that takes the longest
|
|
302 until it ends up not changing shift state, then perform one
|
|
303 shift from that option. */
|
|
304 int sstesting, ssbestsf, ssrecord, j, k;
|
|
305 sstesting=csri->shifts+1; ssbestsf=ssrecord=0;
|
|
306 while(--sstesting)
|
|
307 {
|
|
308 k=sstesting; j=0;
|
|
309 while(csri->set[csi+i-1] != (unsigned char)0 &&
|
|
310 (int)csri->set[csi+i-1]!=k &&
|
|
311 (int)csri->set[csi+i-1]<=csri->shifts)
|
|
312 {k=(int)csri->set[csi+i-1]; j++;}
|
|
313 if(ssbestsf<j) {ssbestsf=sstesting; ssrecord=j;}
|
|
314 }
|
|
315 ssi=ssbestsf;
|
|
316 }
|
|
317 csi+=ssi-1;
|
|
318 tus=(unsigned short)csri->set[csi]; /* we now have the Latin-1 conversion! */
|
|
319 if(tus>=1&&tus<=(unsigned short)csri->shifts&&csri->shifts>1)
|
|
320 {
|
|
321 /* That wasn't a character, but a shift command. */
|
|
322 ssi=(int)tus;
|
|
323 continue;
|
|
324 }
|
|
325 /* Look for the character in the output's character
|
|
326 * set. Preferably we want something in the current shift
|
|
327 * state, but failing that, any character will do. */
|
|
328 spacenowtab:
|
|
329 i=csro->shifts*csro->setlen;
|
|
330 csi=10000;
|
|
331 while(i--)
|
|
332 (void)((unsigned short)csro->set[i]==tus &&
|
|
333 (csi==10000 || (int)csi%csro->shifts!=sso-1) &&
|
|
334 (csi=(unsigned short)i));
|
|
335 if(csi==10000&&tus==9 /* latin-1 tab */)
|
|
336 {
|
|
337 if(!substwarn && errsto != NULL)
|
|
338 fprintf(errsto,"Warning: no tab in output character set,"
|
|
339 " space was used instead.\n");
|
|
340 substwarn=1;
|
|
341 tus=32; /* latin-1 space */
|
|
342 goto spacenowtab;
|
|
343 }
|
|
344 if(csi==10000)
|
|
345 {
|
|
346 ick_bitencout(&op,csro,0,padstyle); /* not in the charset */
|
|
347 if(!noconvwarn && errsto != NULL)
|
|
348 fprintf(errsto,"Warning: some characters could not be translated,"
|
|
349 " they were replaced with NUL.\n");
|
|
350 noconvwarn=1;
|
|
351 }
|
|
352 else if((int)(csi%csro->shifts)==(int)sso-1)
|
|
353 /* in the right shift state already */
|
|
354 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle);
|
|
355 else
|
|
356 {
|
|
357 int tempi;
|
|
358 /* Generate shift codes. If sso isn't 0, generate from where
|
|
359 * we are at the moment; if it is 0, generate worse-case
|
|
360 * shifts by assuming we're in a shift state that can't shift
|
|
361 * to the state we want directly, if possible. */
|
|
362 if(!sso)
|
|
363 {
|
|
364 int j=csro->shifts+1;
|
|
365 while(--j>0)
|
|
366 {
|
|
367 if(j-1==(int)(csi%csro->shifts)) continue;
|
|
368 i=(int)csro->setlen;
|
|
369 while(i--)
|
|
370 if((int)csro->set[i*csro->shifts+j-1]==csi%csro->shifts+1)
|
|
371 {j=-j; break; /* there is one in this set */};
|
|
372 j=-j;
|
|
373 if(j<0) break;
|
|
374 }
|
|
375 /* Pick the worst-case if we found one, or otherwise just
|
|
376 * any state we aren't in at the moment. */
|
|
377 sso=(j<0?-j:(int)(csi%csro->shifts));
|
|
378 if(!sso) sso=csro->shifts;
|
|
379 }
|
|
380 /* Look for the shift code, if there is one. */
|
|
381 i=(int)csro->setlen;
|
|
382 while(i--)
|
|
383 if((int)csro->set[i*csro->shifts+sso-1]==csi%csro->shifts+1) break;
|
|
384 tempi=i*csro->shifts+sso-1;
|
|
385 if(i==-1)
|
|
386 {
|
|
387 int intershift=-1;
|
|
388 /* That didn't work. Look for the shift code in some shift
|
|
389 * state other than the one we're aiming for. */
|
|
390 retry:
|
|
391 i=csro->setlen*csro->shifts;
|
|
392 while(i--)
|
|
393 if((int)csro->set[i]==csi%csro->shifts+1&&
|
|
394 i%csro->shifts!=(int)(csi%csro->shifts)&&
|
|
395 i%csro->shifts+1!=intershift) break;
|
|
396 if(i==-1) return -1; /* no way to get into the right state */
|
|
397 intershift=i%csro->shifts+1;
|
|
398 tempi=i;
|
|
399 i=(int)csro->setlen;
|
|
400 while(i--)
|
|
401 if((int)csro->set[i*csro->shifts+sso-1]==intershift) break;
|
|
402 if(i==-1) goto retry; /* try once more */
|
|
403 ick_bitencout(&op,csro,(unsigned short)i,padstyle);
|
|
404 /* sso=intershift here but we're going to overwrite it
|
|
405 * immediately anyway, so no point in the assignment */
|
|
406 }
|
|
407 ick_bitencout(&op,csro,(unsigned short)(tempi/csro->shifts),padstyle);
|
|
408 ick_bitencout(&op,csro,(unsigned short)(csi/csro->shifts),padstyle);
|
|
409 sso=csi%csro->shifts+1;
|
|
410 }
|
|
411 }
|
|
412 }
|
|
413 *op='\0';
|
|
414 return op-out;
|
|
415 }
|