Mercurial > repo
view interps/c-intercal/src/lexer.l @ 12518:2d8fe55c6e65 draft default tip
<int-e> learn The password of the month is release incident pilot.
author | HackEso <hackeso@esolangs.org> |
---|---|
date | Sun, 03 Nov 2024 00:31:02 +0000 |
parents | 859f9b4339e6 |
children |
line wrap: on
line source
%e 2000 %p 4000 %n 1000 %{ /* the directives above are for Solaris lex, and will be ignored by * flex */ /* * NAME * lexer.l -- source for the C-INTERCAL lexical analyzer. * LICENSE TERMS Copyright (C) 1996 Eric S. Raymond This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "config.h" #include <stdio.h> #include <stdlib.h> #include <ctype.h> #include <string.h> #include "ick.h" #include "parser.h" #include "ick_lose.h" /*#undef wchar_t*/ /*#define wchar_t unsigned char*/ #ifndef yywrap static int yywrap(void) { return 1; } #endif /* yywrap */ int iyylineno = 1; #ifdef MAIN YYSTYPE yylval; #endif /* MAIN */ char **textlines = NULL; int textlinecount = 0; int politesse = 0; int stbeginline = 0; /* AIS: Sort out a grammar near-ambiguity */ unsigned long sparkearsstack[SENESTMAX] = {0}; int sparkearslev = 0; /* AIS: Some symbols are ambiguous between C-INTERCAL and CLC-INTERCAL: Symbol C-INTERCAL CLC-INTERCAL NOSPOT _ @ WHIRL @ ? XOR ? yen, or bookworm (bookworm's also C-INTERCAL legal) By default, the C-INTERCAL meanings are used; the extern variable clclex causes CLC-INTERCAL interpretations to be put on the ambiguous symbols. Otherwise, mixing syntaxes freely is allowed. */ extern int clclex; #ifdef FLEX_SCANNER static char linebuf[YY_BUF_SIZE]; #else /* FLEX_SCANNER */ static char linebuf[YYLMAX]; #endif /* FLEX_SCANNER */ static char *lineptr = linebuf; ick_bool re_send_token = ick_FALSE; int lexer(void); static int myatoi(const char *text); void yyerror(const char *errtype); #define SETLINENO \ {if (stbeginline == 0) stbeginline = iyylineno;\ else if (stbeginline < 0) stbeginline = 0;} /* AIS: Keep track of the spark/ears situation */ #define STACKSPARKEARS(a) \ if (sparkearslev+1>=SENESTMAX*32) ick_lose(IE281, iyylineno, (char*) NULL); \ sparkearslev++; sparkearsstack[sparkearslev/32]<<=1; \ sparkearsstack[sparkearslev/32]+=a #define CLEARSPARKEARSTACK {int i=SENESTMAX; \ while(i--) sparkearsstack[i] = 0;} \ sparkearslev = 0 /* * The spectacular ugliness of INTERCAL syntax requires that the lexical * analyzer have two levels. One, embedded in the getc() function, handles * logical-line continuation and the ! abbrev, and stashes each logical * line away in a buffer accessible to the code generator (this is necessary * for the * construct to be interpreted correctly). The upper level is * generated by lex(1) and does normal tokenizing. */ #undef getc int getc(FILE *fp) { extern FILE* yyin; static ick_bool bangflag = ick_FALSE; static ick_bool backflag = ick_FALSE; static ick_bool eolflag = ick_FALSE; if ((size_t)(lineptr - linebuf) > sizeof linebuf) ick_lose(IE666, iyylineno, (char *)NULL); if (bangflag) { bangflag = ick_FALSE; /* *lineptr++ = '!'; */ return('.'); } else if (backflag) /* converting ctrl-H (backspace) to two chars "^H" */ { backflag = ick_FALSE; /* *lineptr++ = '\b'; */ return('H'); } else { int c; char c_char; /*fprintf(stderr,"about to fgetc(\045p)",(void*)fp);*/ c_char=0; /* AIS */ (void)fread(&c_char,1,1,fp); /* AIS: ignore the first \r in a row to deal with DOS newlines. The second in a row is definitely an error, though, and will be caught later on. */ if(c_char=='\r') (void)fread(&c_char,1,1,fp); c = c_char; if(feof(fp)) c=EOF; if(!eolflag && c == EOF) c = '\n'; /*fprintf(stderr,"getc input a character: %c\n",c);*/ if (feof(yyin)) { *lineptr = '\0'; if(eolflag) return(EOF); if(c=='\0' || c==EOF) c='\n'; } eolflag = ick_FALSE; if (c == '!') { *lineptr++ = '!'; bangflag = ick_TRUE; return(c = '\''); } else if (c == '\b') /* convert ctrl-H (backspace) to two chars "^" and "H" so lex can take it */ { *lineptr++ = '\b'; backflag = ick_TRUE; return(c = '^'); } else if (c == '\n') { *lineptr = '\0'; lineptr = linebuf; if (iyylineno >= textlinecount) { textlinecount += ALLOC_CHUNK; if (textlines) textlines = realloc(textlines, textlinecount * sizeof(char*)); else textlines = malloc(textlinecount * sizeof(char*)); if (!textlines) ick_lose(IE666, iyylineno, (char *)NULL); } textlines[iyylineno] = malloc(1 + strlen(linebuf)); if (!textlines[iyylineno]) ick_lose(IE666, iyylineno, (char *)NULL); strcpy(textlines[iyylineno], linebuf); iyylineno++; eolflag=ick_TRUE; return('\n'); } else { return(*lineptr++ = c); } } } /* replace YY_INPUT so that it uses our getc function. */ #undef YY_INPUT #define YY_INPUT(buf,result,max_size) \ { \ int c = getc(yyin); \ if (c == EOF) { \ if (ferror(yyin)) \ YY_FATAL_ERROR("input in flex scanner failed"); \ result = YY_NULL; \ } else { \ buf[0] = c; \ result = 1; \ } \ } %} W [\ \t\n]* D [0-9][\ \t\n0-9]* I [A-Z] %% {D} {yylval.numval = myatoi(yytext); return(NUMBER);} \_ {return(NOSPOT);} \. {return(ick_ONESPOT);} \: {return(ick_TWOSPOT);} \, {return(ick_TAIL);} \; {return(ick_HYBRID);} \# {return(MESH);} \xBD | "c^H/" | "c^H|" {return(MINGLE); /* AIS: CLC-INTERCAL ick_mingle symbols. The \xBD is ISO-8859-1 for cent. */} \$ | \xA2 | \xA3 | \xA4 | \xC2\xA2 | \xC2\xA3 | \xC2\xA4 | \xE2\x82\xA0 | \xE2\x82\xA1 | \xE2\x82\xA2 | \xE2\x82\xA3 | \xE2\x82\xA4 | \xE2\x82\xA5 | \xE2\x82\xA6 | \xE2\x82\xA7 | \xE2\x82\xA8 | \xE2\x82\xA9 | \xE2\x82\xAA | \xE2\x82\xAB | \xE2\x82\xAC | \xE0\xA7\xB2 | \xE0\xA7\xB3 | \xE0\xB8\xBF {return(MINGLE);} \~ {return(SELECT);} \/ {return(SLAT); /* AIS: Operand overloading */} \\ {return(BACKSLAT); /* ditto */} \& {yylval.numval = AND; return(UNARY);} V {yylval.numval = OR; return(UNARY);} \xA5 | \xBE | "V^H-" | \xE2\x88\x80 {yylval.numval = XOR; return(UNARY); /* AIS: CLC-INTERCAL uses \xBE, ISO-8859-1 for yen; for some reason, \xA5 is what was detected by the compiler during my tests, so that's here too */} \? {if(clclex) yylval.numval = WHIRL; else yylval.numval = XOR; return(UNARY); /* AIS: ? is a unary operator in both C-INTERCAL and CLC-INTERCAL, but with different meanings. */} \| | \^ {yylval.numval = FIN; return(UNARY); /* AIS: | is CLC */} @ {if(clclex) return(NOSPOT); /* AIS: a C/CLC ambiguity */ else {yylval.numval = WHIRL; return(UNARY);}} [2-5]{W}@ {yylval.numval = WHIRL + myatoi(yytext) - 1; return(UNARY);} \' {char temp = sparkearsstack[sparkearslev/32]&1; STACKSPARKEARS(0); /* AIS: I added all mentions of STACKSPARKEARS, OPEN\(SPARK\|EARS\), CLOSE\(SPARK\|EARS\), and CLEARSPARKEARSTACK */ return(temp?OPENSPARK:CLOSESPARK);} \" {char temp = sparkearsstack[sparkearslev/32]&1; STACKSPARKEARS(1); return(temp?CLOSEEARS:OPENEARS);} \({W}{D}\) {SETLINENO; yylval.numval = myatoi(yytext); return(LABEL);} DO {SETLINENO; CLEARSPARKEARSTACK; return(DO);} FAC {SETLINENO; CLEARSPARKEARSTACK; return(DO);} PLEASE {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);} PLACET {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);} PLEASE{W}DO {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);} PLACET{W}FACERE {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);} MAYBE {SETLINENO; CLEARSPARKEARSTACK; return(MAYBE);} MAYBE{W}DO {SETLINENO; CLEARSPARKEARSTACK; return(MAYBE);} MAYBE{W}PLEASE {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(MAYBE);} MAYBE{W}PLEASE{W}DO {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(MAYBE); /* AIS: I added all the MAYBE cases. It seems that MAYBE has no simple Latin synonym. */} NOT {return(NOT);} N\'T {return(NOT);} NON {return(NOT);} \xAA {return(NOT); /* AIS: CLC-INTERCAL again, this time it's ISO-8859-1 for the logical NOT symbol... */} \xAC {return(NOT); /* ... but my computer translates it to \xAC */} ONCE {return(ONCE);} QUONDAM {return(ONCE);} AGAIN {return(AGAIN);} ITERUM {return(AGAIN);} \%{W}{D} {yylval.numval = myatoi(yytext); if (yylval.numval && yylval.numval < 100) return(OHOHSEVEN); else ick_lose(IE017, iyylineno, (char *)NULL);} SUB {return(SUB);} MULTIPLICATUS{W}A | BY {return(BY);} \<- {return(GETS);} CALCULANDUM | CALCULATING {yylval.numval = GETS; return(GERUND);} ALIENERE | FORGET {return(FORGET);} ALIENENDUM | FORGETTING {yylval.numval = FORGET; return(GERUND);} RECOLERE | RESUME {return(RESUME);} RECOLERENDUM | RESUMING {yylval.numval = RESUME; return(GERUND);} EXUERE | STASH {return(STASH);} EXUENDUM | STASHING {yylval.numval = STASH; return(GERUND);} INUERE | RETRIEVE {return(RETRIEVE);} INUENDUM | RETRIEVING {yylval.numval = RETRIEVE; return(GERUND);} DISSIMULARE | IGNORE {return(IGNORE);} DISSIMULANDUM | IGNORING {yylval.numval = IGNORE; return(GERUND);} MEMINISSE | REMEMBER {return(REMEMBER);} MEMINISSENDUM | REMEMBERING {yylval.numval = REMEMBER; return(GERUND);} ABSTINERE | ABSTAIN {return(ABSTAIN);} ABSTINENDUM | ABSTAINING {yylval.numval = ABSTAIN; return(GERUND);} REINSTARE | REINSTATE {return(REINSTATE);} REINSTATANDUM | REINSTATING {yylval.numval = REINSTATE; return(GERUND);} LEGERE{W}EX | READ{W}OUT {return(READ_OUT);} LEGENDUM | READING{W}OUT {yylval.numval = READ_OUT; return(GERUND);} SCRIBERE{W}IN | WRITE{W}IN {return(WRITE_IN);} SCRIBENDUM | WRITING{W}IN {yylval.numval = WRITE_IN; return(GERUND);} COMMEMERO | COMMENTS | COMMENTING | COMMENT {yylval.numval = UNKNOWN; return(GERUND); /* AIS: An idea stolen from CLC-INTERCAL. The Latin means literally 'remind' or 'mention'. */} PIN {/* By AIS. I can't find a Latin translation for this. */ return(PIN);} PINNING {/* By AIS */ yylval.numval = PIN; return(GERUND);} DEINDERE{W}A{W}\({W}{D}\) | NEXT{W}FROM{W}\({W}{D}\) {/* AIS */ yylval.numval = myatoi(yytext); return(NEXTFROMLABEL);} DEINDERE{W}A | NEXT{W}FROM {/* AIS: 'next' is not a verb, so the Latin is invented */ return(NEXTFROMEXPR);} DEINDENDUM | NEXTING{W}FROM {/* AIS */ yylval.numval = NEXTFROMLABEL; return(GERUND);} ADVENIRE{W}DE{W}\({W}{D}\) | COME{W}FROM{W}\({W}{D}\) {/* AIS */ yylval.numval = myatoi(yytext); return(COME_FROM);} ADVENIRE{W}DE | COME{W}FROM {/* AIS */ return(COMPUCOME);} ADVENENDUM | COMING{W}FROM {yylval.numval = COME_FROM; return(GERUND);} DEINDE | NEXT {stbeginline = 0; return(NEXT);} PROXIMANDUM | NEXTING {yylval.numval = NEXT; return(GERUND);} FROM {return(FROM); /* AIS: Latin is 'A', which confuses the rest of the parser */} CONCEDERE | DESPERARE | GIVE{W}UP {return(GIVE_UP);} CONOR{W}ITERUM | TRY{W}AGAIN {return(TRY_AGAIN);} WHILE {return(WHILE); /* AIS. Latin for this is needed. */} WHILING | LOOPING {yylval.numval = WHILE; return(GERUND);} TRYING{W}AGAIN {yylval.numval = TRY_AGAIN; return(GERUND);} GO{W}BACK | REDIRE {return(GO_BACK);} GOING{W}BACK | REDENDUM {yylval.numval = GO_BACK; return(GERUND);} GO{W}AHEAD | GRASSOR {return(GO_AHEAD);} GOING{W}AHEAD {yylval.numval = GO_AHEAD; return(GERUND); /* AIS: I'm having a few deponent troubles with the Latin, so there are no Latin gerunds around here. Besides, the Latin 'gerunds' look somewhat like gerundives to me, but that's purely based on memory so I may be wrong. */} CREATE{W}\({W}{D}\) | CONFICE{W}\({W}{D}\) {yylval.numval = myatoi(yytext); return(CREATE);} CREATE | CONFICE {return(COMPUCREATE);} CREATING | CREATION | CONFICENDUM {yylval.numval = CREATE; return(GERUND);} \+ {return(INTERSECTION);} {W} ; {I} {/* AIS */ yylval.numval = *yytext; return(UNKNOWNID);} .\^H. {/* AIS */ yylval.numval = yytext[0]*256 + yytext[3]; if(yytext[0] > yytext[3]) yylval.numval = yytext[0] + yytext[3]*256; return(BADCHAR);} . {yylval.numval = yytext[0]; /* AIS: The line below for debug */ if(yydebug) fprintf(stdout, "yylex: bad char %#x\n",(unsigned char)yytext[0]); return(BADCHAR);} %% int lexer(void) { static int tok = BADCHAR; if (re_send_token) re_send_token = ick_FALSE; else { tok = yylex(); #ifdef YYDEBUG if (yydebug) (void) fprintf(stdout, "yylex: returning token %d\n", tok); #endif /* YYDEBUG */ } #ifdef YYDEBUG if (yydebug) (void) fprintf(stdout, "lexer: returning token %d\n", tok); #endif /* YYDEBUG */ return(tok); } static int myatoi(const char *text) /* AIS */ { #define MAXTEXT 100 static char buf[MAXTEXT]; static char thinbuf[MAXTEXT]; char* bp; char* tp; register int i; for(buf[i = 0] = '\0';*text && i < MAXTEXT;text++) { if(isdigit(*text)) { buf[i++] = *text; } } buf[i] = '\0'; bp=buf; tp=thinbuf; while(((*tp++=*bp++))); /* thinbuf code added by an AIS in case we want to work with wchar_t; the extra brackets tell GCC that this is intended and not a mistaken assignment */ return atoi(thinbuf); } void yyerror(const char *errtype) { #ifdef MAIN (void) printf("lextest: lexer error: %s.\n", errtype); #else /* MAIN */ (void) errtype; #endif /* MAIN */ } #ifdef MAIN int ick_main(void) { int t; while ((t = yylex()) > 0) { (void) printf("%03d %09d\n", t, yylval.numval); yylval.numval = 0; } return 0; } #endif /* MAIN */