view interps/c-intercal/src/lexer.l @ 12253:ad5c5d1b7d04 draft

<oerjan> t sled lib/karma//s/egrep.*>/egrep -x \'<[^>]*>/
author HackEso <hackeso@esolangs.org>
date Fri, 06 Dec 2019 07:53:22 +0000
parents 859f9b4339e6
children
line wrap: on
line source

%e 2000
%p 4000
%n 1000
%{
/* the directives above are for Solaris lex, and will be ignored by
 * flex */

/*
 * NAME
 *   lexer.l -- source for the C-INTERCAL lexical analyzer.
 *
LICENSE TERMS
    Copyright (C) 1996 Eric S. Raymond

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "ick.h"
#include "parser.h"
#include "ick_lose.h"

/*#undef wchar_t*/
/*#define wchar_t unsigned char*/

#ifndef yywrap
static int yywrap(void)
{
    return 1;
}
#endif /* yywrap */

int 	iyylineno = 1;

#ifdef MAIN
YYSTYPE	yylval;
#endif /* MAIN */

char **textlines = NULL;
int textlinecount = 0;
int politesse = 0;
int stbeginline = 0;

/* AIS: Sort out a grammar near-ambiguity */
unsigned long sparkearsstack[SENESTMAX] = {0};
int sparkearslev = 0;

/* AIS: Some symbols are ambiguous between C-INTERCAL and CLC-INTERCAL:
   Symbol    C-INTERCAL CLC-INTERCAL
   NOSPOT    _          @
   WHIRL     @          ?
   XOR       ?          yen, or bookworm (bookworm's also C-INTERCAL legal)
   By default, the C-INTERCAL meanings are used; the extern variable clclex
   causes CLC-INTERCAL interpretations to be put on the ambiguous symbols.
   Otherwise, mixing syntaxes freely is allowed. */
extern int clclex;

#ifdef FLEX_SCANNER
static char linebuf[YY_BUF_SIZE];
#else /* FLEX_SCANNER */
static char linebuf[YYLMAX];
#endif /* FLEX_SCANNER */

static char *lineptr = linebuf;

ick_bool re_send_token = ick_FALSE;

int lexer(void);
static int myatoi(const char *text);
void yyerror(const char *errtype);

#define SETLINENO					\
    {if (stbeginline == 0)	stbeginline = iyylineno;\
     else if (stbeginline < 0)	stbeginline = 0;}

/* AIS: Keep track of the spark/ears situation */
#define STACKSPARKEARS(a) \
    if (sparkearslev+1>=SENESTMAX*32) ick_lose(IE281, iyylineno, (char*) NULL); \
    sparkearslev++; sparkearsstack[sparkearslev/32]<<=1; \
    sparkearsstack[sparkearslev/32]+=a
#define CLEARSPARKEARSTACK {int i=SENESTMAX; \
                           while(i--) sparkearsstack[i] = 0;} \
                           sparkearslev = 0

/*
 * The spectacular ugliness of INTERCAL syntax requires that the lexical
 * analyzer have two levels. One, embedded in the getc() function, handles
 * logical-line continuation and the ! abbrev, and stashes each logical
 * line away in a buffer accessible to the code generator (this is necessary
 * for the * construct to be interpreted correctly). The upper level is
 * generated by lex(1) and does normal tokenizing.
 */

#undef getc
int getc(FILE *fp)
{
    extern FILE* yyin;

    static ick_bool bangflag = ick_FALSE;
    static ick_bool backflag = ick_FALSE;

    static ick_bool eolflag = ick_FALSE;

    if ((size_t)(lineptr - linebuf) > sizeof linebuf)
      ick_lose(IE666, iyylineno, (char *)NULL);

    if (bangflag)
    {
	bangflag = ick_FALSE;
	/*	*lineptr++ = '!'; */
	return('.');
    }
    else if (backflag)	/* converting ctrl-H (backspace) to two chars "^H" */
    {
	backflag = ick_FALSE;
	/*	*lineptr++ = '\b'; */
	return('H');
    }
    else
    {
      int c;
      char c_char;
      /*fprintf(stderr,"about to fgetc(\045p)",(void*)fp);*/

      c_char=0; /* AIS */
      (void)fread(&c_char,1,1,fp);
      /* AIS: ignore the first \r in a row to deal with DOS newlines. The
	 second in a row is definitely an error, though, and will be caught
	 later on. */
      if(c_char=='\r') (void)fread(&c_char,1,1,fp);
      c = c_char;

      if(feof(fp)) c=EOF;
      if(!eolflag && c == EOF) c = '\n';

      /*fprintf(stderr,"getc input a character: %c\n",c);*/

      if (feof(yyin))
      {
	*lineptr = '\0';
	if(eolflag) return(EOF);
	if(c=='\0' || c==EOF) c='\n';
      }

      eolflag = ick_FALSE;

      if (c == '!')
      {
	*lineptr++ = '!';
	bangflag = ick_TRUE;
	return(c = '\'');
      }
      else if (c == '\b')	/* convert ctrl-H (backspace) to
				   two chars "^" and "H" so lex can take it */
      {
	*lineptr++ = '\b';
	backflag = ick_TRUE;
	return(c = '^');
      }
      else if (c == '\n')
      {
	*lineptr = '\0';
	lineptr = linebuf;
	if (iyylineno >= textlinecount)
	{
	  textlinecount += ALLOC_CHUNK;
	  if (textlines)
	    textlines = realloc(textlines,
				textlinecount * sizeof(char*));
	  else
	    textlines = malloc(textlinecount * sizeof(char*));
	  if (!textlines)
	    ick_lose(IE666, iyylineno, (char *)NULL);
	}
	textlines[iyylineno] = malloc(1 + strlen(linebuf));
	if (!textlines[iyylineno])
	  ick_lose(IE666, iyylineno, (char *)NULL);
	strcpy(textlines[iyylineno], linebuf);
	iyylineno++;
	eolflag=ick_TRUE;
	return('\n');
      }
      else
      {
	return(*lineptr++ = c);
      }
    }
}

/* replace YY_INPUT so that it uses our getc function. */
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
	{ \
	  int c = getc(yyin); \
	  if (c == EOF) { \
	    if (ferror(yyin)) \
	      YY_FATAL_ERROR("input in flex scanner failed"); \
	    result = YY_NULL; \
	  } else { \
	    buf[0] = c; \
	    result = 1; \
	  } \
	}

%}

W	[\ \t\n]*
D	[0-9][\ \t\n0-9]*
I       [A-Z]

%%

{D}		{yylval.numval = myatoi(yytext); return(NUMBER);}
\_              {return(NOSPOT);}
\.		{return(ick_ONESPOT);}
\:		{return(ick_TWOSPOT);}
\,		{return(ick_TAIL);}
\;		{return(ick_HYBRID);}
\#		{return(MESH);}

\xBD            |
"c^H/"          |
"c^H|"          {return(MINGLE); /* AIS: CLC-INTERCAL ick_mingle symbols.
				    The \xBD is ISO-8859-1 for cent. */}
\$		|
\xA2		|
\xA3		|
\xA4		|
\xC2\xA2	|
\xC2\xA3	|
\xC2\xA4	|
\xE2\x82\xA0	|
\xE2\x82\xA1	|
\xE2\x82\xA2	|
\xE2\x82\xA3	|
\xE2\x82\xA4	|
\xE2\x82\xA5	|
\xE2\x82\xA6	|
\xE2\x82\xA7	|
\xE2\x82\xA8	|
\xE2\x82\xA9	|
\xE2\x82\xAA	|
\xE2\x82\xAB	|
\xE2\x82\xAC	|
\xE0\xA7\xB2	|
\xE0\xA7\xB3	|
\xE0\xB8\xBF	{return(MINGLE);}

\~		{return(SELECT);}

\/		{return(SLAT); /* AIS: Operand overloading */}
\\		{return(BACKSLAT); /* ditto */}

\&		{yylval.numval = AND; return(UNARY);}
V		{yylval.numval = OR; return(UNARY);}
\xA5            |
\xBE            |
"V^H-"		|
\xE2\x88\x80    {yylval.numval = XOR; return(UNARY);
                 /* AIS: CLC-INTERCAL uses \xBE, ISO-8859-1 for yen;
                    for some reason, \xA5 is what was detected by the compiler
		    during my tests, so that's here too */}
\?              {if(clclex) yylval.numval = WHIRL;
                 else yylval.numval = XOR; return(UNARY);
                 /* AIS: ? is a unary operator in both C-INTERCAL and
		    CLC-INTERCAL, but with different meanings. */}
\|              |
\^		{yylval.numval = FIN; return(UNARY); /* AIS: | is CLC */}
@		{if(clclex) return(NOSPOT); /* AIS: a C/CLC ambiguity */
                 else {yylval.numval = WHIRL; return(UNARY);}}
[2-5]{W}@	{yylval.numval = WHIRL + myatoi(yytext) - 1; return(UNARY);}

\'		{char temp = sparkearsstack[sparkearslev/32]&1;
                 STACKSPARKEARS(0); /* AIS: I added all mentions of
				       STACKSPARKEARS, OPEN\(SPARK\|EARS\),
				       CLOSE\(SPARK\|EARS\),
				       and CLEARSPARKEARSTACK */
                 return(temp?OPENSPARK:CLOSESPARK);}
\"		{char temp = sparkearsstack[sparkearslev/32]&1;
                 STACKSPARKEARS(1);
                 return(temp?CLOSEEARS:OPENEARS);}

\({W}{D}\)	{SETLINENO; yylval.numval = myatoi(yytext); return(LABEL);}

DO		{SETLINENO; CLEARSPARKEARSTACK; return(DO);}
FAC		{SETLINENO; CLEARSPARKEARSTACK; return(DO);}
PLEASE		{SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);}
PLACET		{SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);}
PLEASE{W}DO	{SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);}
PLACET{W}FACERE	{SETLINENO; CLEARSPARKEARSTACK; politesse++; return(DO);}
MAYBE           {SETLINENO; CLEARSPARKEARSTACK; return(MAYBE);}
MAYBE{W}DO      {SETLINENO; CLEARSPARKEARSTACK; return(MAYBE);}
MAYBE{W}PLEASE  {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(MAYBE);}
MAYBE{W}PLEASE{W}DO {SETLINENO; CLEARSPARKEARSTACK; politesse++; return(MAYBE);
                     /* AIS: I added all the MAYBE cases. It seems that MAYBE
                             has no simple Latin synonym. */}

NOT		{return(NOT);}
N\'T		{return(NOT);}
NON		{return(NOT);}
\xAA            {return(NOT); /* AIS: CLC-INTERCAL again, this time it's
				 ISO-8859-1 for the logical NOT symbol... */}
\xAC            {return(NOT); /* ... but my computer translates it to \xAC */}

ONCE            {return(ONCE);}
QUONDAM         {return(ONCE);}
AGAIN           {return(AGAIN);}
ITERUM          {return(AGAIN);}

\%{W}{D}	{yylval.numval = myatoi(yytext);
                 if (yylval.numval && yylval.numval < 100)
		   return(OHOHSEVEN);
		 else
		   ick_lose(IE017, iyylineno, (char *)NULL);}
SUB		{return(SUB);}
MULTIPLICATUS{W}A	|
BY		{return(BY);}

\<-		{return(GETS);}
CALCULANDUM	|
CALCULATING	{yylval.numval = GETS; return(GERUND);}

ALIENERE	|
FORGET		{return(FORGET);}
ALIENENDUM	|
FORGETTING	{yylval.numval = FORGET; return(GERUND);}
RECOLERE	|
RESUME		{return(RESUME);}
RECOLERENDUM	|
RESUMING	{yylval.numval = RESUME; return(GERUND);}
EXUERE		|
STASH		{return(STASH);}
EXUENDUM	|
STASHING	{yylval.numval = STASH; return(GERUND);}
INUERE		|
RETRIEVE	{return(RETRIEVE);}
INUENDUM	|
RETRIEVING	{yylval.numval = RETRIEVE; return(GERUND);}
DISSIMULARE	|
IGNORE		{return(IGNORE);}
DISSIMULANDUM	|
IGNORING	{yylval.numval = IGNORE; return(GERUND);}
MEMINISSE	|
REMEMBER	{return(REMEMBER);}
MEMINISSENDUM	|
REMEMBERING	{yylval.numval = REMEMBER; return(GERUND);}
ABSTINERE	|
ABSTAIN         {return(ABSTAIN);}
ABSTINENDUM	|
ABSTAINING	{yylval.numval = ABSTAIN; return(GERUND);}
REINSTARE	|
REINSTATE	{return(REINSTATE);}
REINSTATANDUM	|
REINSTATING	{yylval.numval = REINSTATE; return(GERUND);}
LEGERE{W}EX	|
READ{W}OUT	{return(READ_OUT);}
LEGENDUM	|
READING{W}OUT	{yylval.numval = READ_OUT; return(GERUND);}
SCRIBERE{W}IN	|
WRITE{W}IN	{return(WRITE_IN);}
SCRIBENDUM	|
WRITING{W}IN	{yylval.numval = WRITE_IN; return(GERUND);}
COMMEMERO       |
COMMENTS        |
COMMENTING      |
COMMENT         {yylval.numval = UNKNOWN; return(GERUND);
                 /* AIS: An idea stolen from CLC-INTERCAL.
		    The Latin means literally 'remind' or 'mention'. */}
PIN             {/* By AIS. I can't find a Latin translation for this. */
                 return(PIN);}
PINNING         {/* By AIS */ yylval.numval = PIN; return(GERUND);}
DEINDERE{W}A{W}\({W}{D}\) |
NEXT{W}FROM{W}\({W}{D}\) {/* AIS */ yylval.numval = myatoi(yytext);
			  return(NEXTFROMLABEL);}
DEINDERE{W}A    |
NEXT{W}FROM     {/* AIS: 'next' is not a verb, so the Latin is invented */
                 return(NEXTFROMEXPR);}
DEINDENDUM	|
NEXTING{W}FROM	{/* AIS */ yylval.numval = NEXTFROMLABEL; return(GERUND);}
ADVENIRE{W}DE{W}\({W}{D}\) |
COME{W}FROM{W}\({W}{D}\) {/* AIS */ yylval.numval = myatoi(yytext);
			  return(COME_FROM);}
ADVENIRE{W}DE   |
COME{W}FROM     {/* AIS */ return(COMPUCOME);}
ADVENENDUM	|
COMING{W}FROM	{yylval.numval = COME_FROM; return(GERUND);}
DEINDE		|
NEXT		{stbeginline = 0; return(NEXT);}
PROXIMANDUM	|
NEXTING		{yylval.numval = NEXT; return(GERUND);}
FROM            {return(FROM); /* AIS: Latin is 'A', which confuses the rest
                                  of the parser */}
CONCEDERE	|
DESPERARE	|
GIVE{W}UP	{return(GIVE_UP);}
CONOR{W}ITERUM  |
TRY{W}AGAIN     {return(TRY_AGAIN);}
WHILE           {return(WHILE); /* AIS. Latin for this is needed. */}
WHILING         |
LOOPING         {yylval.numval = WHILE; return(GERUND);}
TRYING{W}AGAIN  {yylval.numval = TRY_AGAIN; return(GERUND);}
GO{W}BACK       |
REDIRE          {return(GO_BACK);}
GOING{W}BACK    |
REDENDUM        {yylval.numval = GO_BACK; return(GERUND);}
GO{W}AHEAD      |
GRASSOR         {return(GO_AHEAD);}
GOING{W}AHEAD   {yylval.numval = GO_AHEAD; return(GERUND);
		 /* AIS: I'm having a few deponent troubles with the Latin, so
		    there are no Latin gerunds around here. Besides, the Latin
		    'gerunds' look somewhat like gerundives to me, but that's
		    purely based on memory so I may be wrong. */}
CREATE{W}\({W}{D}\)          |
CONFICE{W}\({W}{D}\)         {yylval.numval = myatoi(yytext); return(CREATE);}
CREATE          |
CONFICE         {return(COMPUCREATE);}
CREATING        |
CREATION        |
CONFICENDUM     {yylval.numval = CREATE; return(GERUND);}

\+		{return(INTERSECTION);}

{W}     	;
{I}             {/* AIS */ yylval.numval = *yytext; return(UNKNOWNID);}
.\^H.           {/* AIS */ yylval.numval = yytext[0]*256 + yytext[3];
		 if(yytext[0] > yytext[3])
		   yylval.numval = yytext[0] + yytext[3]*256;
                 return(BADCHAR);}
.		{yylval.numval = yytext[0]; /* AIS: The line below for debug */
 if(yydebug) fprintf(stdout, "yylex: bad char %#x\n",(unsigned char)yytext[0]);
                 return(BADCHAR);}


%%

int lexer(void)
{
    static int tok = BADCHAR;

    if (re_send_token)
	re_send_token = ick_FALSE;
    else 
    {
	tok = yylex();
#ifdef YYDEBUG
	if (yydebug)
	    (void) fprintf(stdout, "yylex: returning token %d\n", tok); 
#endif /* YYDEBUG */
    }

#ifdef YYDEBUG
    if (yydebug)
	(void) fprintf(stdout, "lexer: returning token %d\n", tok); 
#endif /* YYDEBUG */
    return(tok);
}

static int myatoi(const char *text) /* AIS */
{
#define MAXTEXT 100
    static char buf[MAXTEXT];
    static char thinbuf[MAXTEXT];
    char* bp;
    char* tp;
    register int i;

    for(buf[i = 0] = '\0';*text && i < MAXTEXT;text++) {
	if(isdigit(*text)) {
	    buf[i++] = *text;
	}
    }
    buf[i] = '\0';
    bp=buf; tp=thinbuf;
    while(((*tp++=*bp++))); /* thinbuf code added by an AIS
			       in case we want to work with wchar_t;
			       the extra brackets tell GCC that this is
			       intended and not a mistaken assignment */
    return atoi(thinbuf);
}

void yyerror(const char *errtype)
{
#ifdef MAIN
    (void) printf("lextest: lexer error: %s.\n", errtype);
#else /* MAIN */
    (void) errtype;
#endif /* MAIN */
}

#ifdef MAIN
int ick_main(void)
{
	int	t;

	while ((t = yylex()) > 0)
	{
		(void) printf("%03d %09d\n", t, yylval.numval);
		yylval.numval = 0;
	}
	return 0;
}
#endif /* MAIN */