view UNPA @ 4654:d6d727dead8d

<oerjan> addquote <int-e> The people of Procrasti hereby resolve to lodge a formal complaint with Taneb and nortti about their ridicule of Procasti\'s glorious nation... later.
author HackBot
date Sat, 07 Jun 2014 19:24:43 +0000
parents 5fddb59b5d15
children
line wrap: on
line source

/*
  UTF-to-VLQ
  Public domain
*/

#include <stdio.h>
#include <stdlib.h>

#ifdef _WIN32
#include <fcntl.h>
#endif

/* One raw input byte, as returned by getb(). */
typedef unsigned char byte;
/* Value type shared by all readers and writers in this file. */
typedef unsigned long long ULL;

/* Reader: takes no arguments and returns the next decoded value. */
typedef ULL(*in_func_t)(void);
/* Writer: receives one value and emits its encoding. */
typedef void(*out_func_t)(ULL);

/* Input format letter; main() sets it from argv[1][0] and uses it to index in_func. */
char in_mode;
/* Output format letter; main() sets it from argv[1][1]. write_utf8() also checks it for '0'. */
char out_mode;
/* Flag per option letter: main() sets options[c&127]=1 for each character of argv[1] after the first two. */
int options[128];
/* Byte-to-value table used by read_translate()/write_translate(); main() loads it from the file named in argv[2]. */
ULL translation[256];

/* Option 'L': main() turns an input value of 13 into 10 and drops a 10 that directly follows it. */
#define conv_lf options['L']
/* Option 'c': main() writes a 13 before every 10 it outputs. */
#define conv_cr options['c']
/* Option 'b': the first value read must be 0xFEFF (else exit status 1); that value is not written. */
#define bom_in options['b']
/* Option 'B': main() writes 0xFEFF before any converted data. */
#define bom_out options['B']
/* Option 't': multi-byte entries of the translation table file are little-endian (default is big-endian). */
#define trans_le options['t']

/*
  Fetch the next byte from stdin.
  When fgetc() reports EOF the whole process exits with status 0,
  so callers never see an end-of-input marker.
*/
byte getb(void) {
  const int ch=fgetc(stdin);
  if(ch==EOF) {
    exit(0);
  }
  return ch;
}

/*
  Sign-extend x from bit position y: if bit y of x is set, every bit from
  y upward is set in the result; otherwise x is returned unchanged.
  y must be in the range 0..63.

  Declared static inline: a plain `inline` definition with no external
  definition may fail to link under C99/C11 when the call is not inlined.
  All shifts are done on unsigned values, since left-shifting a negative
  number (or shifting 1LL into the sign bit) is undefined.
*/
static inline ULL sign_extend(ULL x,int y) {
  const ULL sign_bit=1ULL<<y;
  const ULL high_bits=~0ULL<<y;
  return (x&sign_bit)?(x|high_bits):x;
}

/* Read a single byte from stdin and return it as a value in 0..255. */
ULL read_8bit_raw(void) {
  ULL value=getb();
  return value;
}

/* Read two bytes from stdin as a little-endian 16-bit value (low byte first). */
ULL read_16bit_le_raw(void) {
  ULL lo=getb();
  ULL hi=getb();
  return lo|(hi<<8);
}

/* Read two bytes from stdin as a big-endian 16-bit value (high byte first). */
ULL read_16bit_be_raw(void) {
  ULL hi=getb();
  ULL lo=getb();
  return (hi<<8)|lo;
}

ULL read_32bit_le_raw(void) {
  ULL x=getb();
  x|=getb()<<8;
  x|=getb()<<16;
  return x|(getb()<<24);
}

/*
  Read four bytes from stdin as a big-endian 32-bit unsigned value.
  The accumulator is a ULL, so the most significant byte is never shifted
  as a signed int (which overflowed for bytes >= 0x80 and set the upper
  32 bits of the result).
*/
ULL read_32bit_be_raw(void) {
  ULL x=0;
  int i;
  for(i=0;i<4;i++) x=(x<<8)|getb();
  return x;
}

/* Read eight bytes from stdin as a little-endian 64-bit value (lowest byte first). */
ULL read_64bit_le_raw(void) {
  ULL value=0;
  int shift;
  for(shift=0;shift<64;shift+=8) {
    value|=((ULL)getb())<<shift;
  }
  return value;
}

/*
  Read eight bytes from stdin as a big-endian 64-bit value (highest byte
  first). All eight bytes are consumed; the previous version stopped after
  seven and never read the least significant byte.
*/
ULL read_64bit_be_raw(void) {
  ULL x=0;
  int i;
  for(i=0;i<8;i++) x=(x<<8)|getb();
  return x;
}

/*
  Decode one UTF-8 style sequence from stdin.
  The lead byte selects how many continuation bytes follow (1 to 6) and
  how many payload bits it carries itself; each continuation byte adds
  its low six bits. Lead bytes 0xFE and 0xFF both introduce six
  continuation bytes. Any other byte (including a stray continuation
  byte) is returned unchanged. Continuation bytes are not validated.
*/
ULL read_utf8(void) {
  ULL lead=getb();
  ULL code;
  int extra;
  if((lead&0xE0)==0xC0) {
    code=lead&0x1F;
    extra=1;
  } else if((lead&0xF0)==0xE0) {
    code=lead&0x0F;
    extra=2;
  } else if((lead&0xF8)==0xF0) {
    code=lead&0x07;
    extra=3;
  } else if((lead&0xFC)==0xF8) {
    code=lead&0x03;
    extra=4;
  } else if((lead&0xFE)==0xFC) {
    code=lead&0x01;
    extra=5;
  } else if(lead==0xFE || lead==0xFF) {
    code=lead&0x01;
    extra=6;
  } else {
    return lead;
  }
  while(extra-->0) code=(code<<6)|(getb()&0x3F);
  return code;
}

/*
  Decode one big-endian variable-length quantity from stdin: every byte
  carries seven payload bits, most significant group first, and the high
  bit marks "more bytes follow".
  The accumulator is shifted before the final byte is merged as well;
  without that shift the last group overwrote the previous one
  (0x81 0x00 decoded as 1 instead of 128).
*/
ULL read_vlq8(void) {
  ULL r=0;
  byte x=getb();
  while(x&0x80) {
    r=(r<<7)|(x&0x7F);
    x=getb();
  }
  return (r<<7)|x;
}

ULL read_leb128(void) {
  byte x=getb();
  int i=0;
  ULL r=0;
  while(x&0x80) {
    r|=(x&0x7F)<<(7*(i++));
    x=getb();
  }
  return r|(x<<(7*i));
}

/*
  Decode one code point from little-endian UTF-16 on stdin.
  A unit in 0xD800..0xDBFF is taken as a high surrogate and combined with
  the following unit; the second unit is not checked for being a low
  surrogate. Any other unit is returned as is.
  Units are read with read_16bit_le_raw(); the previous code shifted by 16
  instead of 8 and placed the first byte in the high position.
*/
ULL read_utf16_le(void) {
  ULL hi=read_16bit_le_raw();
  if(hi>=0xD800 && hi<0xDC00) {
    ULL lo=read_16bit_le_raw();
    return (((hi&0x3FF)<<10)|(lo&0x3FF))+0x10000ULL;
  }
  return hi;
}

/*
  Decode one code point from big-endian UTF-16 on stdin.
  A unit in 0xD800..0xDBFF is taken as a high surrogate and combined with
  the following unit; the second unit is not checked for being a low
  surrogate. Any other unit is returned as is.
  Units are read with read_16bit_be_raw(); the previous code shifted by 16
  instead of 8 and placed the first byte in the low position.
*/
ULL read_utf16_be(void) {
  ULL hi=read_16bit_be_raw();
  if(hi>=0xD800 && hi<0xDC00) {
    ULL lo=read_16bit_be_raw();
    return (((hi&0x3FF)<<10)|(lo&0x3FF))+0x10000ULL;
  }
  return hi;
}

ULL read_translate(void) {
  return translation[getb()];
}

ULL read_messagepack(void) {
  byte x;
  ULL s;
  float f;
  double d;
  for(;;) {
    x=getb();
    switch(x) {
      case 0x00 ... 0x7F: return x;
      case 0x80 ... 0x9F: continue;
      case 0xA0 ... 0xBF:
        while(x-->0xA0) putchar(getb());
        continue;
      case 0xC0: continue;
      case 0xC2: return 0;
      case 0xC3: return -1LL;
      case 0xCA:
        *(short*)&f=read_16bit_be_raw(); // Not completely portable
        return (ULL)f;
      case 0xCB:
        *(int*)&d=read_32bit_be_raw(); // Not completely portable
        return (ULL)f;
      case 0xCC: return read_8bit_raw();
      case 0xCD: return read_16bit_be_raw();
      case 0xCE: return read_32bit_be_raw();
      case 0xCF: return read_64bit_be_raw();
      case 0xD0: return sign_extend(read_8bit_raw(),7);
      case 0xD1: return sign_extend(read_16bit_be_raw(),15);
      case 0xD2: return sign_extend(read_32bit_be_raw(),31);
      case 0xD3: return read_64bit_be_raw();
      case 0xDA:
        s=read_16bit_be_raw();
        while(s--) putchar(getb());
        continue;
      case 0xDB:
        s=read_32bit_be_raw();
        while(s--) putchar(getb());
        continue;
      case 0xDC: read_16bit_be_raw(); continue;
      case 0xDD: read_32bit_be_raw(); continue;
      case 0xDE: read_16bit_be_raw(); continue;
      case 0xDF: read_32bit_be_raw(); continue;
      case 0xE0 ... 0xFF: return x|~31LL;
      default: exit(1);
    }
  }
}

/*
  Read two hex digits from stdin and combine them into one value.
  Characters that compare <= ' ' are skipped before each digit.
  A digit's value is its low four bits, plus 9 when the character is
  >= 'A'. Characters are not checked for being valid hex digits.
*/
ULL read_hex(void) {
  char digit[2];
  int nibble[2];
  int k;
  for(k=0;k<2;k++) {
    do digit[k]=getb(); while(digit[k]<=' ');
    nibble[k]=(digit[k]&15)+(digit[k]>='A'?9:0);
  }
  return (nibble[0]<<4)|nibble[1];
}

/* Write the low eight bits of x to stdout as one byte. */
void write_8bit_raw(ULL x) {
  const int low=(int)(x&255);
  putchar(low);
}

/* Write the low 16 bits of x to stdout, low byte first. */
void write_16bit_le_raw(ULL x) {
  const int lo=(int)(x&255);
  const int hi=(int)((x>>8)&255);
  putchar(lo);
  putchar(hi);
}

/* Write the low 16 bits of x to stdout, high byte first. */
void write_16bit_be_raw(ULL x) {
  const int hi=(int)((x>>8)&255);
  const int lo=(int)(x&255);
  putchar(hi);
  putchar(lo);
}

/* Write the low 32 bits of x to stdout as four bytes, least significant first. */
void write_32bit_le_raw(ULL x) {
  int shift;
  for(shift=0;shift<32;shift+=8) {
    putchar((int)((x>>shift)&255));
  }
}

/* Write the low 32 bits of x to stdout as four bytes, most significant first. */
void write_32bit_be_raw(ULL x) {
  int shift;
  for(shift=24;shift>=0;shift-=8) {
    putchar((int)((x>>shift)&255));
  }
}

/* Write x to stdout as eight bytes, least significant first. */
void write_64bit_le_raw(ULL x) {
  int shift;
  for(shift=0;shift<64;shift+=8) {
    putchar((int)((x>>shift)&255));
  }
}

/* Write x to stdout as eight bytes, most significant first. */
void write_64bit_be_raw(ULL x) {
  int shift;
  for(shift=56;shift>=0;shift-=8) {
    putchar((int)((x>>shift)&255));
  }
}

/*
  Write x to stdout as a UTF-8 style sequence of one to seven bytes.
  Values below 0x80 are a single byte; each longer form adds one
  continuation byte carrying six bits, up to a 0xFE lead byte followed by
  six continuation bytes for values below 2^36. Larger values end the
  program with status 1.
  When out_mode is '0', the value 0 is written as the two bytes 0xC0 0x80.
*/
void write_utf8(ULL x) {
  /* limit[n] is the first value that no longer fits in n+1 bytes. */
  static const ULL limit[7]={
    0x80ULL,0x800ULL,0x10000ULL,0x200000ULL,
    0x4000000ULL,0x80000000ULL,0x1000000000ULL
  };
  /* lead[n] is the marker of the first byte when n continuation bytes follow. */
  static const int lead[7]={0x00,0xC0,0xE0,0xF0,0xF8,0xFC,0xFE};
  int tail=0;
  int k;
  if(out_mode=='0' && !x) {
    putchar(0xC0);
    putchar(0x80);
    return;
  }
  while(tail<7 && x>=limit[tail]) tail++;
  if(tail==7) exit(1);
  putchar((int)(lead[tail]|(x>>(6*tail))));
  for(k=tail-1;k>=0;k--) {
    putchar((int)(0x80|((x>>(6*k))&0x3F)));
  }
}

/*
  Write x to stdout as a big-endian variable-length quantity: seven bits
  per byte, most significant group first, with the high bit set on every
  byte except the last. Leading groups are omitted when no bit at or
  above them is set.
  The "any bit at or above `shift`" test is written as x>>shift; the
  previous mask -(1LL<<i) overflowed signed long long for i==63.
*/
void write_vlq8(ULL x) {
  int shift;
  for(shift=63;shift>0;shift-=7) {
    if(x>>shift) putchar((int)(0x80|((x>>shift)&0x7F)));
  }
  putchar((int)(x&0x7F));
}

/*
  Write x to stdout as unsigned LEB128: seven bits per byte, least
  significant group first, high bit set on every byte except the last.
*/
void write_leb128(ULL x) {
  for(;x>0x7FULL;x>>=7) {
    putchar((int)(0x80|(x&0x7F)));
  }
  putchar((int)x);
}

/*
  Write x to stdout as little-endian UTF-16.
  Values above 0x10FFFF end the program with status 1. Values of 0x10000
  and up are written as a surrogate pair; everything else is one unit.
*/
void write_utf16_le(ULL x) {
  if(x>0x10FFFFULL) exit(1);
  if(!(x&0x1F0000ULL)) {
    write_16bit_le_raw(x);
    return;
  }
  x-=0x10000ULL;
  write_16bit_le_raw(0xD800|(x>>10));
  write_16bit_le_raw(0xDC00|(x&0x3FF));
}

/*
  Write x to stdout as big-endian UTF-16.
  Values above 0x10FFFF end the program with status 1. Values of 0x10000
  and up are written as a surrogate pair; everything else is one unit.
*/
void write_utf16_be(ULL x) {
  if(x>0x10FFFFULL) exit(1);
  if(!(x&0x1F0000ULL)) {
    write_16bit_be_raw(x);
    return;
  }
  x-=0x10000ULL;
  write_16bit_be_raw(0xD800|(x>>10));
  write_16bit_be_raw(0xDC00|(x&0x3FF));
}

/*
  Write the byte whose translation[] entry equals x.
  Only the first (lowest-index) match is written, so a table with
  duplicate entries still yields exactly one output byte per value;
  previously every matching index was emitted.
  Values with no entry in the table produce no output.
*/
void write_translate(ULL x) {
  int i;
  for(i=0;i<256;i++) {
    if(translation[i]==x) {
      putchar(i);
      return;
    }
  }
}

/* Print x, converted to int, to stdout as uppercase hex padded to at least two digits. */
void write_hex(ULL x) {
  const int value=(int)x;
  printf("%02X",value);
}

/*
  Reader dispatch table, indexed by the input format letter (argv[1][0]).
  Letters without an entry are null, which main() rejects.
*/
const in_func_t in_func[128]={
  ['8']=read_8bit_raw,      /* single bytes */
  ['w']=read_16bit_le_raw,  /* 16-bit units, little-endian */
  ['W']=read_16bit_be_raw,  /* 16-bit units, big-endian */
  ['d']=read_32bit_le_raw,  /* 32-bit units, little-endian */
  ['D']=read_32bit_be_raw,  /* 32-bit units, big-endian */
  ['q']=read_64bit_le_raw,  /* 64-bit units, little-endian */
  ['Q']=read_64bit_be_raw,  /* 64-bit units, big-endian */
  ['1']=read_utf8,          /* UTF-8 */
  ['0']=read_utf8,          /* UTF-8; same reader as '1' */
  ['V']=read_vlq8,          /* VLQ, most significant group first */
  ['v']=read_leb128,        /* LEB128, least significant group first */
  ['u']=read_utf16_le,      /* UTF-16, little-endian */
  ['U']=read_utf16_be,      /* UTF-16, big-endian */
  ['T']=read_translate,     /* bytes mapped through translation[] */
  ['M']=read_messagepack,   /* MessagePack (input only) */
  ['4']=read_hex,           /* pairs of hex digits */
  [0]=0
};

/*
  Writer dispatch table, indexed by the output format letter (argv[1][1]).
  Letters without an entry are null, which main() rejects; there is no
  writer for 'M'.
*/
const out_func_t out_func[128]={
  ['8']=write_8bit_raw,      /* single bytes */
  ['w']=write_16bit_le_raw,  /* 16-bit units, little-endian */
  ['W']=write_16bit_be_raw,  /* 16-bit units, big-endian */
  ['d']=write_32bit_le_raw,  /* 32-bit units, little-endian */
  ['D']=write_32bit_be_raw,  /* 32-bit units, big-endian */
  ['q']=write_64bit_le_raw,  /* 64-bit units, little-endian */
  ['Q']=write_64bit_be_raw,  /* 64-bit units, big-endian */
  ['1']=write_utf8,          /* UTF-8 */
  ['0']=write_utf8,          /* UTF-8 with the value 0 written as 0xC0 0x80 */
  ['V']=write_vlq8,          /* VLQ, most significant group first */
  ['v']=write_leb128,        /* LEB128, least significant group first */
  ['u']=write_utf16_le,      /* UTF-16, little-endian */
  ['U']=write_utf16_be,      /* UTF-16, big-endian */
  ['T']=write_translate,     /* values mapped back through translation[] */
  ['4']=write_hex,           /* hex digits */
  [0]=0
};

int main(int argc,char**argv) {
  int b;
  int is_lf=0;
  ULL x;
#ifdef _WIN32
  _setmode(_fileno(stdin),_O_BINARY);
  _setmode(_fileno(stdout),_O_BINARY);
#endif
  if(argc<2 || !argv[1][0] || !in_func[argv[1][0]] || !out_func[argv[1][1]]) return 1;
  in_mode=argv[1][0];
  out_mode=argv[1][1];
  for(b=2;argv[1][b];b++) options[argv[1][b]&127]=1;
  if(argc>2) {
    FILE*fp=fopen(argv[2],"rb");
    int i;
    if(!fp) return 1;
    fseek(fp,0,SEEK_END);
    b=ftell(fp)>>8;
    rewind(fp);
    for(i=0;i<255;i++) {
      translation[i]=fgetc(fp);
      if(b>1) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<8)):((translation[i]<<8)|fgetc(fp));
      if(b>2) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<16)):((translation[i]<<8)|fgetc(fp));
      if(b>3) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<24)):((translation[i]<<8)|fgetc(fp));
    }
    fclose(fp);
  }
  if(bom_out) out_func[out_mode&127](0xFEFF);
  while(!feof(stdin)) {
    x=in_func[in_mode&127]();
    if(bom_in && x!=0xFEFF) return 1;
    if(is_lf && x==10) {
      is_lf=0;
      continue;
    }
    if(is_lf=(conv_lf && x==13)) x=10;
    if(conv_cr && x==10) out_func[out_mode&127](13);
    if(!bom_in) out_func[out_mode&127](x);
    bom_in=0;
  }
  return 0;
}