view UNPA @ 4621:d84364ce79eb

<oerjan> addquote <olsner> oh, fungot moved to github? <fungot> olsner: when i finally found myself fnord and free, i awkwardly clambered down to the sea and fnord down from the fifth story. he had known merely by reputation or not at all.
author HackBot
date Tue, 29 Apr 2014 06:55:26 +0000
parents 5fddb59b5d15
children
line wrap: on
line source

/*
  UTF-to-VLQ
  Public domain
*/

#include <stdio.h>
#include <stdlib.h>

#ifdef _WIN32
#include <fcntl.h>
#endif

/* One raw octet as read from or written to a stream. */
typedef unsigned char byte;
/* Integer type used for every decoded value passed between readers and writers. */
typedef unsigned long long ULL;

/* Reader: takes no arguments and returns the next decoded value. */
typedef ULL(*in_func_t)(void);
/* Writer: takes one value to encode and returns nothing. */
typedef void(*out_func_t)(ULL);

/* Input format letter; main() copies it from argv[1][0]. */
char in_mode;
/* Output format letter; main() copies it from argv[1][1]. */
char out_mode;
/* Option flags indexed by ASCII letter; main() sets an entry to 1 for each letter following the two mode letters. */
int options[128];
/* Table consulted by read_translate() and write_translate(); main() fills it from the file named by argv[2]. */
ULL translation[256];

/* Named views of individual option flags; the uses described below are those in main(). */
/* 'L': an input value 13 is replaced by 10, and a 10 immediately following it is dropped. */
#define conv_lf options['L']
/* 'c': a 13 is written before every 10 that is output. */
#define conv_cr options['c']
/* 'b': the first input value must be 0xFEFF (otherwise exit status 1) and is not forwarded. */
#define bom_in options['b']
/* 'B': 0xFEFF is written before any converted data. */
#define bom_out options['B']
/* 't': multi-byte translation table entries are read little-endian instead of big-endian. */
#define trans_le options['t']

byte getb(void) {
  int x=fgetc(stdin);
  if(x==EOF) exit(0);
  return x;
}

/*
  Sign-extend x from bit position y: if bit y of x is set, every bit
  above it is set as well; otherwise x is returned unchanged.
  y must be in the range 0..63.

  Declared static: a plain C99/C11 "inline" definition does not emit an
  external symbol, so calls that are not inlined would fail to link.
  All arithmetic is unsigned to avoid undefined signed shifts.
*/
static inline ULL sign_extend(ULL x,int y) {
  ULL sign=1ULL<<y;
  return (x&sign)?(x|~(sign-1)):x;
}

/* Decode one value stored as a single raw octet. */
ULL read_8bit_raw(void) {
  ULL value=getb();
  return value;
}

/* Decode a 16-bit value stored low octet first. */
ULL read_16bit_le_raw(void) {
  ULL lo=getb();
  ULL hi=getb();
  return lo|(hi<<8);
}

/* Decode a 16-bit value stored high octet first. */
ULL read_16bit_be_raw(void) {
  ULL hi=getb();
  ULL lo=getb();
  return (hi<<8)|lo;
}

ULL read_32bit_le_raw(void) {
  ULL x=getb();
  x|=getb()<<8;
  x|=getb()<<16;
  return x|(getb()<<24);
}

/*
  Decode a 32-bit value stored high octet first.
  The accumulator is a ULL, so no shift is performed on a promoted int;
  a leading octet >= 0x80 therefore no longer sign-extends into the
  upper 32 bits of the result.
*/
ULL read_32bit_be_raw(void) {
  ULL x=0;
  int i;
  for(i=0;i<4;i++) x=(x<<8)|getb();
  return x;
}

/* Decode a 64-bit value stored low octet first (eight octets are consumed). */
ULL read_64bit_le_raw(void) {
  ULL acc=0;
  int shift;
  for(shift=0;shift<64;shift+=8) {
    ULL octet=getb();
    acc|=octet<<shift;
  }
  return acc;
}

/*
  Decode a 64-bit value stored high octet first.
  All eight octets are consumed; the earlier version stopped after
  seven, dropping the least significant octet and leaving the input
  stream misaligned for the next value.
*/
ULL read_64bit_be_raw(void) {
  ULL x=0;
  int i;
  for(i=0;i<8;i++) x=(x<<8)|getb();
  return x;
}

/*
  Decode one UTF-8 style sequence.
  The lead octet selects how many continuation octets follow (1 to 6,
  with 0xFE and 0xFF both treated as 7-octet leads) and how many of its
  own low bits are payload. Each continuation octet contributes its low
  six bits; continuation octets are not validated. An octet matching no
  lead pattern is returned as-is.
*/
ULL read_utf8(void) {
  ULL cp=getb();
  int extra;
  if((cp&0xE0)==0xC0) {
    cp&=0x1F;
    extra=1;
  } else if((cp&0xF0)==0xE0) {
    cp&=0x0F;
    extra=2;
  } else if((cp&0xF8)==0xF0) {
    cp&=0x07;
    extra=3;
  } else if((cp&0xFC)==0xF8) {
    cp&=0x03;
    extra=4;
  } else if((cp&0xFE)==0xFC) {
    cp&=0x01;
    extra=5;
  } else if(cp==0xFE || cp==0xFF) {
    cp&=0x01;
    extra=6;
  } else {
    return cp;
  }
  while(extra-->0) cp=(cp<<6)|(getb()&0x3F);
  return cp;
}

/*
  Decode a variable-length quantity: groups of seven bits, most
  significant group first, with bit 7 set on every octet except the
  last.
  The accumulator is shifted before the final group is merged as well;
  previously the last seven bits were ORed over the preceding group, so
  every multi-octet value decoded incorrectly (e.g. 0x81 0x00 gave 1
  instead of 128).
*/
ULL read_vlq8(void) {
  byte x=getb();
  ULL r=0;
  while(x&0x80) {
    r=(r<<7)|(x&0x7F);
    x=getb();
  }
  return (r<<7)|x;
}

ULL read_leb128(void) {
  byte x=getb();
  int i=0;
  ULL r=0;
  while(x&0x80) {
    r|=(x&0x7F)<<(7*(i++));
    x=getb();
  }
  return r|(x<<(7*i));
}

ULL read_utf16_le(void) {
  ULL r=getb()<<16;
  r|=getb();
  if(r>=0xD800 && r<0xDC00) {
    int x=getb()<<16;
    x|=getb();
    return (((r&0x3FF)<<10)|(x&0x3FF))+0x10000ULL;
  } else {
    return r;
  }
}

ULL read_utf16_be(void) {
  ULL r=getb();
  r|=getb()<<16;
  if(r>=0xD800 && r<0xDC00) {
    int x=getb();
    x|=getb()<<16;
    return (((r&0x3FF)<<10)|(x&0x3FF))+0x10000ULL;
  } else {
    return r;
  }
}

ULL read_translate(void) {
  return translation[getb()];
}

ULL read_messagepack(void) {
  byte x;
  ULL s;
  float f;
  double d;
  for(;;) {
    x=getb();
    switch(x) {
      case 0x00 ... 0x7F: return x;
      case 0x80 ... 0x9F: continue;
      case 0xA0 ... 0xBF:
        while(x-->0xA0) putchar(getb());
        continue;
      case 0xC0: continue;
      case 0xC2: return 0;
      case 0xC3: return -1LL;
      case 0xCA:
        *(short*)&f=read_16bit_be_raw(); // Not completely portable
        return (ULL)f;
      case 0xCB:
        *(int*)&d=read_32bit_be_raw(); // Not completely portable
        return (ULL)f;
      case 0xCC: return read_8bit_raw();
      case 0xCD: return read_16bit_be_raw();
      case 0xCE: return read_32bit_be_raw();
      case 0xCF: return read_64bit_be_raw();
      case 0xD0: return sign_extend(read_8bit_raw(),7);
      case 0xD1: return sign_extend(read_16bit_be_raw(),15);
      case 0xD2: return sign_extend(read_32bit_be_raw(),31);
      case 0xD3: return read_64bit_be_raw();
      case 0xDA:
        s=read_16bit_be_raw();
        while(s--) putchar(getb());
        continue;
      case 0xDB:
        s=read_32bit_be_raw();
        while(s--) putchar(getb());
        continue;
      case 0xDC: read_16bit_be_raw(); continue;
      case 0xDD: read_32bit_be_raw(); continue;
      case 0xDE: read_16bit_be_raw(); continue;
      case 0xDF: read_32bit_be_raw(); continue;
      case 0xE0 ... 0xFF: return x|~31LL;
      default: exit(1);
    }
  }
}

/*
  Read two hexadecimal digits, skipping any character that compares
  <= ' ' before each digit, and return the octet they spell
  (first digit is the high nibble). Digits are not validated.
*/
ULL read_hex(void) {
  char hi,lo;
  int hi_val,lo_val;
  do {
    hi=getb();
  } while(hi<=' ');
  do {
    lo=getb();
  } while(lo<=' ');
  // letters 'A'.. and 'a'.. have low nibble 1..6, so adding 9 yields 10..15
  hi_val=(hi&15)+(hi>='A'?9:0);
  lo_val=(lo&15)+(lo>='A'?9:0);
  return (hi_val<<4)|lo_val;
}

/* Emit x as a single octet on standard output. */
void write_8bit_raw(ULL x) {
  int octet=x;
  putchar(octet);
}

/* Emit x as two octets on standard output, low octet first. */
void write_16bit_le_raw(ULL x) {
  int lo=x&255;
  int hi=x>>8;
  putchar(lo);
  putchar(hi);
}

/* Emit x as two octets on standard output, high octet first. */
void write_16bit_be_raw(ULL x) {
  int hi=x>>8;
  int lo=x&255;
  putchar(hi);
  putchar(lo);
}

/* Emit the low 32 bits of x as four octets, least significant first. */
void write_32bit_le_raw(ULL x) {
  int shift;
  for(shift=0;shift<32;shift+=8) {
    putchar((x>>shift)&255);
  }
}

/* Emit the low 32 bits of x as four octets, most significant first. */
void write_32bit_be_raw(ULL x) {
  int shift;
  for(shift=24;shift>=0;shift-=8) {
    putchar((x>>shift)&255);
  }
}

/* Emit x as eight octets, least significant first. */
void write_64bit_le_raw(ULL x) {
  int shift;
  for(shift=0;shift<64;shift+=8) {
    putchar((x>>shift)&255);
  }
}

/* Emit x as eight octets, most significant first. */
void write_64bit_be_raw(ULL x) {
  int shift;
  for(shift=56;shift>=0;shift-=8) {
    putchar((x>>shift)&255);
  }
}

/*
  Encode x as a UTF-8 style sequence of one to seven octets.
  When out_mode is '0', the value 0 is written as the two-octet form
  C0 80 instead of a single zero octet.
  Values of 2^36 or more cannot be represented and terminate the
  program with status 1.
*/
void write_utf8(ULL x) {
  ULL lead=0; // marker bits of the first octet
  int tail=0; // number of continuation octets
  if(out_mode=='0' && !x) {
    putchar(0xC0);
    putchar(0x80);
    return;
  }
  if(x<0x80ULL) {
    putchar(x);
    return;
  }
  if(x<0x800ULL) {
    lead=0xC0;
    tail=1;
  } else if(x<0x10000ULL) {
    lead=0xE0;
    tail=2;
  } else if(x<0x200000ULL) {
    lead=0xF0;
    tail=3;
  } else if(x<0x4000000ULL) {
    lead=0xF8;
    tail=4;
  } else if(x<0x80000000ULL) {
    lead=0xFC;
    tail=5;
  } else if(x<0x1000000000ULL) {
    lead=0xFE;
    tail=6;
  } else {
    exit(1);
  }
  // first octet carries the bits above all continuation groups
  putchar(lead|(x>>(6*tail)));
  // then six bits per continuation octet, highest group first
  while(tail-->0) putchar(0x80|((x>>(6*tail))&0x3F));
}

/*
  Encode x as a variable-length quantity: groups of seven bits, most
  significant group first, bit 7 set on every octet except the last.
  A group is emitted only if x has any bit at or above that group's
  position, so leading zero groups are suppressed.
  The test is written as x>>i rather than x&-(1LL<<i) because 1LL<<63
  (and negating it) overflows a signed long long, which is undefined.
*/
void write_vlq8(ULL x) {
  int i;
  for(i=63;i;i-=7) if(x>>i) putchar(0x80|((x>>i)&0x7F));
  putchar(x&0x7F);
}

/*
  Encode x as unsigned LEB128: groups of seven bits, least significant
  group first, bit 7 set on every octet except the last.
*/
void write_leb128(ULL x) {
  for(;x>0x7FULL;x>>=7) {
    putchar(0x80|(x&0x7F));
  }
  putchar(x);
}

/*
  Encode x as little-endian UTF-16.
  Values above 0x10FFFF terminate the program with status 1; values
  with any of bits 16..20 set are written as a surrogate pair, all
  others as a single 16-bit unit.
*/
void write_utf16_le(ULL x) {
  ULL offset;
  if(x>0x10FFFFULL) exit(1);
  if(!(x&0x1F0000ULL)) {
    write_16bit_le_raw(x);
    return;
  }
  offset=x-0x10000ULL;
  write_16bit_le_raw((offset>>10)|0xD800);
  write_16bit_le_raw((offset&0x3FF)|0xDC00);
}

/*
  Encode x as big-endian UTF-16.
  Values above 0x10FFFF terminate the program with status 1; values
  with any of bits 16..20 set are written as a surrogate pair, all
  others as a single 16-bit unit.
*/
void write_utf16_be(ULL x) {
  ULL offset;
  if(x>0x10FFFFULL) exit(1);
  if(!(x&0x1F0000ULL)) {
    write_16bit_be_raw(x);
    return;
  }
  offset=x-0x10000ULL;
  write_16bit_be_raw((offset>>10)|0xD800);
  write_16bit_be_raw((offset&0x3FF)|0xDC00);
}

/*
  Reverse lookup through the translation table: write the index of
  every entry whose value equals x. Nothing is written when no entry
  matches, and several octets are written when several entries match.
*/
void write_translate(ULL x) {
  int code=0;
  while(code<256) {
    if(translation[code]==x) {
      putchar(code);
    }
    code++;
  }
}

/* Print x, converted to int, as upper-case hexadecimal padded to at least two digits. */
void write_hex(ULL x) {
  int value=(int)x;
  printf("%02X",value);
}

/*
  Reader dispatch table, indexed by the input mode letter.
  Letters not listed stay null; main() refuses to run with them.
*/
const in_func_t in_func[128]={
  // explicit null for the NUL index
  ['\0']=0,
  // fixed-width raw integers
  ['8']=read_8bit_raw,
  ['w']=read_16bit_le_raw,
  ['W']=read_16bit_be_raw,
  ['d']=read_32bit_le_raw,
  ['D']=read_32bit_be_raw,
  ['q']=read_64bit_le_raw,
  ['Q']=read_64bit_be_raw,
  // UTF encodings ('0' and '1' share one reader)
  ['0']=read_utf8,
  ['1']=read_utf8,
  ['u']=read_utf16_le,
  ['U']=read_utf16_be,
  // variable-length integers
  ['V']=read_vlq8,
  ['v']=read_leb128,
  // everything else
  ['T']=read_translate,
  ['M']=read_messagepack,
  ['4']=read_hex
};

/*
  Writer dispatch table, indexed by the output mode letter.
  Letters not listed stay null; main() refuses to run with them.
*/
const out_func_t out_func[128]={
  // explicit null for the NUL index
  ['\0']=0,
  // fixed-width raw integers
  ['8']=write_8bit_raw,
  ['w']=write_16bit_le_raw,
  ['W']=write_16bit_be_raw,
  ['d']=write_32bit_le_raw,
  ['D']=write_32bit_be_raw,
  ['q']=write_64bit_le_raw,
  ['Q']=write_64bit_be_raw,
  // UTF encodings ('0' and '1' share one writer, which checks out_mode itself)
  ['0']=write_utf8,
  ['1']=write_utf8,
  ['u']=write_utf16_le,
  ['U']=write_utf16_be,
  // variable-length integers
  ['V']=write_vlq8,
  ['v']=write_leb128,
  // everything else
  ['T']=write_translate,
  ['4']=write_hex
};

int main(int argc,char**argv) {
  int b;
  int is_lf=0;
  ULL x;
#ifdef _WIN32
  _setmode(_fileno(stdin),_O_BINARY);
  _setmode(_fileno(stdout),_O_BINARY);
#endif
  if(argc<2 || !argv[1][0] || !in_func[argv[1][0]] || !out_func[argv[1][1]]) return 1;
  in_mode=argv[1][0];
  out_mode=argv[1][1];
  for(b=2;argv[1][b];b++) options[argv[1][b]&127]=1;
  if(argc>2) {
    FILE*fp=fopen(argv[2],"rb");
    int i;
    if(!fp) return 1;
    fseek(fp,0,SEEK_END);
    b=ftell(fp)>>8;
    rewind(fp);
    for(i=0;i<255;i++) {
      translation[i]=fgetc(fp);
      if(b>1) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<8)):((translation[i]<<8)|fgetc(fp));
      if(b>2) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<16)):((translation[i]<<8)|fgetc(fp));
      if(b>3) translation[i]=trans_le?(translation[i]|(fgetc(fp)<<24)):((translation[i]<<8)|fgetc(fp));
    }
    fclose(fp);
  }
  if(bom_out) out_func[out_mode&127](0xFEFF);
  while(!feof(stdin)) {
    x=in_func[in_mode&127]();
    if(bom_in && x!=0xFEFF) return 1;
    if(is_lf && x==10) {
      is_lf=0;
      continue;
    }
    if(is_lf=(conv_lf && x==13)) x=10;
    if(conv_cr && x==10) out_func[out_mode&127](13);
    if(!bom_in) out_func[out_mode&127](x);
    bom_in=0;
  }
  return 0;
}