#include <stdio.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "codec.h"

// EXHAUST_SEARCH forces the encoder to check ALL of the possible
// motion-vectors; this is slow, but gives better quality
#define EXHAUST_SEARCH

// CODE_FOR_TV forces some of the high frequencies to zero
//#define CODE_FOR_TV

// if all blocks in an I-frame are coded with DCT, we can
// save 1 bit/block by not storing the DCT/motion bit
#define IFRAMES_ALWAYS_DCT

// output statistics???
#define STATISTICS

// use quantization??
#define QUANTIZING

// when NODIVIDE is defined, X/33 will be done as (X*31775)>>20 (BAD IDEA!!)
//#define NODIVIDE

// when ALLBITS is defined, all stored DCT values will used 6 bits
//#define ALLBITS

// MOTIONSEARCH defines how many blocks should be checked when doing
// motion-search - use 225 (all), 10 (fast), 1 (very fast) or 0 (none)
#define MOTIONSEARCH   225

// no. of bits to use in fixed-point math
#define ENCSHIFT              6
#define DECSHIFT              2

// various...
#define BLOCK_MOTION          0
#define BLOCK_DCT             1

#define VECTOR_SHORT          0
#define VECTOR_LONG           1

#define NONZERO_VALUE         1
#define ZERO_VALUE            0

// constants used by DCT
#define SCALE(x,bits)         (((x)+(1<<(10-bits-1))>>(10-bits)))
#define ENC_C1                SCALE(946,  ENCSHIFT)
#define ENC_C2                SCALE(724,  ENCSHIFT)
#define ENC_C3                SCALE(392,  ENCSHIFT)
#define DEC_K1                SCALE(1338, DECSHIFT)
#define DEC_K2                SCALE(783,  DECSHIFT)
#define DEC_K3                SCALE(1892, DECSHIFT)

// k1 = cos(pi/8)*sqrt(2)
// k2 = (cos(pi/8)-cos(3*pi/8))*sqrt(2)
// k3 = (cos(pi/8)+cos(3*pi/8))*sqrt(2)
// C1 = cos(pi/8)
// C3 = cos(3*pi/8)
// C2 = sqrt(0.5)


#define PLANE_Y     0
#define PLANE_U     1
#define PLANE_V     2


static int initialised = 0;

#ifdef QUANTIZING
static int quantmatrix[4][16] = { { 8, 12, 16, 20,
                                   12, 16, 20, 24,
                                   16, 20, 24, 32,
                                   20, 24, 32, 40 },
                                 {  8,  8, 12, 14,
                                    8, 12, 14, 16,
                                   12, 14, 16, 24,
                                   14, 16, 24, 24 },
                                  { 8,  8, 10, 12,
                                    8, 10, 12, 14,
                                   10, 12, 14, 16,
                                   12, 14, 16, 16 },
                                  { 8,  8,  8,  8,
                                    8,  8,  8,  8,
                                    8,  8, 10, 10,
                                    8,  8, 10, 12 }};

static quantmatrix_y[4][16], iquantmatrix_y[4][16];
static quantmatrix_uv[4][16], iquantmatrix_uv[4][16];
#endif

static int izigzag[16]  = { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7 ,10, 13, 14, 11, 15 };
static int zigzag[16] = { 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15 };

#ifdef ALLBITS
static int maxbits[16] = { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
#else
static int maxbits[16] = { 6, 6, 6, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2};
#endif

#define VECTOR_X    0
#define VECTOR_Y    1
static int vector[256][2];

static void init(void);
static void dct(int dctblock[16]);
static void idct(int dctblock[16]);
static int encode_plane(unsigned char *cur, unsigned char *prev, int bx, int by, int targetsize, unsigned char *out, int plane, unsigned char *dctage);
static int compare_blocks(unsigned char *cur, unsigned char *prev, int linelength, int perr, int berr);
static int decode_plane(unsigned char *cur, unsigned char *prev, int bx, int by, unsigned char *in, int plane);



int encode_frame(FRAME *current, FRAME *previous, int sizex, int sizey, int targetsize, unsigned char *buffer, unsigned char *dctage) {

  int bytessofar;
  unsigned char *y, *u, *v;

  if (!initialised)   init();

  if (previous) {
    y = previous->Y;
    u = previous->U;
    v = previous->V;
  } else
    y = u = v = NULL;

  bytessofar = 0;

  bytessofar += 3;            // frame-size, written after encoding
  buffer[bytessofar++] = 1;   // bit 0 set ->U and V included

  bytessofar += encode_plane(current->Y, y, sizex/4, sizey/4, targetsize*12/16,
                            buffer+bytessofar, PLANE_Y, dctage);
  targetsize = (targetsize-bytessofar)/2;
  if (targetsize < 400)  targetsize = 400;
  if (dctage)   dctage += sizex*sizey/16;
  bytessofar += encode_plane(current->U, u, sizex/8, sizey/8, targetsize,
                             buffer+bytessofar, PLANE_U, dctage);
  if (dctage)   dctage += sizex*sizey/64;
  bytessofar += encode_plane(current->V, v, sizex/8, sizey/8, targetsize,
                             buffer+bytessofar, PLANE_V, dctage);

  buffer[0] = bytessofar       & 255;
  buffer[1] = (bytessofar>>8)  & 255;
  buffer[2] = (bytessofar>>16) & 255;
  return bytessofar;
}



int decode_frame(FRAME *current, FRAME *previous, int sizex, int sizey, unsigned char *buffer, int buffersize) {

  int bytessofar, planeinfo;
  unsigned char *y, *u, *v;

  if (!initialised)   init();

  if (previous) {
    y = previous->Y;
    u = previous->U;
    v = previous->V;
  } else
    y = u = v = NULL;

  bytessofar = 3;                       // skip frame-size

  planeinfo = buffer[bytessofar++];     // ignore info about U and V presence

  bytessofar += decode_plane(current->Y, y, sizex/4, sizey/4, buffer+bytessofar, PLANE_Y);
  bytessofar += decode_plane(current->U, u, sizex/8, sizey/8, buffer+bytessofar, PLANE_U);
  bytessofar += decode_plane(current->V, v, sizex/8, sizey/8, buffer+bytessofar, PLANE_V);

  return bytessofar;
}

// --------------------------------------------------------------------------------

#define WRITEBYTE      { out[written++] = bucket &255; bucket >>= 8; bits -= 8; }
#ifdef STATISTICS
#define WRITEBITS(n,v) { bucket |= ((v) & ((1<<(n))-1))<<bits; bits += n; bitswritten += n; }
#else
#define WRITEBITS(n,v) { bucket |= ((v) & ((1<<(n))-1))<<bits; bits += n; }
#endif


int encode_plane(unsigned char *cur, unsigned char *prev, int bx, int by, int targetsize, unsigned char *out, int plane, unsigned char *dctage) {

  int i, x, y, bits, bitsperline, *matrix, *imatrix;
  int maxpixelerror, maxblockerror;
  unsigned int bucket, written;

  // quality is used to select quant-matrix - it is adjusted for each slice
  static int quality = 2<<2;     // preserved between slices/planes/frames

#ifdef STATISTICS
  int bitswritten = 0, ndct = 0, nmotion = 0, nhifreq = 0;
  int coefcount[16], coefmax[16], coefmin[16];
  for (i = 0; i < 16; i++) {
    coefcount[i] = 0;
    coefmax[i] = -10000;
    coefmin[i] = 10000;
  }
#endif

  written = 0;
  bucket = 0;
  bits = 0;

  bitsperline = (targetsize*8)/(by+1);  // by+1 is just to make sure...

  maxpixelerror = 4;         // squared
  maxblockerror = 10;        // squared

  // write plane info
  WRITEBITS(2, 0)

  for (y = 0; y < by; y++) {
    // adjust quality (allowed motion-error and quant-matrix) every 8th slice
    if ((y &7) == 0) {
      int oldq;
      oldq = quality>>2;
      if (written*8 > y*bitsperline) {
        // reduce quality
        maxpixelerror = 9;
        maxblockerror = 24;
        quality--;
        if (quality < 0)   quality = 0;
      } else {
        // increase quality
        maxpixelerror = 4;
        maxblockerror = 10;
        quality += 2;
        if (quality > 15)  quality = 15;
      }
      if (quality>>2 != oldq)  printf("QUALITY = %d\n", quality>>2);
    }

#ifdef QUANTIZING
  if (plane == PLANE_Y) {
      matrix = quantmatrix_y[quality>>2];
      imatrix = iquantmatrix_y[quality>>2];
    } else {
      matrix = quantmatrix_uv[quality>>2];
      imatrix = iquantmatrix_uv[quality>>2];
    }
#endif

    // write slice info
    WRITEBITS(2, quality>>2)
    for (x = 0; x < bx; x++) {
      int motionx, motiony, usemotion, block[16], xd, yd, non0, age;

      // extract block
      for (yd = 0; yd < 4; yd++)
        for (xd = 0; xd < 4; xd++)
          block[xd+4*yd] = cur[x*4+xd + (y*4+yd)*4*bx];

      // do forward-DCT
      dct(block);

#ifdef CODE_FOR_TV
      block[12] = block[13] = block[14] = block[15] = 0;
#endif

#ifdef STATISTICS
      for (i = 0; i < 16; i++) {
        if (block[i] > coefmax[i])  coefmax[i] = block[i];
        if (block[i] < coefmin[i])  coefmin[i] = block[i];
        if (block[i])  coefcount[i]++;
      }
#endif

      // quantize - currently always use matrix #2
      non0 = 0;
      for (i = 0; i < 16; i++) {
#ifdef QUANTIZING
        if (block[i] < 0)
          block[i] = (matrix[i]*block[i]-31)/64;
        else
          block[i] = (matrix[i]*block[i]+31)/64;
#endif
        if (block[i])  non0 |= 1<<zigzag[i];
      }
      // clamp to legal values
      if (block[0] > 63)  block[0] = 63;
      for (i = 1; i < 16; i++) {
        int max, min;
        max = (1<<(maxbits[zigzag[i]]-1))-1;
        min = -max-1;
        if (block[i] < min)
          block[i] = min;
        else if (block[i] > max)
          block[i] = max;
      }

      usemotion = 0;

      // find out how many frames ago the block was last coded using DCT
      if (dctage)
        age = *dctage;
      else
        age = 0;

      if ((prev) && (age < 8)) {
        // do motion search
        unsigned char *curblock, *prevblock;
        int v, bestsofar;

        curblock = cur + 4*x + 4*y*4*bx;
        bestsofar = 10000000;

        for (v = 0; (v < MOTIONSEARCH) && (!usemotion); v++) {
          int searchx, searchy, err;

          searchx = vector[v][VECTOR_X] + 4*x;
          searchy = vector[v][VECTOR_Y] + 4*y;
          if ((searchx >= 0) && (searchx+4 <= 4*bx) && (searchy >= 0) && (searchy+4 <= 4*by)) {
            prevblock = prev + searchx + searchy*4*bx;
            err = compare_blocks(curblock, prevblock, 4*bx, maxpixelerror, maxblockerror);
            if ( (err < maxblockerror && v != 0) || (err < maxblockerror/2)) {
#ifdef EXHAUST_SEARCH
              if (err < bestsofar) {
                motionx = searchx - 4*x;
                motiony = searchy - 4*y;
                bestsofar = err;
              }
#else
              usemotion = 1;
              motionx = searchx - 4*x;
              motiony = searchy - 4*y;
#endif
            }
          }
        }
#ifdef EXHAUST_SEARCH
        if (bestsofar < maxblockerror)    usemotion = 1;
#endif
      }

      // flush bit-bucket
      while (bits > 8)  WRITEBYTE

      // write coded data
      if (usemotion) {
        if (dctage)   *dctage += 1;
        if (motionx == 0 && motiony == 0)   *dctage += 1;
#ifdef STATISTICS
        nmotion++;
#endif
        WRITEBITS(1, BLOCK_MOTION)
        if ((motionx >= -2) && (motionx <= 1) &&
            (motiony >= -2) && (motiony <= 1)) {
          WRITEBITS(1, VECTOR_SHORT)
          WRITEBITS(2, motionx)
          WRITEBITS(2, motiony)
        } else {
          WRITEBITS(1, VECTOR_LONG)
          WRITEBITS(4, motionx)
          WRITEBITS(4, motiony)
        }

      } else {
        if (dctage)   *dctage = rand() &3;
#ifdef STATISTICS
        ndct++;
#endif

#ifdef IFRAMES_ALWAYS_DCT
        if (prev)  WRITEBITS(1, BLOCK_DCT)
#else
        WRITEBITS(1, BLOCK_DCT)
#endif

        WRITEBITS(6, block[0])
        if (block[4]) {
          WRITEBITS(1, NONZERO_VALUE)
          WRITEBITS(6, block[4])
        } else
          WRITEBITS(1, ZERO_VALUE)
        if (block[1]) {
          WRITEBITS(1, NONZERO_VALUE)
          WRITEBITS(6, block[1])
        } else
          WRITEBITS(1, ZERO_VALUE)

        while (bits > 8)  WRITEBYTE

        if (non0 & 0xffec) {
          int step;

          WRITEBITS(1, NONZERO_VALUE)
          step = 1;
          for (i = 3; i < 16; i++) {              // write list of non-0
            if (non0 & (1<<i)) {
#ifdef STATISTICS
              nhifreq++;
#endif
              if (step >= 7)   break;             // ignore very long runs
              WRITEBITS(3, step)
              WRITEBITS(maxbits[i], block[izigzag[i]])
              while (bits > 8)  WRITEBYTE
              step = 1;
            } else
              step++;
          }
          WRITEBITS(3, 0)
        } else
          WRITEBITS(1, ZERO_VALUE)
      }

      dctage++;
    }
  }

  while (bits > 0)  WRITEBYTE

#ifdef STATISTICS
  printf("FRAME CODING STATISTICS: MOTION: %d   DCT: %d (%d hi-freq)\n", nmotion, ndct, nhifreq);
  printf("DCT INFO (min,max,non-0) : %3.d,%3.d,%3.d   %3.d,%3.d,%3.d   %3.d,%3.d,%3.d  %3.d,%3.d,%3.d\n", coefmin[0], coefmax[0], coefcount[0],
                    coefmin[1], coefmax[1], coefcount[1],
                    coefmin[2], coefmax[2], coefcount[2],
                    coefmin[3], coefmax[3], coefcount[3]);
  printf("                           %3.d,%3.d,%3.d   %3.d,%3.d,%3.d   %3.d,%3.d,%3.d  %3.d,%3.d,%3.d\n", coefmin[4], coefmax[4], coefcount[4],
                    coefmin[5], coefmax[5], coefcount[5],
                    coefmin[6], coefmax[6], coefcount[6],
                    coefmin[7], coefmax[7], coefcount[7]);
  printf("                           %3.d,%3.d,%3.d   %3.d,%3.d,%3.d   %3.d,%3.d,%3.d  %3.d,%3.d,%3.d\n", coefmin[8], coefmax[8], coefcount[8],
                    coefmin[9], coefmax[9], coefcount[9],
                    coefmin[10], coefmax[10], coefcount[10],
                    coefmin[11], coefmax[11], coefcount[11]);
  printf("                           %3.d,%3.d,%3.d   %3.d,%3.d,%3.d   %3.d,%3.d,%3.d  %3.d,%3.d,%3.d\n", coefmin[12], coefmax[12], coefcount[12],
                    coefmin[13], coefmax[13], coefcount[13],
                    coefmin[14], coefmax[14], coefcount[14],
                    coefmin[15], coefmax[15], coefcount[15]);
#endif


  return written;
}



int compare_blocks(unsigned char *cur, unsigned char *prev, int linelength, int perr, int berr) {
  int bdiff, pdiff, x, y, i;

#define NO_MATCH_FOUND        0xf000000
  i = x = y = 0;
  bdiff = 0;
  while ((i < 16) && (bdiff < berr)) {
    pdiff = cur[x]-prev[x];   // diff between pixel in cur and prev frame
    pdiff = pdiff*pdiff;      // squared diff
    if (pdiff > perr)  return NO_MATCH_FOUND;
    bdiff += pdiff;           // accumulated err
    i++;
    x++;
    if (x == 4) {             // end of line in block
      x = 0;
      y++;
      cur += linelength;
      prev += linelength;
    }
  }
  if (bdiff < berr)  return bdiff;

  return NO_MATCH_FOUND;
}


// --------------------------------------------------------------------------------

void dct(int dctblock[16]) {

  int v0, v1, v2, v3, y, half, temp;

  half = 33<<(2*ENCSHIFT+1);
  for (y = 0; y < 4; y++) {
    v0 = 33*dctblock[0+4*y];
    v1 = 33*dctblock[1+4*y];
    v2 = 33*dctblock[2+4*y];
    v3 = 33*dctblock[3+4*y];
    dctblock[0+4*y] = ENC_C2*(v0+v1+v2+v3);
    dctblock[1+4*y] = ENC_C1*(v0-v3)+ENC_C3*(v1-v2);
    dctblock[2+4*y] = ENC_C2*(v0-v1-v2+v3);
    dctblock[3+4*y] = ENC_C3*(v0-v3)-ENC_C1*(v1-v2);
  }

  for (y = 0; y < 4; y++) {
    v0 = dctblock[y+0];
    v1 = dctblock[y+4];
    v2 = dctblock[y+8];
    v3 = dctblock[y+12];

    temp = ENC_C2*(v0+v1+v2+v3);
    if (temp < 0)
      temp -= half;
    else
      temp += half;
    temp >>= 2*ENCSHIFT+2;
#ifdef NODIVIDE
    dctblock[y+0] = (temp*31775)>>20;
#else
    dctblock[y+0] = temp/33;
#endif

    temp = ENC_C1*(v0-v3)+ENC_C3*(v1-v2);
    if (temp < 0)
      temp -= half;
    else
      temp += half;
    temp >>= 2*ENCSHIFT+2;
#ifdef NODIVIDE
    dctblock[y+4] = (temp*31775)>>20;
#else
    dctblock[y+4] = temp/33;
#endif

    temp = ENC_C2*(v0-v1-v2+v3);
    if (temp < 0)
      temp -= half;
    else
      temp += half;
    temp >>= 2*ENCSHIFT+2;
#ifdef NODIVIDE
    dctblock[y+8] = (temp*31775)>>20;
#else
    dctblock[y+8] = temp/33;
#endif

    temp = ENC_C3*(v0-v3)-ENC_C1*(v1-v2);
    if (temp < 0)
      temp -= half;
    else
      temp += half;
    temp >>= 2*ENCSHIFT+2;
#ifdef NODIVIDE
    dctblock[y+12] = (temp*31775)>>20;
#else
    dctblock[y+12] = temp/33;
#endif
  }
}


void idct(int dctblock[16]) {

  int y, t, v0, v1, v2, v3, half, temp;

  half = 1<<(2*DECSHIFT);
  for (y = 0; y < 4; y++) {
    v0 = dctblock[y+0];
    v1 = dctblock[y+4];
    v2 = dctblock[y+8];
    v3 = dctblock[y+12];
    t = DEC_K1*(v1+v3);
    v1 *= DEC_K3;
    v3 *= DEC_K2;
    dctblock[y+0] = ((v0+v2)<<DECSHIFT) + t - v3;
    dctblock[y+4] = ((v0-v2)<<DECSHIFT) - t + v1;
    dctblock[y+8] = ((v0-v2)<<DECSHIFT) + t - v1;
    dctblock[y+12] = ((v0+v2)<<DECSHIFT) - t + v3;
  }

  for (y = 0; y < 4; y++) {
    v0 = dctblock[0+4*y];
    v1 = dctblock[1+4*y];
    v2 = dctblock[2+4*y];
    v3 = dctblock[3+4*y];
    t = DEC_K1*(v1+v3);
    v1 *= DEC_K3;
    v3 *= DEC_K2;

    temp = ((v0+v2)<<DECSHIFT) + t - v3;
    if (temp >= 0)
      temp += half;
    else
      temp -= half;
    temp >>= 2*DECSHIFT+1;
    if (temp < 0)  temp = 0; else if (temp > 31)  temp = 31;
    dctblock[0+4*y] = temp;

    temp = ((v0-v2)<<DECSHIFT) - t + v1;
    if (temp >= 0)
      temp += half;
    else
      temp -= half;
    temp >>= 2*DECSHIFT+1;
    if (temp < 0)  temp = 0; else if (temp > 31)  temp = 31;
    dctblock[1+4*y] = temp;

    temp = ((v0-v2)<<DECSHIFT) + t - v1;
    if (temp >= 0)
      temp += half;
    else
      temp -= half;
    temp >>= 2*DECSHIFT+1;
    if (temp < 0)  temp = 0; else if (temp > 31)  temp = 31;
    dctblock[2+4*y] = temp;

    temp = ((v0+v2)<<DECSHIFT) - t + v3;
    if (temp >= 0)
      temp += half;
    else
      temp -= half;
    temp >>= 2*DECSHIFT+1;
    if (temp < 0)  temp = 0; else if (temp > 31)  temp = 31;
    dctblock[3+4*y] = temp;
  }
}

// --------------------------------------------------------------------------------

#define READBYTE       { bucket |= (in[read++])<<bits; bits += 8; }
#ifdef STATISTICS
#define FLUSHBITS(n)   { bucket >>= (n); bits -= n; bitsread+=n; }
#else
#define FLUSHBITS(n)   { bucket >>= (n); bits -= n; }
#endif

static int decode_plane(unsigned char *cur, unsigned char *prev, int bx, int by, unsigned char *in, int plane) {

  unsigned int bucket, read;
  int i, x, y, bits, *imatrix;
#ifdef STATISTICS
  int bitsread = 0;
#endif


  read = 0;
  bucket = 0;
  bits = 0;
  READBYTE

  // skip plane info
  FLUSHBITS(2)

  for (y = 0; y < by; y++) {
    int q;

    while (bits < 16)  READBYTE

    q = bucket &3;
    FLUSHBITS(2)
#ifdef QUANTIZING
    if (plane == PLANE_Y)
      imatrix = iquantmatrix_y[q];
    else
      imatrix = iquantmatrix_uv[q];
#endif

    for (x = 0; x < bx; x++) {
      int blocktype;

      while (bits < 16)  READBYTE
#ifdef IFRAMES_ALWAYS_DCT
      if (prev) {
        blocktype = bucket & 1;
        FLUSHBITS(1)
      } else
        blocktype = BLOCK_DCT;
#else
      blocktype = bucket & 1;
      FLUSHBITS(1)
#endif

      if (blocktype == BLOCK_MOTION) {
        // 1 bit vector length
        // 2 or 5 bits x/y vectors
        int motionx, motiony;
        unsigned char *prevblock;

        if ((bucket &1) == VECTOR_SHORT) {
          motionx = (bucket>>1) &3;
          motionx = (motionx<<30)>>30;
          motiony = (bucket>>3) &3;
          motiony = (motiony<<30)>>30;
          FLUSHBITS(5)
        } else {
          motionx = (bucket>>1) &15;
          motionx = (motionx<<28)>>28;
          motiony = (bucket>>5) &15;
          motiony = (motiony<<28)>>28;
          FLUSHBITS(9)
        }

        prevblock = prev + 4*x + motionx + 4*bx*(4*y + motiony);
        // copy block from previous frame
        for (i = 0; i < 4; i++) {
          cur[0+i*4*bx] = prevblock[0];
          cur[1+i*4*bx] = prevblock[1];
          cur[2+i*4*bx] = prevblock[2];
          cur[3+i*4*bx] = prevblock[3];
          prevblock += 4*bx;
        }

      } else if (blocktype == BLOCK_DCT) {
        // 6 bit DC
        // 1 bit zero-indicator
        //       6 bit value  (1,0)
        // 1 bit zero-indicator
        //       6 bit value  (0,1)
        // 1 bit end-of-data indicator
        //       list of 3 bit step (0 for end, 7 for 3 bit step) + n bit value
        int block[16], i;

        for (i = 1; i < 16; i++)   block[i] = 0;
        // get DC value
        block[0] = bucket &63;
        FLUSHBITS(6)
        // get AC value for 0,1
        if ((bucket &1) == NONZERO_VALUE) {
          block[4] = ((int)(((bucket>>1) &63)<<26))>>26;
          FLUSHBITS(7)
        } else {
          FLUSHBITS(1)
        }
        while (bits < 16)  READBYTE
        // get AC value for 1,0
        if ((bucket &1) == NONZERO_VALUE) {
          block[1] = ((int)(((bucket>>1) &63)<<26))>>26;
          FLUSHBITS(7)
        } else {
          FLUSHBITS(1)
        }
        if ((bucket &1) == NONZERO_VALUE) {
          int i = 2;
          FLUSHBITS(1)
          // more AC values
          while (bucket &7) {
            int v, n;
            // get index and value
            while (bits < 16)  READBYTE
            i += bucket &7;
            n = maxbits[i];
            v = (bucket>>3) & ((1<<n)-1);
            v <<= 32-n;
            block[izigzag[i]] = v>>(32-n);
            FLUSHBITS(3+n)
          }
          // flush end-of-AC-values marker (4 0-bits)
          FLUSHBITS(3)
        } else {
          // no more AC values
          FLUSHBITS(1)
        }

#ifdef QUANTIZING
        // de-quantize
        for (i = 0; i < 16; i++) {
          if (block[i] < 0)
            block[i] = (imatrix[i]*block[i]-31)/64;
          else
            block[i] = (imatrix[i]*block[i]+31)/64;
        }
#endif

        idct(block);

        cur[0+0*4*bx] = block[0];
        cur[1+0*4*bx] = block[1];
        cur[2+0*4*bx] = block[2];
        cur[3+0*4*bx] = block[3];
        cur[0+1*4*bx] = block[4];
        cur[1+1*4*bx] = block[5];
        cur[2+1*4*bx] = block[6];
        cur[3+1*4*bx] = block[7];
        cur[0+2*4*bx] = block[8];
        cur[1+2*4*bx] = block[9];
        cur[2+2*4*bx] = block[10];
        cur[3+2*4*bx] = block[11];
        cur[0+3*4*bx] = block[12];
        cur[1+3*4*bx] = block[13];
        cur[2+3*4*bx] = block[14];
        cur[3+3*4*bx] = block[15];
      }
      cur += 4;
    }
    cur += 3*4*bx;
  }

  // in case we read too many bytes
  while (bits >= 8)  read -= 1, bits -= 8;
  return read;
}

// --------------------------------------------------------------------------------

void init() {

  int i, q, x, y, r;

#ifdef QUANTIZING
  // build quantizations-matrices
  for (q = 0; q < 4; q++)
    for (i = 0; i < 16; i++) {
      quantmatrix_y[q][i]   = (64*64)/(8*quantmatrix[q][i]);
      iquantmatrix_y[q][i]  = 8*quantmatrix[q][i];
      quantmatrix_uv[q][i]  = (64*64)/(12*quantmatrix[q][i]);
      iquantmatrix_uv[q][i] = 12*quantmatrix[q][i];
    }
#endif

  // generate motion-vector-search-order-spiral
  vector[0][VECTOR_X] = vector[0][VECTOR_Y] = 0;
  i = 1;
  for (r = 1; r <= 7; r++) {
    x = r;
    y = -r-1;
    do {
      y++;
      vector[i][VECTOR_X] = x;
      vector[i][VECTOR_Y] = y;
      i++;
    } while (y < r);
    do {
      x--;
      vector[i][VECTOR_X] = x;
      vector[i][VECTOR_Y] = y;
      i++;
    } while (x > -r);
    do {
      y--;
      vector[i][VECTOR_X] = x;
      vector[i][VECTOR_Y] = y;
      i++;
    } while (y > -r);
    do {
      x++;
      vector[i][VECTOR_X] = x;
      vector[i][VECTOR_Y] = y;
      i++;
    } while (x < r-1);
  }

  initialised = 1;
}
