/*
*Copyright(c)2016, Jeffrey Lee
*Allrightsreserved.
*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits.h>
#include "proto.h"
#include "zrle.h"
#include "debug.h"
#include "profiling.h"
#include "pixtrans.h"

#include <oslib/zlib.h>
#include <kernel.h>
#include <swis.h>

#define ZRLE_RAW 0
#define ZRLE_SOLID 1
#define ZRLE_PACKED(N) (N)
#define ZRLE_RLE 128
#define ZRLE_PALETTE_RLE(N) (128+(N))

struct zrle_state
{
  zlib_stream_control_block strm;
};

typedef struct {
  uint16_t x;
  uint16_t y;
  uint16_t w;
  uint16_t h;
  uint32_t enc;
  uint32_t len;
} zrle_rect;

typedef struct {
  uint32_t pixel;
  uint32_t used; /* Allocated flag / palette index */
} hash_entry;

#define ZRLE_HASH_SHIFT_LOG2 7 /* Number of bits to index the hash table */

#define ZRLE_HASH_LIMIT 127 /* Max number of entries to store in the hash table */

#define ZRLE_HASH_SIZE ((1<<ZRLE_HASH_SHIFT_LOG2) + ZRLE_HASH_LIMIT) /* When we get a hash collision, we just shift all the following entries in the table up. So if all the entries collide on the last hash, we need ZRLE_HASH_LIMIT extra entries in the table in order to store them all. (Technically only LIMIT-1 extra spaces are needed, but having a spare at the end saves a table size limit check) */

typedef struct {
  hash_entry table[ZRLE_HASH_SIZE];
  int count; /* Number of colours */
  int ones; /* Number of 1-length RLE runs */
} hash_table;

zrle_state *zrle_init(int level)
{
  zrle_state *state = (zrle_state *) malloc(sizeof(zrle_state));
  if (!state)
  {
    return NULL;
  }
  memset(state, 0, sizeof(zrle_state));
  zlib_return_code ret;
  os_error *err = xzlib_deflate_init(&state->strm, (zlib_compression_level) level, "1.1.4", sizeof(state->strm), &ret);
  if (err || ret)
  {
    free(state);
    if (err)
    {
      dprintf("zrle: zlib_deflate_init err %08x %s\n", err->errnum, err->errmess);
    }
    else
    {
      dprintf("zrle: zlib_deflate_init err %d\n", ret);
    }
    return NULL;
  }
  _swix(0x53AE0, _INR(0,1), &state->strm, 0); /* ZLib_TaskAssociate (SWI not defined correctly in oslib) */

  dprintf("zrle: Init OK\n");

  return state;
}

void zrle_destroy(zrle_state *state)
{
  if (!state)
  {
    return;
  }
  zlib_return_code ret;
  xzlib_deflate_end(&state->strm, &ret);
  free(state);
}

int zrle_num_rects(vncserv *serv, const area *rect)
{
  /* Because each ZRLE rectangle needs to have the length included in the header, we split rectangles into tile-sized chunks so that we can ensure there's always enough space in the temp buffer to store the full rectangle */
  /* We use the same strategy for zlib, even though zlib doesn't use tiles */
  int tile_w = (rect->w + (ZRLE_TILE-1)) >> ZRLE_TILE_SHIFT;
  int tile_h = (rect->h + (ZRLE_TILE-1)) >> ZRLE_TILE_SHIFT;
  return tile_w*tile_h;
}

#define Static static

Static zrle_rle_ent *calc_rle124(vncserv *serv, int x, int y, int w, int h, zrle_rle_ent *out)
{
  int bpp = serv->screen.format.bits_per_pixel;
  int x_bits = x*bpp;
  int mask = (1<<bpp)-1;
  const uint32_t * restrict row = ((uint32_t *) (((uint8_t *) serv->screen.framebuffer) + y*serv->screen.stride)) + (x_bits>>5);
  zrle_rle_ent cur = (zrle_rle_ent) { ((*row) >> (x_bits & 31)) & mask, 0};
  while (h--)
  {
    int shift = x_bits & 31;
    const uint32_t * restrict screen = row;
    uint32_t pix = *screen++;
    for(int w2=w-1;w2>=0;w2--)
    {
      uint32_t pixel = (pix>>shift) & mask;
      if (pixel == cur.pixel)
      {
        cur.count++;
      }
      else
      {
        *out++ = cur;
        cur = (zrle_rle_ent) {pixel, 1};
      }
      shift += bpp;
      if (shift == 32)
      {
        shift = 0;
        pix = *screen++;
      }
    }
    row += serv->screen.stride>>2;
  }
  /* Assume non-zero size, so must have something to write */
  *out++ = cur;
  return out;
}

Static zrle_rle_ent *calc_rle8(vncserv *serv, int x, int y, int w, int h, zrle_rle_ent *out)
{
  const uint8_t * restrict screen = ((uint8_t *) serv->screen.framebuffer) + y*serv->screen.stride + x;
  zrle_rle_ent cur = (zrle_rle_ent) {*screen, 0};
  int stride = serv->screen.stride - w;
  while (h--)
  {
    for(int w2=w-1;w2>=0;w2--)
    {
      uint32_t pixel = *screen++;
      if (pixel == cur.pixel)
      {
        cur.count++;
      }
      else
      {
        *out++ = cur;
        cur = (zrle_rle_ent) {pixel, 1};
      }
    }
    screen += stride;
  }
  /* Assume non-zero size, so must have something to write */
  *out++ = cur;
  return out;
}

Static zrle_rle_ent *calc_rle16(vncserv *serv, int x, int y, int w, int h, zrle_rle_ent *out)
{
  const uint16_t * restrict screen = ((uint16_t *) (((uint8_t *) serv->screen.framebuffer) + y*serv->screen.stride)) + x;
  zrle_rle_ent cur = (zrle_rle_ent) {*screen, 0};
  int stride = (serv->screen.stride>>1) - w;
  while (h--)
  {
    for(int w2=w-1;w2>=0;w2--)
    {
      uint32_t pixel = *screen++;
      if (pixel == cur.pixel)
      {
        cur.count++;
      }
      else
      {
        *out++ = cur;
        cur = (zrle_rle_ent) {pixel, 1};
      }
    }
    screen += stride;
  }
  /* Assume non-zero size, so must have something to write */
  *out++ = cur;
  return out;
}

Static zrle_rle_ent *calc_rle32(vncserv *serv, int x, int y, int w, int h, zrle_rle_ent *out)
{
  const uint32_t * restrict screen = ((uint32_t *) (((uint8_t *) serv->screen.framebuffer) + y*serv->screen.stride)) + x;
  zrle_rle_ent cur = (zrle_rle_ent) {*screen, 0};
  int stride = (serv->screen.stride>>2) - w;
  while (h--)
  {
    for(int w2=w-1;w2>=0;w2--)
    {
      uint32_t pixel = *screen++;
      if (pixel == cur.pixel)
      {
        cur.count++;
      }
      else
      {
        *out++ = cur;
        cur = (zrle_rle_ent) {pixel, 1};
      }
    }
    screen += stride;
  }
  /* Assume non-zero size, so must have something to write */
  *out++ = cur;
  return out;
}

Static zrle_rle_ent *calc_rle(vncserv *serv, int x, int y, int w, int h, zrle_rle_ent *out)
{
  switch(serv->screen.format.bits_per_pixel)
  {
  case 1:
  case 2:
  case 4:
    return calc_rle124(serv, x, y, w, h, out);
  case 8:
    return calc_rle8(serv, x, y, w, h, out);
  case 16:
    return calc_rle16(serv, x, y, w, h, out);
  case 32:
    return calc_rle32(serv, x, y, w, h, out);
  }
  return NULL;
}

Static zrle_rle_ent *pixtrans_rle(vncserv *serv, zrle_rle_ent *in, zrle_rle_ent *end, bool use_cpixel)
{
  int count = end - in;
  if (use_cpixel)
  {
    pixtrans_do_CPIXEL_table(serv, &in->pixel, count, sizeof(zrle_rle_ent)/sizeof(uint32_t));
  }
  else
  {
    pixtrans_do_table(serv, &in->pixel, count, sizeof(zrle_rle_ent)/sizeof(uint32_t));
  }

  /* Re-pack data if we've lost precision
     Otherwise assume no precision loss has occurred */
  const PIXEL_FORMAT *srcfmt = &serv->screen.format;
  const PIXEL_FORMAT *destfmt = &serv->clientformat;
  bool pack;
  if (srcfmt->true_colour_flag)
  {
    pack = srcfmt->depth > destfmt->depth;
  }
  else
  {
    pack = destfmt->depth < 24;
  }
  if (!pack || (end == in+1))
  {
    return end;
  }
  zrle_rle_ent *out = in;
  zrle_rle_ent cur = (zrle_rle_ent) {in->pixel, 0};
  while (in != end)
  {
    zrle_rle_ent this = *in++;
    if (this.pixel != cur.pixel)
    {
      *out++ = cur;
      cur = this;
    }
    else
    {
      cur.count += this.count;
    }
  }
  *out++ = cur;
  return out;
}

/* Basic RLE and solid support */

Static int write_rle1(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  if (in_len == 1)
  {
    *out++ = ZRLE_SOLID;
    *out++ = in->pixel;
    return out-start;
  }
  *out++ = ZRLE_RLE;
  while (in != end)
  {
    *out++ = in->pixel;
    uint32_t count = in->count-1;
    while (count >= 255)
    {
      *out++ = 255;
      count -= 255;
    }
    *out++ = count;
    in++;
  }
  return out-start;
}

Static int write_rle2(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  if (in_len == 1)
  {
    *out++ = ZRLE_SOLID;
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    return out-start;
  }
  *out++ = ZRLE_RLE;
  while (in != end)
  {
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    uint32_t count = in->count-1;
    while (count >= 255)
    {
      *out++ = 255;
      count -= 255;
    }
    *out++ = count;
    in++;
  }
  return out-start;
}

Static int write_rle3(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  if (in_len == 1)
  {
    *out++ = ZRLE_SOLID;
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    *out++ = in->pixel>>16;
    return out-start;
  }
  *out++ = ZRLE_RLE;
  while (in != end)
  {
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    *out++ = in->pixel>>16;
    uint32_t count = in->count-1;
    while (count >= 255)
    {
      *out++ = 255;
      count -= 255;
    }
    *out++ = count;
    in++;
  }
  return out-start;
}

Static int write_rle4(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  if (in_len == 1)
  {
    *out++ = ZRLE_SOLID;
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    *out++ = in->pixel>>16;
    *out++ = in->pixel>>24;
    return out-start;
  }
  *out++ = ZRLE_RLE;
  while (in != end)
  {
    *out++ = in->pixel;
    *out++ = in->pixel>>8;
    *out++ = in->pixel>>16;
    *out++ = in->pixel>>24;
    uint32_t count = in->count-1;
    while (count >= 255)
    {
      *out++ = 255;
      count -= 255;
    }
    *out++ = count;
    in++;
  }
  return out-start;
}

/* 'raw RLE' functions, i.e. when we abandon RLE and generate a raw zrle tile */

Static int write_raw_rle1(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  *out++ = ZRLE_RAW;
  while (in != end)
  {
    uint32_t count = in->count;
    uint32_t pixel = in->pixel;
    while (count--)
    {
      *out++ = pixel;
    }
    in++;
  }
  return out-start;
}

Static int write_raw_rle2(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  *out++ = ZRLE_RAW;
  uint16_t * restrict out16 = (uint16_t *) out;
  while (in != end)
  {
    uint32_t count = in->count;
    uint32_t pixel = in->pixel;
    while (count--)
    {
      *out16++ = pixel;
    }
    in++;
  }
  return ((uint8_t *) out16)-start;
}

Static int write_raw_rle3(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  *out++ = ZRLE_RAW;
  while (in != end)
  {
    uint32_t count = in->count;
    uint32_t pixel = in->pixel;
    while (count--)
    {
      *out++ = pixel;
      *out++ = pixel>>8;
      *out++ = pixel>>16;
    }
    in++;
  }
  return out-start;
}

Static int write_raw_rle4(const zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len)
{
  const zrle_rle_ent * restrict end = in + in_len;
  uint8_t * restrict start = out;
  *out++ = ZRLE_RAW;
  uint32_t * restrict out32 = (uint32_t *) out;
  while (in != end)
  {
    uint32_t count = in->count;
    uint32_t pixel = in->pixel;
    while (count--)
    {
      *out32++ = pixel;
    }
    in++;
  }
  return ((uint8_t *) out32)-start;
}

/* Common palette functions */

Static uint8_t *write_palette_table1(uint8_t * restrict out, hash_table * restrict tbl)
{
  int count = tbl->count;
  int idx = 0;
  for(hash_entry * restrict t = tbl->table;;t++)
  {
    if (t->used)
    {
      t->used = idx++; /* Fill in palette index */
      *out++ = t->pixel;
      if (!--count)
      {
        return out;
      }
    }
  }
}

Static uint8_t *write_palette_table2(uint8_t * restrict out, hash_table * restrict tbl)
{
  int count = tbl->count;
  int idx = 0;
  uint16_t * restrict out16 = (uint16_t *) out;
  for(hash_entry * restrict t = tbl->table;;t++)
  {
    if (t->used)
    {
      t->used = idx++; /* Fill in palette index */
      *out16++ = t->pixel;
      if (!--count)
      {
        return (uint8_t *) out16;
      }
    }
  }
}

Static uint8_t *write_palette_table3(uint8_t * restrict out, hash_table * restrict tbl)
{
  int count = tbl->count;
  int idx = 0;
  for(hash_entry * restrict t = tbl->table;;t++)
  {
    if (t->used)
    {
      t->used = idx++; /* Fill in palette index */
      *out++ = t->pixel;
      *out++ = t->pixel>>8;
      *out++ = t->pixel>>16;
      if (!--count)
      {
        return out;
      }
    }
  }
}

Static uint8_t *write_palette_table4(uint8_t * restrict out, hash_table * restrict tbl)
{
  int count = tbl->count;
  int idx = 0;
  uint32_t * restrict out32 = (uint32_t *) out;
  for(hash_entry * restrict t = tbl->table;;t++)
  {
    if (t->used)
    {
      t->used = idx++; /* Fill in palette index */
      *out32++ = t->pixel;
      if (!--count)
      {
        return (uint8_t *) out32;
      }
    }
  }
}

Static int hash_find(uint32_t pixel, const hash_table * restrict tbl, int hash_shift)
{
  uint32_t idx = pixel>>hash_shift;
  while (tbl->table[idx].pixel != pixel)
  {
    idx++;
  }
  return tbl->table[idx].used;
}

/* RLE palette functions */

Static uint8_t *write_palette_rle_common(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  const zrle_rle_ent * restrict end = in + in_len;
  while (in != end)
  {
    int val = hash_find(in->pixel, tbl, hash_shift);
    uint32_t count = in->count-1;
    if (!count)
    {
      *out++ = val;
    }
    else
    {
      *out++ = val | 128;
      while (count >= 255)
      {
        *out++ = 255;
        count -= 255;
      }
      *out++ = count;
    }
    in++;
  }
  return out;
}

Static int write_palette_rle1(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PALETTE_RLE(tbl->count);
  out = write_palette_table1(out, tbl);
  return write_palette_rle_common(in,out,in_len,tbl,hash_shift) - start;
}

Static int write_palette_rle2(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PALETTE_RLE(tbl->count);
  out = write_palette_table2(out, tbl);
  return write_palette_rle_common(in,out,in_len,tbl,hash_shift) - start;
}

Static int write_palette_rle3(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PALETTE_RLE(tbl->count);
  out = write_palette_table3(out, tbl);
  return write_palette_rle_common(in,out,in_len,tbl,hash_shift) - start;
}

Static int write_palette_rle4(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PALETTE_RLE(tbl->count);
  out = write_palette_table4(out, tbl);
  return write_palette_rle_common(in,out,in_len,tbl,hash_shift) - start;
}

/* Packed palette functions */

Static uint8_t *write_packed_common(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  const zrle_rle_ent * restrict end = in + in_len;
  int shift = 8;
  int byte = 0;
  int bits = (tbl->count == 2 ? 1 : (tbl->count <= 4 ? 2 : 4));
  while (in != end)
  {
    int val = hash_find(in->pixel, tbl, hash_shift);
    uint32_t count = in->count;
    in++;
    while (count--)
    {
      shift-=bits;
      byte |= val << shift;
      if (!shift)
      {
        *out++ = byte;
        byte = 0;
        shift = 8;
      }
    }
  }
  return out;
}

Static int write_packed1(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PACKED(tbl->count);
  out = write_palette_table1(out, tbl);
  return write_packed_common(in, out, in_len, tbl, hash_shift) - start;
}

Static int write_packed2(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PACKED(tbl->count);
  out = write_palette_table2(out, tbl);
  return write_packed_common(in, out, in_len, tbl, hash_shift) - start;
}

Static int write_packed3(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PACKED(tbl->count);
  out = write_palette_table3(out, tbl);
  return write_packed_common(in, out, in_len, tbl, hash_shift) - start;
}

Static int write_packed4(zrle_rle_ent * restrict in, uint8_t * restrict out, int in_len, hash_table * restrict tbl, int hash_shift)
{
  uint8_t * restrict start = out;
  *out++ = ZRLE_PACKED(tbl->count);
  out = write_palette_table4(out, tbl);
  return write_packed_common(in, out, in_len, tbl, hash_shift) - start;
}

/* Core code */

Static hash_table *build_hash(zrle_rle_ent * restrict in, int in_len, int hash_shift)
{
  static hash_table tbl;
  memset(&tbl,0,sizeof(tbl));
  /* Build a sorted hash table
     Hash function is just a right shift of the pixel value */
  while(in_len--)
  {
    uint32_t pixel = in->pixel;
    if (in->count == 1)
    {
      tbl.ones++;
    }
    in++;

    uint32_t idx = pixel>>hash_shift;
    hash_entry this = tbl.table[idx];
    while (this.used && (this.pixel < pixel))
    {
      if (++idx == ZRLE_HASH_SIZE)
      {
        /* Hash table full, can't do palette optimisation */
        return NULL;
      }
      this = tbl.table[idx];
    }

    if (this.used && (this.pixel == pixel))
      continue;

    /* Insert/shuffle up */
    if (++tbl.count == ZRLE_HASH_LIMIT)
    {
      /* Too many colours */
      return NULL;
    }

    tbl.table[idx++] = (hash_entry) { pixel, 1 };

    /* Shuffle up any following entries */
    while (this.used)
    {
      hash_entry temp = tbl.table[idx];
      tbl.table[idx++] = this;
      this = temp;
    }
  }
  return &tbl;
}

Static int write_rle(zrle_rle_ent *in, uint8_t *out, int in_len, int bytesPerCPixel, int pixels, int w, PIXEL_FORMAT *format)
{
  /* Estimate minimum RLE size for this data */
  int rle_size = (bytesPerCPixel+1)*in_len;
  int raw_size = pixels*bytesPerCPixel;
  int packed_size = INT_MAX;
  int palette_rle_size = INT_MAX;
  hash_table *tbl = NULL;
  int best = ZRLE_RAW;
  if (in_len == 1)
  {
    best = ZRLE_SOLID;
  }
  else
  {
    int best_size = raw_size;
    if (rle_size < raw_size)
    {
      best = ZRLE_RLE;
      best_size = rle_size;
    }
    int hash_shift = ((bytesPerCPixel == 3) || format->big_endian_flag) ? (bytesPerCPixel<<3) : format->depth;
    hash_shift = MAX(0, hash_shift - ZRLE_HASH_SHIFT_LOG2);
    tbl = build_hash(in, in_len, hash_shift);
//    dprintf("in_len %d hash_shift %d tbl->count %d tbl->ones %d\n",in_len,hash_shift,(tbl?tbl->count:-1),(tbl?tbl->ones:-1));
    if (tbl && (tbl->count > 1)) /* Table count of 1 is possible in case where we've skipped an RLE merge - for now don't deal with it here */
    {
      if (tbl->count == 2)
      {
        /* 1BPP */
        if (!(w & 7)) /* Not dealing with row alignment atm - only allow rows which don't require padding */
        {
          packed_size = 2*bytesPerCPixel + (pixels>>3);
        }
      }
      else if (tbl->count <= 4)
      {
        /* 2BPP */
        if (!(w & 3))
        {
          packed_size = tbl->count*bytesPerCPixel + (pixels>>2);
        }
      }
      else if (tbl->count <= 16)
      {
        /* 4BPP */
        if (!(w & 1))
        {
          packed_size = tbl->count*bytesPerCPixel + (pixels>>1);
        }
      }
      if (packed_size < best_size)
      {
        best = ZRLE_PACKED(tbl->count);
        best_size = packed_size;
      }
      if (tbl->count <= ZRLE_HASH_LIMIT) /* (guaranteed?) */
      {
        palette_rle_size = tbl->count*bytesPerCPixel + in_len + (in_len - tbl->ones);

        if (palette_rle_size < best_size)
        {
          best = ZRLE_PALETTE_RLE(tbl->count);
          switch(bytesPerCPixel)
          {
          case 1: return write_palette_rle1(in, out, in_len, tbl, hash_shift);
          case 2: return write_palette_rle2(in, out, in_len, tbl, hash_shift);
          case 3: return write_palette_rle3(in, out, in_len, tbl, hash_shift);
          case 4: return write_palette_rle4(in, out, in_len, tbl, hash_shift);
          }
        }
      }
      if (best_size == packed_size)
      {
        switch(bytesPerCPixel)
        {
        case 1: return write_packed1(in, out, in_len, tbl, hash_shift);
        case 2: return write_packed2(in, out, in_len, tbl, hash_shift);
        case 3: return write_packed3(in, out, in_len, tbl, hash_shift);
        case 4: return write_packed4(in, out, in_len, tbl, hash_shift);
        }
      }
    }
  }

  if (best == ZRLE_RAW)
  {
    switch(bytesPerCPixel)
    {
    case 1: return write_raw_rle1(in, out, in_len);
    case 2: return write_raw_rle2(in, out, in_len);
    case 3: return write_raw_rle3(in, out, in_len);
    case 4: return write_raw_rle4(in, out, in_len);
    }
  }
  else
  {
    /* ZRLE_SOLID or ZRLE_RLE, routine handles both */
    switch(bytesPerCPixel)
    {
    case 1: return write_rle1(in, out, in_len);
    case 2: return write_rle2(in, out, in_len);
    case 3: return write_rle3(in, out, in_len);
    case 4: return write_rle4(in, out, in_len);
    }
  }

  return 0;
}

Static os_error *xxzlib_deflate(zlib_stream_control_block *scb, zlib_flush_type flush_type, zlib_return_code *return_code)
{
  PROFILE_FUNC_BEGIN
  PROFILE_FUNC_METRIC(scb->avail_in)
  os_error *e = xzlib_deflate(scb, flush_type, return_code);
  PROFILE_FUNC_END
  return e;
}

bool zrle_send(vncserv *serv, send_rect_state *state, zrle_temp_buffer *temp)
{
  if (vncserv_get_tx_space(serv) < ZRLE_TILE_SIZE)
  {
    return false;
  }

  if (state->x == -1)
  {
    state->x = state->rect.x;
    state->y = state->rect.y;
  }

  zrle_state *zstate = serv->zrle;

  PROFILE_FUNC_BEGIN

  bool use_cpixel = PIXEL_FORMAT_is_CPIXEL(&serv->clientformat);

//char bufnam[64];
//static int bufcount;

  int max_tiles = 4;

  /* Split into blocks */
  int bh = (state->rect.y + state->rect.h) - state->y;
  if (bh > ZRLE_TILE)
  {
    bh = ZRLE_TILE;
  }
  while ((vncserv_get_tx_space(serv) >= ZRLE_TILE_SIZE) && max_tiles--)
  {
    int bw = (state->rect.x + state->rect.w) - state->x;
    if (bw > ZRLE_TILE)
    {
      bw = ZRLE_TILE;
    }

    PROFILE_FUNC_METRIC(bw*bh)

    int x = state->x;
    int y = state->y;

    /* Build RLE list */
    zrle_rle_ent *rle = (zrle_rle_ent *) temp->buf[0];
    zrle_rle_ent *rle_end = calc_rle(serv, x, y, bw, bh, rle);
    if (!rle_end)
    {
      PROFILE_FUNC_END
      dprintf("zrle: calc_rle failed\n");
      return false;
    }
//  sprintf(bufnam,"$.dbg.%d_calc",++bufcount);
//  if (bufcount < 50) _swix(OS_File,_INR(0,5),0,bufnam,0,0,rle,rle_end);
    /* Do pixel translation */
    rle_end = pixtrans_rle(serv, rle, rle_end, use_cpixel);
//  sprintf(bufnam,"$.dbg.%d_pix",bufcount);
//  if (bufcount < 50) _swix(OS_File,_INR(0,5),0,bufnam,0,0,rle,rle_end);
    /* Write out */
    uint8_t *zrle_in = temp->buf[1] + 3; /* Make data word aligned for easy raw */
    int in_len = write_rle(rle, zrle_in, rle_end-rle, (use_cpixel ? 3 : (serv->clientformat.bits_per_pixel>>3)), bw*bh, bw, &serv->clientformat);
//  sprintf(bufnam,"$.dbg.%d_rle",bufcount);
//  if (bufcount < 50) _swix(OS_File,_INR(0,5),0,bufnam,0,0,zrle_in,zrle_in+in_len);

    uint8_t *zrle_out = temp->buf[0];

    /* Compress */
    zstate->strm.next_in = (int)zrle_in;
    zstate->strm.avail_in = in_len;
    zstate->strm.next_out = (int)zrle_out + sizeof(zrle_rect);
    zstate->strm.avail_out = ZRLE_BUFFER_SIZE - sizeof(zrle_rect);
    zlib_return_code ret;
    os_error *err = xxzlib_deflate(&zstate->strm, zlib_SYNC_FLUSH, &ret);
    if (err)
    {
      PROFILE_FUNC_END
      dprintf("zrle: Error %08x %s\n", err->errnum, err->errmess);
      return false;
    }
    if (ret < 0)
    {
      PROFILE_FUNC_END
      dprintf("zrle: Error %d\n", ret);
      return false;
    }

    uint32_t out_len = zstate->strm.next_out - ((int)zrle_out + sizeof(zrle_rect));

//    dprintf("zrle: %dx%d %02x %d\n", bw, bh, *zrle_in, in_len);
//    dprintf("zrle: %d (%d) -> %d\n", in_len, zstate->strm.avail_in, out_len);

    /* Fill in the rectangle header */
    zrle_rect *rect = (zrle_rect *) zrle_out;
    rect->x = Swap16IfLE(x);
    rect->y = Swap16IfLE(y);
    rect->w = Swap16IfLE(bw);
    rect->h = Swap16IfLE(bh);
    rect->enc = Swap32IfLE(ZRLE_ENCODING);
    rect->len = Swap32IfLE(out_len);

    /* Write rect header + data in one go to cut down on overhead a bit */
    vncserv_write(serv, zrle_out, out_len + sizeof(zrle_rect));

    state->x += bw;
    if (state->x == state->rect.x + state->rect.w)
    {
      state->y += bh;
      if (state->y == state->rect.y + state->rect.h)
      {
        break;
      }
      state->x = state->rect.x;
      bh = (state->rect.y + state->rect.h) - state->y;
      if (bh > ZRLE_TILE)
      {
        bh = ZRLE_TILE;
      }
    }
  }

  PROFILE_FUNC_END
  return true;
}

/* This just breaks the input into a series of ZRLE-sized tiles. It may be more optimal to encode bigger tiles, but for now this is sufficient. */
bool zlib_send(vncserv *serv, send_rect_state *state, zrle_temp_buffer *temp)
{
  if (vncserv_get_tx_space(serv) < ZRLE_TILE_SIZE)
  {
    return false;
  }

  if (state->x == -1)
  {
    state->x = state->rect.x;
    state->y = state->rect.y;
  }

  zrle_state *zstate = serv->zlib;

  PROFILE_FUNC_BEGIN

  uint8_t *zrle_in = temp->buf[0];
  uint8_t *zrle_out = temp->buf[1];

  int max_tiles = 4;

  /* Split into blocks */
  int bh = (state->rect.y + state->rect.h) - state->y;
  if (bh > ZRLE_TILE)
  {
    bh = ZRLE_TILE;
  }
  while ((vncserv_get_tx_space(serv) >= ZRLE_TILE_SIZE) && max_tiles--)
  {
    int bw = (state->rect.x + state->rect.w) - state->x;
    if (bw > ZRLE_TILE)
    {
      bw = ZRLE_TILE;
    }

    PROFILE_FUNC_METRIC(bw*bh)

    int x = state->x;
    int y = state->y;

    pixtrans_do_it(serv, x, y, bw, bh, zrle_in);
    int in_len = (bw*bh*serv->clientformat.bits_per_pixel)>>3;

    /* Compress */
    zstate->strm.next_in = (int)zrle_in;
    zstate->strm.avail_in = in_len;
    zstate->strm.next_out = (int)zrle_out + sizeof(zrle_rect);
    zstate->strm.avail_out = ZRLE_BUFFER_SIZE - sizeof(zrle_rect);
    zlib_return_code ret;
    os_error *err = xxzlib_deflate(&zstate->strm, zlib_SYNC_FLUSH, &ret);
    if (err)
    {
      PROFILE_FUNC_END
      dprintf("zlib: Error %08x %s\n", err->errnum, err->errmess);
      return false;
    }
    if (ret < 0)
    {
      PROFILE_FUNC_END
      dprintf("zlib: Error %d\n", ret);
      return false;
    }

    uint32_t out_len = zstate->strm.next_out - ((int)zrle_out + sizeof(zrle_rect));

//    dprintf("zlib: %d (%d) -> %d\n", in_len, zstate->strm.avail_in, out_len);

    /* Fill in the rectangle header */
    zrle_rect *rect = (zrle_rect *) zrle_out;
    rect->x = Swap16IfLE(x);
    rect->y = Swap16IfLE(y);
    rect->w = Swap16IfLE(bw);
    rect->h = Swap16IfLE(bh);
    rect->enc = Swap32IfLE(ZLIB_ENCODING);
    rect->len = Swap32IfLE(out_len);

    /* Write rect header + data in one go to cut down on overhead a bit */
    vncserv_write(serv, zrle_out, out_len + sizeof(zrle_rect));

    state->x += bw;
    if (state->x == state->rect.x + state->rect.w)
    {
      state->y += bh;
      if (state->y == state->rect.y + state->rect.h)
      {
        break;
      }
      state->x = state->rect.x;
      bh = (state->rect.y + state->rect.h) - state->y;
      if (bh > ZRLE_TILE)
      {
        bh = ZRLE_TILE;
      }
    }
  }

  PROFILE_FUNC_END
  return true;
}

