/*
 * Copyright (c) 2014, Jeffrey Lee
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/

#include "proto.h"
#include "pixtrans.h"
#include "profiling.h"

#include <string.h>
#include <kernel.h>

#include <oslib/os.h>
#include <oslib/osbyte.h>
#include <oslib/osword.h>
#include <oslib/osspriteop.h>
#include <oslib/colourtrans.h>

/* Sprite type number to use for a palettised pixel depth of 0-8 bits.
   Depths that have no exact match are rounded up to the next type. The
   value is placed in bits 27+ of a sprite mode word by the code below. */
static const uint8_t bpp_to_sprite_type[] =
{
  [0] = 1, /* 0bpp -> 2 colour */
  [1] = 1, /* 1bpp -> 2 colour */
  [2] = 2, /* 2bpp -> 4 colour */
  [3] = 3, /* 3bpp -> 16 colour */
  [4] = 3, /* 4bpp -> 16 colour */
  [5] = 4, /* 5bpp -> 256 colour */
  [6] = 4, /* 6bpp -> 256 colour */
  [7] = 4, /* 7bpp -> 256 colour */
  [8] = 4, /* 8bpp -> 256 colour */
};

/*
 * Prepare whatever lookup data is needed to translate from the screen
 * format (serv->screen.format) to the client format (serv->clientformat).
 *
 * serv->pixtrans is always reset to NULL first. It is only set to a
 * translation table when the screen is true colour and the client is
 * palettised; in that case the default palette for the client's depth is
 * also passed to vncserv_set_8bpp_palette().
 */
void pixtrans_build(vncserv *serv)
{
  serv->pixtrans = NULL;

  const PIXEL_FORMAT *srcfmt = &serv->screen.format;
  const PIXEL_FORMAT *destfmt = &serv->clientformat;
  if (!memcmp(srcfmt,destfmt,sizeof(PIXEL_FORMAT)))
  {
    /* Identical formats: pixels are copied verbatim, nothing to build */
    return;
  }

  if (destfmt->true_colour_flag)
  {
    /* No special logic required, whether the source is true colour or
       palettised */
    return;
  }

  if (!srcfmt->true_colour_flag)
  {
    /* TODO convert palette to dest format
       TODO also get translation table to facilitate do_8888 */
    return;
  }

  /* True colour screen, palettised client */

  /* The depth indexes a 9-entry table, so clamp it: a depth above 8 would
     otherwise read past the end. Deeper palettised formats are treated as
     256 colour, matching how 5-7bpp are already handled.
     NOTE(review): clientformat is presumably supplied by the remote
     client - confirm it is not validated elsewhere. */
  const unsigned int max_depth = (sizeof(bpp_to_sprite_type)/sizeof(bpp_to_sprite_type[0])) - 1;
  unsigned int depth = destfmt->depth;
  if (depth > max_depth)
  {
    depth = max_depth;
  }

  /* Use the standard RISC OS 256 colour palette.
     Zero-initialised so that a failed read (errors from the X-form call
     are ignored) yields a defined, all-black palette rather than stack
     garbage. */
  unsigned int palette[256] = { 0 };
  os_mode src_mode = (os_mode) ((5<<27)+(90<<14)+(90<<1)+1);
  os_mode dest_mode = (os_mode) ((bpp_to_sprite_type[depth]<<27)+(90<<14)+(90<<1)+1);
  xcolourtrans_read_palette((osspriteop_area *)dest_mode, (osspriteop_id)colourtrans_DEFAULT_PALETTE,
                            (os_palette *)palette, 1024, 0, NULL);
  vncserv_set_8bpp_palette(serv, palette);

  /* Get a translation table for it. Standard 32k table for compatibility.
     The first call (NULL buffer) only asks for the size; a 12 byte result
     is taken to mean the three word form whose middle word is the pointer
     to the table itself. */
  void *buf[3];
  int size=0;
  xcolourtrans_generate_table(src_mode, colourtrans_DEFAULT_PALETTE, dest_mode, (os_palette*) palette, NULL, (colourtrans_table_flags) 0, NULL, NULL, &size);
  if (size == 12)
  {
    /* Pre-clear so a failed call leaves pixtrans as NULL */
    buf[1] = 0;
    xcolourtrans_generate_table(src_mode, colourtrans_DEFAULT_PALETTE, dest_mode, (os_palette*) palette, (osspriteop_trans_tab *) buf, (colourtrans_table_flags) 0, NULL, NULL, &size);
    serv->pixtrans = buf[1];
  }
}

/*
 * Read an unaligned little-endian bit field from a byte buffer.
 *
 * buf    - start of the buffer
 * offset - position of the field's least significant bit, in bits from the
 *          start of buf (bit 0 of byte 0 is offset 0)
 * count  - width of the field in bits; values above 32 are treated as 32
 *
 * Returns the field in the low 'count' bits of the result. A count of zero
 * or less returns 0 without touching the buffer.
 */
static uint32_t readbits(const void *buf,int offset,int count)
{
  if (count <= 0)
  {
    return 0;
  }
  if (count > 32)
  {
    count = 32;
  }

  /* (1<<count)-1 is undefined for count == 32 and overflows a signed int
     at count == 31, so build the mask in unsigned arithmetic and handle
     the full width case explicitly */
  uint32_t mask = (count >= 32) ? 0xffffffffu : ((((uint32_t) 1) << count) - 1);

  const uint8_t *b = ((const uint8_t *) buf) + (offset>>3);
  offset &= 7;

  /* First (possibly partial) byte supplies 8-offset bits */
  int shift = 8-offset;
  uint32_t val = (*b++) >> offset;
  count -= shift;

  /* Keep pulling in whole bytes until the field is covered. shift stays
     below the original count here, so it never reaches 32. */
  while (count > 0)
  {
    val |= ((uint32_t) (*b++)) << shift;
    count -= 8;
    shift += 8;
  }

  return val & mask;
}

/*
 * Rescale a colour component from inwidth bits to outwidth bits.
 *
 * Widening is done by repeating the bit pattern (so an all-ones input
 * stays all-ones), then trimming any excess from the bottom. Narrowing
 * just discards the low bits.
 *
 * val      - component value, expected to fit in inwidth bits
 * inwidth  - width of val in bits
 * outwidth - desired width in bits
 *
 * Returns the rescaled value. A zero-width input has no bits to replicate
 * and gives 0.
 */
static uint32_t expandbits(uint32_t val, uint32_t inwidth, uint32_t outwidth)
{
  if (inwidth == 0)
  {
    /* Doubling a width of zero never makes progress, so the loop below
       would spin forever whenever outwidth is non-zero */
    return 0;
  }
  while (inwidth < outwidth)
  {
    /* Duplicate the current pattern above itself, doubling the width */
    val |= val<<inwidth;
    inwidth <<= 1;
  }
  if (inwidth > outwidth)
  {
    val >>= (inwidth-outwidth);
  }
  return val;
}

/* Layout used to pick apart the 32-bit entries of serv->pal8bpp when the
   screen is palettised: 8 bits each of red, green and blue held in bits
   8-15, 16-23 and 24-31 respectively, little endian. */
static const PIXEL_FORMAT palformat = {
  .true_colour_flag = 1,
  .big_endian_flag = 0,
  .bits_per_pixel = 32,
  .depth = 32,
  .red_shift = 8,
  .red_max = 0xff,
  .green_shift = 16,
  .green_max = 0xff,
  .blue_shift = 24,
  .blue_max = 0xff
};

/*
 * Translate a rectangle of the screen framebuffer into the client's pixel
 * format.
 *
 * serv   - server state; supplies the framebuffer, its stride and format,
 *          the client format, the palette (pal8bpp) and the translation
 *          table (pixtrans)
 * x, y   - top-left pixel of the rectangle within the screen
 * w, h   - size of the rectangle in pixels
 * output - receives w*h pixels, each occupying the client's
 *          bits_per_pixel (8, 16 or 32), row after row with no padding
 */
void pixtrans_do_it(vncserv *serv, int x, int y, int w, int h, uint8_t * restrict output)
{
  const PIXEL_FORMAT * restrict srcfmt = &serv->screen.format;
  const PIXEL_FORMAT * restrict destfmt = &serv->clientformat;

  PROFILE_FUNC_BEGIN
  PROFILE_FUNC_METRIC(w*h)

  /* Offset of the first pixel from the start of the framebuffer, in bits */
  int srcoffset = x*srcfmt->bits_per_pixel + ((y*serv->screen.stride)<<3);

  if (!memcmp(srcfmt,destfmt,sizeof(PIXEL_FORMAT)))
  {
    /* Identical formats: straight row-by-row copy.
       N.B. assuming >= 8bpp */
    int end = srcoffset+w*srcfmt->bits_per_pixel;
    srcoffset >>= 3; /* bits -> bytes */
    end >>= 3;
    int len = end-srcoffset;
    while(h--)
    {
      memcpy(output, ((char *)serv->screen.framebuffer)+srcoffset, len);
      srcoffset += serv->screen.stride;
      output += len;
    }
    PROFILE_FUNC_END
    return;
  }

  if (destfmt->true_colour_flag)
  {
    /* A palettised source is looked up in pal8bpp first, so from then on
       it is decoded using the palette entry layout */
    const PIXEL_FORMAT * restrict usefmt = (srcfmt->true_colour_flag?srcfmt:&palformat);

    /* Component widths, derived from the position of the top set bit of
       each maximum value */
    uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->red_max);
    uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->green_max);
    uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->blue_max);
    uint8_t dest_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->red_max);
    uint8_t dest_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->green_max);
    uint8_t dest_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->blue_max);

    /* Optimise true-colour case if we don't need to do any component expansion (e.g. just R/B swapping needed, or reducing 64K colour to 32K/4K) */
    if (srcfmt->true_colour_flag
    && (srcfmt->bits_per_pixel == destfmt->bits_per_pixel)
    && (src_red_bits >= dest_red_bits)
    && (src_green_bits >= dest_green_bits)
    && (src_blue_bits >= dest_blue_bits))
    {
      if (srcfmt->bits_per_pixel == 16)
      {
        /* To keep reg usage low, src will be shifted 16 bits to the high half of the word, then for each component shift right and mask */
        uint8_t red_shift = 16 + srcfmt->red_shift + src_red_bits - (destfmt->red_shift + dest_red_bits);
        uint8_t green_shift = 16 + srcfmt->green_shift + src_green_bits - (destfmt->green_shift + dest_green_bits);
        uint8_t blue_shift = 16 + srcfmt->blue_shift + src_blue_bits - (destfmt->blue_shift + dest_blue_bits);
        uint16_t red_mask = destfmt->red_max << destfmt->red_shift;
        uint16_t green_mask = destfmt->green_max << destfmt->green_shift;
        uint16_t blue_mask = destfmt->blue_max << destfmt->blue_shift;
        uint8_t big_endian = destfmt->big_endian_flag;

        srcoffset >>= 4; /* bits -> 16 bit pixels */
        while(h--)
        {
          const uint16_t * restrict screen = ((uint16_t *) serv->screen.framebuffer) + srcoffset;
          for(x=0; x<w; x++)
          {
            /* Grab src pixel */
            uint32_t src = *screen++;
            src <<= 16;
            /* Extract components */
            uint32_t r = (src >> red_shift) & red_mask;
            uint32_t g = (src >> green_shift) & green_mask;
            uint32_t b = (src >> blue_shift) & blue_mask;
            /* Recompose */
            uint32_t dest = r | g | b;
            /* Write out */
            if (big_endian)
            {
              dest = (dest>>8) | (dest<<8);
            }
            *output++ = dest;
            *output++ = dest>>8;
          }
          srcoffset += serv->screen.stride>>1;
        }
        PROFILE_FUNC_END
        return;
      }
      else if ((srcfmt->bits_per_pixel == 32)
            && (src_red_bits == 8)
            && (src_green_bits == 8)
            && (src_blue_bits == 8)
            && (dest_red_bits == 8)
            && (dest_green_bits == 8)
            && (dest_blue_bits == 8)
            && !((srcfmt->red_shift | srcfmt->green_shift | srcfmt->blue_shift) & 7)
            && !((destfmt->red_shift | destfmt->green_shift | destfmt->blue_shift) & 7))
      {
        /* Simple byte remapping operation: every component is a whole
           byte in both formats (all shifts are multiples of 8), so each
           output byte is just one particular byte of the source word.
           Work out, for each output byte, how far the source word must be
           shifted to bring the right byte to the bottom. */
        uint32_t src_mask = (0xffu<<srcfmt->red_shift) | (0xffu<<srcfmt->green_shift) | (0xffu<<srcfmt->blue_shift);
        /* A source byte that holds no component reads as zero once
           masked; it supplies the padding byte of the output */
        uint8_t blank_shift = (src_mask & 0xff ? (src_mask & 0xff00 ? (src_mask & 0xff0000 ? 3 : 2) : 1) : 0) << 3;
        uint8_t src_shift[4];
        for(int i=0;i<4;i++)
        {
          /* Bit position within the dest pixel value of output byte i.
             Big endian output sends the most significant byte first. */
          int dshift = (destfmt->big_endian_flag ? 3-i : i) << 3;
          if (destfmt->red_shift == dshift)
            src_shift[i] = srcfmt->red_shift;
          else if (destfmt->green_shift == dshift)
            src_shift[i] = srcfmt->green_shift;
          else if (destfmt->blue_shift == dshift)
            src_shift[i] = srcfmt->blue_shift;
          else
            src_shift[i] = blank_shift;
        }

        srcoffset >>= 5; /* bits -> 32 bit pixels */
        while(h--)
        {
          const uint32_t * restrict screen = ((uint32_t *) serv->screen.framebuffer) + srcoffset;
          for(x=0; x<w; x++)
          {
            uint32_t src = *screen++;
            src &= src_mask;
            output[0] = src >> src_shift[0];
            output[1] = src >> src_shift[1];
            output[2] = src >> src_shift[2];
            output[3] = src >> src_shift[3];
            output += 4;
          }
          srcoffset += serv->screen.stride>>2;
        }
        PROFILE_FUNC_END
        return;
      }
    }

    /* General case: decode, rescale and re-encode every pixel */
    while(h--)
    {
      int myoffset = srcoffset;
      for(x=0; x<w; x++)
      {
        /* Grab src pixel */
        uint32_t src = readbits(serv->screen.framebuffer, myoffset, srcfmt->depth);
        myoffset += srcfmt->bits_per_pixel;
        /* Palette lookup */
        if (!srcfmt->true_colour_flag)
        {
          src = serv->pal8bpp[src];
        }
        /* Decompose to RGB
           Assume src is little endian */
        uint32_t r,g,b;
        r = (src >> usefmt->red_shift) & usefmt->red_max;
        g = (src >> usefmt->green_shift) & usefmt->green_max;
        b = (src >> usefmt->blue_shift) & usefmt->blue_max;
        /* Expand/shrink */
        r = expandbits(r,src_red_bits,dest_red_bits);
        g = expandbits(g,src_green_bits,dest_green_bits);
        b = expandbits(b,src_blue_bits,dest_blue_bits);
        /* Recompose */
        uint32_t dest = (r<<destfmt->red_shift) | (g<<destfmt->green_shift) | (b<<destfmt->blue_shift);
        /* Write out */
        if (destfmt->bits_per_pixel == 8)
        {
          *output++ = dest;
        }
        else if (destfmt->bits_per_pixel == 16)
        {
          if (destfmt->big_endian_flag)
            dest = Swap16IfLE(dest);
          /* Dest may be unaligned... yuck */
          *output++ = dest;
          *output++ = dest>>8;
        }
        else
        {
          if (destfmt->big_endian_flag)
            dest = Swap32IfLE(dest);
          *output++ = dest;
          *output++ = dest>>8;
          *output++ = dest>>16;
          *output++ = dest>>24;
        }
      }
      srcoffset += serv->screen.stride<<3;
    }
  }
  else
  {
    if (srcfmt->true_colour_flag && serv->pixtrans)
    {
      /* True colour screen, palettised client: reduce each pixel to a 15
         bit index and look it up in the translation table */
      uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->red_max);
      uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->green_max);
      uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->blue_max);

      while(h--)
      {
        int myoffset = srcoffset;
        for(x=0; x<w; x++)
        {
          /* Grab src pixel */
          uint32_t src = readbits(serv->screen.framebuffer, myoffset, srcfmt->depth);
          myoffset += srcfmt->bits_per_pixel;

          /* Convert to 32K */

          /* Decompose to RGB
             Assume src is little endian */
          uint32_t r,g,b;
          r = (src >> srcfmt->red_shift) & srcfmt->red_max;
          g = (src >> srcfmt->green_shift) & srcfmt->green_max;
          b = (src >> srcfmt->blue_shift) & srcfmt->blue_max;
          /* Expand/shrink */
          r = expandbits(r,src_red_bits,5);
          g = expandbits(g,src_green_bits,5);
          b = expandbits(b,src_blue_bits,5);
          /* Recompose */
          uint32_t dest = r | (g<<5) | (b<<10);
          /* Translate */
          dest = ((const uint8_t *)serv->pixtrans)[dest];
          /* Write out */
          if (destfmt->bits_per_pixel == 8)
          {
            *output++ = dest;
          }
          else if (destfmt->bits_per_pixel == 16)
          {
            if (destfmt->big_endian_flag)
              dest = Swap16IfLE(dest);
            /* Dest may be unaligned... yuck */
            *output++ = dest;
            *output++ = dest>>8;
          }
          else
          {
            if (destfmt->big_endian_flag)
              dest = Swap32IfLE(dest);
            *output++ = dest;
            *output++ = dest>>8;
            *output++ = dest>>16;
            *output++ = dest>>24;
          }
        }
        srcoffset += serv->screen.stride<<3;
      }
    }
    else
    {
      /* Palettised client with no translation table: pass the source
         pixel value through unchanged */
      while(h--)
      {
        int myoffset = srcoffset;
        for(x=0; x<w; x++)
        {
          /* Grab src pixel */
          uint32_t src = readbits(serv->screen.framebuffer, myoffset, srcfmt->depth);
          myoffset += srcfmt->bits_per_pixel;
          /* 'Translate' */
          uint32_t dest = src;
          /* Write out */
          if (destfmt->bits_per_pixel == 8)
          {
            *output++ = dest;
          }
          else if (destfmt->bits_per_pixel == 16)
          {
            if (destfmt->big_endian_flag)
              dest = Swap16IfLE(dest);
            /* Dest may be unaligned... yuck */
            *output++ = dest;
            *output++ = dest>>8;
          }
          else
          {
            if (destfmt->big_endian_flag)
              dest = Swap32IfLE(dest);
            *output++ = dest;
            *output++ = dest>>8;
            *output++ = dest>>16;
            *output++ = dest>>24;
          }
        }
        srcoffset += serv->screen.stride<<3;
      }
    }
  }
  PROFILE_FUNC_END
}

/*
 * Convert a single RISC OS palette entry to the client's pixel format.
 *
 * serv - server state; supplies the client format and, for palettised
 *        clients, the palette (pal8bpp) to match against
 * col  - palette entry with red in bits 8-15, green in bits 16-23 and
 *        blue in bits 24-31
 *
 * Returns the pixel value for the client, byte swapped if the client is
 * big endian and uses 16 or 32 bits per pixel.
 */
uint32_t pixtrans_do_8888(vncserv *serv, uint32_t col)
{
  /* Convert RO palette entry to dest colour format */
  const PIXEL_FORMAT *destfmt = &serv->clientformat;

  if (destfmt->true_colour_flag)
  {
    uint32_t r,g,b;

    /* Extract src components, expand to 16bpp for simplicity */
    r = (col>>8) & 0xff;
    g = (col>>16) & 0xff;
    b = (col>>24) & 0xff;

    r |= r<<8;
    g |= g<<8;
    b |= b<<8;

    /* Reduce to target width */
    uint8_t dest_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->red_max);
    uint8_t dest_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->green_max);
    uint8_t dest_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->blue_max);

    r >>= 16-dest_red_bits;
    g >>= 16-dest_green_bits;
    b >>= 16-dest_blue_bits;

    /* Recompose */
    col = (r<<destfmt->red_shift) | (g<<destfmt->green_shift) | (b<<destfmt->blue_shift);
  }
  else
  {
    /* The depth indexes a 9-entry table, so clamp it: a depth above 8
       would otherwise read past the end. Deeper palettised formats are
       treated as 256 colour, matching how 5-7bpp are already handled. */
    const unsigned int max_depth = (sizeof(bpp_to_sprite_type)/sizeof(bpp_to_sprite_type[0])) - 1;
    unsigned int depth = destfmt->depth;
    if (depth > max_depth)
    {
      depth = max_depth;
    }

    /* Find closest colour. Just rely on the OS for now.
       Errors are ignored; col is only overwritten if the call stores a
       result. */
    xcolourtrans_return_colour_number_for_mode((os_colour) col,(os_mode) ((bpp_to_sprite_type[depth]<<27)+(90<<14)+(90<<1)+1),(os_palette *) serv->pal8bpp,(os_colour_number*) &col);
  }

  /* Endian swap */
  if (destfmt->big_endian_flag)
  {
    if (destfmt->bits_per_pixel == 16)
    {
      col = Swap16IfLE(col);
    }
    else if (destfmt->bits_per_pixel == 32)
    {
      col = Swap32IfLE(col);
    }
  }

  return col;
}

/*
 * Translate a rectangle of the screen framebuffer into 3 byte "compact"
 * pixels in the client's true colour format.
 *
 * The full client pixel is built as a 32 bit value and then whichever end
 * byte carries no colour bits is left out.
 * NOTE(review): this only makes sense for a 32bpp true colour client whose
 * components fit in three bytes - presumably callers check that; confirm.
 *
 * serv   - server state; supplies the framebuffer, its stride and format,
 *          the client format and the palette (pal8bpp)
 * x, y   - top-left pixel of the rectangle within the screen
 * w, h   - size of the rectangle in pixels
 * output - receives w*h pixels of 3 bytes each, row after row with no
 *          padding
 */
void pixtrans_do_CPIXEL(vncserv *serv, int x, int y, int w, int h, uint8_t * restrict output)
{
  const PIXEL_FORMAT * restrict srcfmt = &serv->screen.format;
  const PIXEL_FORMAT * restrict destfmt = &serv->clientformat;

  PROFILE_FUNC_BEGIN
  PROFILE_FUNC_METRIC(w*h)

  /* Offset of the first pixel from the start of the framebuffer, in bits */
  int srcoffset = x*srcfmt->bits_per_pixel + ((y*serv->screen.stride)<<3);

  /* A palettised source is looked up in pal8bpp first, so from then on it
     is decoded using the palette entry layout */
  const PIXEL_FORMAT * restrict usefmt = (srcfmt->true_colour_flag?srcfmt:&palformat);

  uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->red_max);
  uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->green_max);
  uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->blue_max);
  uint8_t dest_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->red_max);
  uint8_t dest_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->green_max);
  uint8_t dest_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->blue_max);

  /* If the high byte is unused, drop it. This is the recommended behaviour
     when dealing with bits_per_pixel of 32 and depth of <= 16. */
  uint32_t full = (((uint32_t) destfmt->red_max) << destfmt->red_shift) | (((uint32_t) destfmt->green_max) << destfmt->green_shift) | (((uint32_t) destfmt->blue_max) << destfmt->blue_shift);
  bool drop_high = ((full & 0xff000000) == 0);

  /* Optimise for simple byte remapping operation, similar to standard code */
  if (srcfmt->true_colour_flag
  && (srcfmt->bits_per_pixel == destfmt->bits_per_pixel)
  && (src_red_bits >= dest_red_bits)
  && (src_green_bits >= dest_green_bits)
  && (src_blue_bits >= dest_blue_bits))
  {
    if ((srcfmt->bits_per_pixel == 32)
     && (src_red_bits == 8)
     && (src_green_bits == 8)
     && (src_blue_bits == 8)
     && (dest_red_bits == 8)
     && (dest_green_bits == 8)
     && (dest_blue_bits == 8)
     && !((srcfmt->red_shift | srcfmt->green_shift | srcfmt->blue_shift) & 7)
     && !((destfmt->red_shift | destfmt->green_shift | destfmt->blue_shift) & 7))
    {
      /* Every component is a whole byte in both formats (all shifts are
         multiples of 8), so each output byte is just one particular byte
         of the source word. Work out, for each of the three output bytes,
         how far the source word must be shifted to bring the right byte
         to the bottom. */
      uint32_t src_mask = (0xffu<<srcfmt->red_shift) | (0xffu<<srcfmt->green_shift) | (0xffu<<srcfmt->blue_shift);
      /* A source byte that holds no component reads as zero once masked;
         used if a transmitted dest byte has no component in it */
      uint8_t blank_shift = (src_mask & 0xff ? (src_mask & 0xff00 ? (src_mask & 0xff0000 ? 3 : 2) : 1) : 0) << 3;
      /* Lowest byte of the dest pixel value that gets transmitted */
      int low_byte = drop_high ? 0 : 1;
      uint8_t src_shift[3];
      for(int i=0;i<3;i++)
      {
        /* Bit position within the dest pixel value of output byte i.
           Big endian output sends the most significant byte first. */
        int dshift = (low_byte + (destfmt->big_endian_flag ? 2-i : i)) << 3;
        if (destfmt->red_shift == dshift)
          src_shift[i] = srcfmt->red_shift;
        else if (destfmt->green_shift == dshift)
          src_shift[i] = srcfmt->green_shift;
        else if (destfmt->blue_shift == dshift)
          src_shift[i] = srcfmt->blue_shift;
        else
          src_shift[i] = blank_shift;
      }

      srcoffset >>= 5; /* bits -> 32 bit pixels */
      while(h--)
      {
        const uint32_t * restrict screen = ((uint32_t *) serv->screen.framebuffer) + srcoffset;
        for(x=0; x<w; x++)
        {
          uint32_t src = *screen++;
          src &= src_mask;
          output[0] = src >> src_shift[0];
          output[1] = src >> src_shift[1];
          output[2] = src >> src_shift[2];
          output += 3;
        }
        srcoffset += serv->screen.stride>>2;
      }
      PROFILE_FUNC_END
      return;
    }
  }

  /* Flip the flag if big-endian output: after the byte swap below the
     unused byte sits at the opposite end of the word */
  if (destfmt->big_endian_flag)
  {
    drop_high = !drop_high;
  }

  /* General case: decode, rescale and re-encode every pixel */
  while(h--)
  {
    int myoffset = srcoffset;
    for(x=0; x<w; x++)
    {
      /* Grab src pixel */
      uint32_t src = readbits(serv->screen.framebuffer, myoffset, srcfmt->depth);
      myoffset += srcfmt->bits_per_pixel;
      /* Palette lookup */
      if (!srcfmt->true_colour_flag)
      {
        src = serv->pal8bpp[src];
      }
      /* Decompose to RGB
         Assume src is little endian */
      uint32_t r,g,b;
      r = (src >> usefmt->red_shift) & usefmt->red_max;
      g = (src >> usefmt->green_shift) & usefmt->green_max;
      b = (src >> usefmt->blue_shift) & usefmt->blue_max;
      /* Expand/shrink */
      r = expandbits(r,src_red_bits,dest_red_bits);
      g = expandbits(g,src_green_bits,dest_green_bits);
      b = expandbits(b,src_blue_bits,dest_blue_bits);
      /* Recompose */
      uint32_t dest = (r<<destfmt->red_shift) | (g<<destfmt->green_shift) | (b<<destfmt->blue_shift);
      /* Write out */
      if (destfmt->big_endian_flag)
        dest = Swap32IfLE(dest);
      if (!drop_high)
        dest = dest>>8;
      *output++ = dest;
      *output++ = dest>>8;
      *output++ = dest>>16;
    }
    srcoffset += serv->screen.stride<<3;
  }
  PROFILE_FUNC_END
}

/*
 * Translate a table of pixel values from the screen format to the client
 * format, in place.
 *
 * serv   - server state; supplies both formats, the palette (pal8bpp) and
 *          the translation table (pixtrans)
 * io     - first entry; each entry is one screen pixel value held in a
 *          32 bit word, and is overwritten with the client pixel value
 * count  - number of entries to convert
 * stride - distance between consecutive entries, in 32 bit words
 *
 * Nothing is done if the formats are identical, if count is zero, or if
 * the client is palettised and either the screen is palettised too or no
 * translation table is available.
 */
void pixtrans_do_table(vncserv *serv, uint32_t * restrict io, int count, int stride)
{
  const PIXEL_FORMAT * restrict srcfmt = &serv->screen.format;
  const PIXEL_FORMAT * restrict destfmt = &serv->clientformat;

  if (!memcmp(srcfmt,destfmt,sizeof(PIXEL_FORMAT)) || !count)
  {
    return;
  }

  PROFILE_FUNC_BEGIN
  PROFILE_FUNC_METRIC(count)

  if (destfmt->true_colour_flag)
  {
    /* A palettised source is looked up in pal8bpp first, so from then on
       it is decoded using the palette entry layout */
    const PIXEL_FORMAT * restrict usefmt = (srcfmt->true_colour_flag?srcfmt:&palformat);

    uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->red_max);
    uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->green_max);
    uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->blue_max);
    uint8_t dest_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->red_max);
    uint8_t dest_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->green_max);
    uint8_t dest_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->blue_max);

    /* Optimise true-colour case if we don't need to do any component expansion (e.g. just R/B swapping needed, or reducing 64K colour to 32K/4K) */
    if (srcfmt->true_colour_flag
    && (srcfmt->bits_per_pixel == destfmt->bits_per_pixel)
    && (src_red_bits >= dest_red_bits)
    && (src_green_bits >= dest_green_bits)
    && (src_blue_bits >= dest_blue_bits))
    {
      if (srcfmt->bits_per_pixel == 16)
      {
        /* To keep reg usage low, src will be shifted 16 bits to the high half of the word, then for each component shift right and mask */
        uint8_t red_shift = 16 + srcfmt->red_shift + src_red_bits - (destfmt->red_shift + dest_red_bits);
        uint8_t green_shift = 16 + srcfmt->green_shift + src_green_bits - (destfmt->green_shift + dest_green_bits);
        uint8_t blue_shift = 16 + srcfmt->blue_shift + src_blue_bits - (destfmt->blue_shift + dest_blue_bits);
        uint16_t red_mask = destfmt->red_max << destfmt->red_shift;
        uint16_t green_mask = destfmt->green_max << destfmt->green_shift;
        uint16_t blue_mask = destfmt->blue_max << destfmt->blue_shift;
        uint8_t big_endian = destfmt->big_endian_flag;

        while (count--)
        {
          /* Grab src pixel */
          uint32_t src = *io;
          src <<= 16;
          /* Extract components */
          uint32_t r = (src >> red_shift) & red_mask;
          uint32_t g = (src >> green_shift) & green_mask;
          uint32_t b = (src >> blue_shift) & blue_mask;
          /* Recompose */
          uint32_t dest = r | g | b;
          /* Write out */
          if (big_endian)
          {
            dest = (dest>>8) | (dest<<8);
          }
          *io = dest;
          io += stride;
        }
        PROFILE_FUNC_END
        return;
      }
      else if ((srcfmt->bits_per_pixel == 32)
            && (src_red_bits == 8)
            && (src_green_bits == 8)
            && (src_blue_bits == 8)
            && (dest_red_bits == 8)
            && (dest_green_bits == 8)
            && (dest_blue_bits == 8)
            && !((srcfmt->red_shift | srcfmt->green_shift | srcfmt->blue_shift) & 7)
            && !((destfmt->red_shift | destfmt->green_shift | destfmt->blue_shift) & 7))
      {
        /* Simple byte remapping operation: every component is a whole
           byte in both formats (all shifts are multiples of 8), so each
           output byte is just one particular byte of the source word.
           Work out, for each output byte, how far the source word must be
           shifted to bring the right byte to the bottom. */
        uint32_t src_mask = (0xffu<<srcfmt->red_shift) | (0xffu<<srcfmt->green_shift) | (0xffu<<srcfmt->blue_shift);
        /* A source byte that holds no component reads as zero once
           masked; it supplies the padding byte of the output */
        uint8_t blank_shift = (src_mask & 0xff ? (src_mask & 0xff00 ? (src_mask & 0xff0000 ? 3 : 2) : 1) : 0) << 3;
        uint8_t src_shift[4];
        for(int i=0;i<4;i++)
        {
          /* Bit position within the dest pixel value of output byte i.
             Big endian output stores the most significant byte first. */
          int dshift = (destfmt->big_endian_flag ? 3-i : i) << 3;
          if (destfmt->red_shift == dshift)
            src_shift[i] = srcfmt->red_shift;
          else if (destfmt->green_shift == dshift)
            src_shift[i] = srcfmt->green_shift;
          else if (destfmt->blue_shift == dshift)
            src_shift[i] = srcfmt->blue_shift;
          else
            src_shift[i] = blank_shift;
        }

        while (count--)
        {
          /* The whole source word is read before any byte of the entry
             is overwritten */
          uint32_t src = *io;
          src &= src_mask;
          uint8_t *output = (uint8_t *) io;
          output[0] = src >> src_shift[0];
          output[1] = src >> src_shift[1];
          output[2] = src >> src_shift[2];
          output[3] = src >> src_shift[3];
          io += stride;
        }
        PROFILE_FUNC_END
        return;
      }
    }

    /* General case: decode, rescale and re-encode every entry */
    while(count--)
    {
      /* Grab src pixel */
      uint32_t src = *io;
      /* Palette lookup */
      if (!srcfmt->true_colour_flag)
      {
        src = serv->pal8bpp[src];
      }
      /* Decompose to RGB
         Assume src is little endian */
      uint32_t r,g,b;
      r = (src >> usefmt->red_shift) & usefmt->red_max;
      g = (src >> usefmt->green_shift) & usefmt->green_max;
      b = (src >> usefmt->blue_shift) & usefmt->blue_max;
      /* Expand/shrink */
      r = expandbits(r,src_red_bits,dest_red_bits);
      g = expandbits(g,src_green_bits,dest_green_bits);
      b = expandbits(b,src_blue_bits,dest_blue_bits);
      /* Recompose */
      uint32_t dest = (r<<destfmt->red_shift) | (g<<destfmt->green_shift) | (b<<destfmt->blue_shift);
      /* Write out */
      if (destfmt->big_endian_flag)
      {
        if (destfmt->bits_per_pixel == 16)
          dest = Swap16IfLE(dest);
        else if (destfmt->bits_per_pixel == 32)
          dest = Swap32IfLE(dest);
      }
      *io = dest;
      io += stride;
    }
  }
  else
  {
    if (srcfmt->true_colour_flag && serv->pixtrans)
    {
      /* True colour screen, palettised client: reduce each entry to a 15
         bit index and look it up in the translation table */
      uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->red_max);
      uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->green_max);
      uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(srcfmt->blue_max);

      while(count--)
      {
        /* Grab src pixel */
        uint32_t src = *io;

        /* Convert to 32K */

        /* Decompose to RGB
           Assume src is little endian */
        uint32_t r,g,b;
        r = (src >> srcfmt->red_shift) & srcfmt->red_max;
        g = (src >> srcfmt->green_shift) & srcfmt->green_max;
        b = (src >> srcfmt->blue_shift) & srcfmt->blue_max;
        /* Expand/shrink */
        r = expandbits(r,src_red_bits,5);
        g = expandbits(g,src_green_bits,5);
        b = expandbits(b,src_blue_bits,5);
        /* Recompose */
        uint32_t dest = r | (g<<5) | (b<<10);
        /* Translate */
        dest = ((const uint8_t *)serv->pixtrans)[dest];
        /* Write out */
        if (destfmt->big_endian_flag)
        {
          if (destfmt->bits_per_pixel == 16)
            dest = Swap16IfLE(dest);
          else if (destfmt->bits_per_pixel == 32)
            dest = Swap32IfLE(dest);
        }
        *io = dest;
        io += stride;
      }
    }
    else
    {
      /* No conversion needed */
    }
  }
  PROFILE_FUNC_END
}

/*
 * Translate a table of pixel values from the screen format to 3 byte
 * "compact" client pixels, in place.
 *
 * Each entry ends up with the three bytes to transmit in its first three
 * bytes of memory (on a little endian host), i.e. the full 32 bit client
 * pixel with the colourless end byte removed.
 * NOTE(review): this only makes sense for a 32bpp true colour client whose
 * components fit in three bytes - presumably callers check that; confirm.
 *
 * serv   - server state; supplies both formats and the palette (pal8bpp)
 * io     - first entry; each entry is one screen pixel value held in a
 *          32 bit word, and is overwritten with the result
 * count  - number of entries to convert
 * stride - distance between consecutive entries, in 32 bit words
 */
void pixtrans_do_CPIXEL_table(vncserv *serv, uint32_t * restrict io, int count, int stride)
{
  const PIXEL_FORMAT * restrict srcfmt = &serv->screen.format;
  const PIXEL_FORMAT * restrict destfmt = &serv->clientformat;

  PROFILE_FUNC_BEGIN
  PROFILE_FUNC_METRIC(count)

  /* A palettised source is looked up in pal8bpp first, so from then on it
     is decoded using the palette entry layout */
  const PIXEL_FORMAT * restrict usefmt = (srcfmt->true_colour_flag?srcfmt:&palformat);

  uint8_t src_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->red_max);
  uint8_t src_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->green_max);
  uint8_t src_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(usefmt->blue_max);
  uint8_t dest_red_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->red_max);
  uint8_t dest_green_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->green_max);
  uint8_t dest_blue_bits = (sizeof(unsigned int)*8)-__builtin_clz(destfmt->blue_max);

  /* If the high byte is unused, drop it. This is the recommended behaviour
     when dealing with bits_per_pixel of 32 and depth of <= 16. */
  uint32_t full = (((uint32_t) destfmt->red_max) << destfmt->red_shift) | (((uint32_t) destfmt->green_max) << destfmt->green_shift) | (((uint32_t) destfmt->blue_max) << destfmt->blue_shift);
  bool drop_high = ((full & 0xff000000) == 0);

  /* Optimise for simple byte remapping operation, similar to standard code */
  if (srcfmt->true_colour_flag
  && (srcfmt->bits_per_pixel == destfmt->bits_per_pixel)
  && (src_red_bits >= dest_red_bits)
  && (src_green_bits >= dest_green_bits)
  && (src_blue_bits >= dest_blue_bits))
  {
    if ((srcfmt->bits_per_pixel == 32)
     && (src_red_bits == 8)
     && (src_green_bits == 8)
     && (src_blue_bits == 8)
     && (dest_red_bits == 8)
     && (dest_green_bits == 8)
     && (dest_blue_bits == 8)
     && !((srcfmt->red_shift | srcfmt->green_shift | srcfmt->blue_shift) & 7)
     && !((destfmt->red_shift | destfmt->green_shift | destfmt->blue_shift) & 7))
    {
      /* Every component is a whole byte in both formats (all shifts are
         multiples of 8), so each output byte is just one particular byte
         of the source word. Work out, for each of the three output bytes,
         how far the source word must be shifted to bring the right byte
         to the bottom. */
      uint32_t src_mask = (0xffu<<srcfmt->red_shift) | (0xffu<<srcfmt->green_shift) | (0xffu<<srcfmt->blue_shift);
      /* A source byte that holds no component reads as zero once masked;
         used if a transmitted dest byte has no component in it */
      uint8_t blank_shift = (src_mask & 0xff ? (src_mask & 0xff00 ? (src_mask & 0xff0000 ? 3 : 2) : 1) : 0) << 3;
      /* Lowest byte of the dest pixel value that gets transmitted */
      int low_byte = drop_high ? 0 : 1;
      uint8_t src_shift[3];
      for(int i=0;i<3;i++)
      {
        /* Bit position within the dest pixel value of output byte i.
           Big endian output stores the most significant byte first. */
        int dshift = (low_byte + (destfmt->big_endian_flag ? 2-i : i)) << 3;
        if (destfmt->red_shift == dshift)
          src_shift[i] = srcfmt->red_shift;
        else if (destfmt->green_shift == dshift)
          src_shift[i] = srcfmt->green_shift;
        else if (destfmt->blue_shift == dshift)
          src_shift[i] = srcfmt->blue_shift;
        else
          src_shift[i] = blank_shift;
      }

      while (count--)
      {
        /* The whole source word is read before any byte of the entry is
           overwritten */
        uint32_t src = *io;
        src &= src_mask;
        uint8_t *output = (uint8_t *) io;
        output[0] = src >> src_shift[0];
        output[1] = src >> src_shift[1];
        output[2] = src >> src_shift[2];
        /* Fourth byte is not part of the compact pixel */
        output[3] = 0;
        io += stride;
      }

      PROFILE_FUNC_END
      return;
    }
  }

  /* Flip the flag if big-endian output: after the byte swap below the
     unused byte sits at the opposite end of the word */
  if (destfmt->big_endian_flag)
  {
    drop_high = !drop_high;
  }

  /* General case: decode, rescale and re-encode every entry */
  while(count--)
  {
    /* Grab src pixel */
    uint32_t src = *io;
    /* Palette lookup */
    if (!srcfmt->true_colour_flag)
    {
      src = serv->pal8bpp[src];
    }
    /* Decompose to RGB
       Assume src is little endian */
    uint32_t r,g,b;
    r = (src >> usefmt->red_shift) & usefmt->red_max;
    g = (src >> usefmt->green_shift) & usefmt->green_max;
    b = (src >> usefmt->blue_shift) & usefmt->blue_max;
    /* Expand/shrink */
    r = expandbits(r,src_red_bits,dest_red_bits);
    g = expandbits(g,src_green_bits,dest_green_bits);
    b = expandbits(b,src_blue_bits,dest_blue_bits);
    /* Recompose */
    uint32_t dest = (r<<destfmt->red_shift) | (g<<destfmt->green_shift) | (b<<destfmt->blue_shift);
    /* Write out */
    if (destfmt->big_endian_flag)
      dest = Swap32IfLE(dest);
    if (!drop_high)
      dest = dest>>8;
    *io = dest;
    io += stride;
  }
  PROFILE_FUNC_END
}
