/*
 * This file contains the YUV output functions for YUV420 modes.
 */

#include <string.h>
#include "inttypes.h"
#include "ka_drawers.h"
#include <assert.h>

extern uint8_t ka_Y_Map[256];
extern uint8_t ka_UV_Map[256];

/**
 * Copies size bytes from src to dst, replacing every byte by its entry in map.
 *
 * Whole 32-bit words are translated first, then the last 1..3 bytes one by
 * one, so exactly size bytes are read and written.
 * Assumes src and dst are word aligned.
 *
 * @param  dst   destination buffer, at least size bytes
 * @param  src   source buffer, at least size bytes
 * @param  size  number of bytes to translate (nothing is done if <= 0)
 * @param  map   lookup table indexed by the source byte value (0..255)
 */
static void translate(uint8_t* dst, uint8_t* src, int size, uint8_t* map)
{
  uint32_t* dst32 = (uint32_t*) dst;
  uint32_t* src32 = (uint32_t*) src;
  int i;

  // Whole words only: stop as soon as fewer than 4 bytes remain, otherwise
  // the last word access would run past the end of both buffers.
  for (i = size; i >= 4; i -= 4)
  {
    // Each byte is mapped and put back at the same position in the word
    uint32_t vals = *src32++;
    uint32_t vald = map[vals >> 24];
    vald = map[(vals >> 16) & 0xff] | (vald << 8);
    vald = map[(vals >> 8) & 0xff] | (vald << 8);
    vald = map[vals & 0xff] | (vald << 8);
    *dst32++ = vald;
  }

  // Remaining 1..3 bytes
  if (i > 0)
  {
    dst = (uint8_t*) dst32;
    src = (uint8_t*) src32;

    for (; i--;)
      *dst++ = map[*src++];
  }
}

/**
 * Converts a YUV420 source into a NV12 destination, 100% zoom.
 * NV12: Plane 0 is 8bpp Y, Plane 1 is 16bpp &Cr.Cb, 2x2 sub-sampled
 *
 * Luma goes through ka_Y_Map and chroma through ka_UV_Map. Each pass of the
 * main loop emits two luma rows (a single one for the last row of an odd
 * height) and one interleaved chroma row, so (yc_height + 1) / 2 chroma rows
 * are consumed and produced.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_NV12(const ka_paint_t* paint)
{
  int height;
  uint8_t *dst_y;
  uint16_t *dst_uv;
  uint8_t *y, *u, *v;
  int uv_skip, dst_uv_skip;
  int i;

  dst_y = paint->dst;
  dst_uv = (uint16_t*) paint->dst_u;
  dst_uv_skip = (paint->dst_u_bpr >> 1) - (paint->yc_width >> 1); // cf. int16_t

  y = paint->base.y;                // luminance address
  u = paint->base.cb;               // chroma b source address
  v = paint->base.cr;               // chroma r source address
  uv_skip = paint->uv_skip;

  // Loop: copy 2 lines of luma, 1 half-line of uv chroma
  for (height = paint->yc_height; height > 0; height -= 2)
  {
    // Copy even luma row
    translate(dst_y, y, paint->yc_width, ka_Y_Map);
    y += paint->yc_bpr;
    dst_y += paint->dst_bpr;
    // Copy odd luma row (beware of uneven number of rows)
    if (height != 1)
    {
      translate(dst_y, y, paint->yc_width, ka_Y_Map);
      y += paint->yc_bpr;
      dst_y += paint->dst_bpr;
    }

    // Copy chroma, half-line of uv chroma, 8 Cb/Cr pairs per pass.
    // NOTE(review): the 32-bit stores assume dst_uv is 32-bit aligned at the
    // start of every chroma row - confirm against the caller.
    for (i = paint->yc_width >> 1; i >= 8; i -= 8)
    {
      uint32_t val1,val2,val3,val4;

      // Cb in the low byte of each 16-bit half. Values are widened to
      // uint32_t before shifting: a promoted int shifted by 24 would
      // overflow for map values >= 128.
      val1 = ka_UV_Map[*u++];
      val1 |= (uint32_t) ka_UV_Map[*u++] << 16;
      val2 = ka_UV_Map[*u++];
      val2 |= (uint32_t) ka_UV_Map[*u++] << 16;
      val3 = ka_UV_Map[*u++];
      val3 |= (uint32_t) ka_UV_Map[*u++] << 16;
      val4 = ka_UV_Map[*u++];
      val4 |= (uint32_t) ka_UV_Map[*u++] << 16;

      // Cr in the high byte of each 16-bit half
      val1 |= (uint32_t) ka_UV_Map[*v++] << 8;
      val1 |= (uint32_t) ka_UV_Map[*v++] << 24;
      val2 |= (uint32_t) ka_UV_Map[*v++] << 8;
      val2 |= (uint32_t) ka_UV_Map[*v++] << 24;
      val3 |= (uint32_t) ka_UV_Map[*v++] << 8;
      val3 |= (uint32_t) ka_UV_Map[*v++] << 24;
      val4 |= (uint32_t) ka_UV_Map[*v++] << 8;
      val4 |= (uint32_t) ka_UV_Map[*v++] << 24;

      ((uint32_t *)dst_uv)[0] = val1;
      ((uint32_t *)dst_uv)[1] = val2;
      ((uint32_t *)dst_uv)[2] = val3;
      ((uint32_t *)dst_uv)[3] = val4;
      dst_uv += 8;
    }

    // Remaining 0..7 Cb/Cr pairs
    while(i--)
    {
      *dst_uv++ = ka_UV_Map[*u++] | (ka_UV_Map[*v++] << 8);
    }

    u += uv_skip;
    v += uv_skip;
    dst_uv += dst_uv_skip;
  }

  // Consistency checks. The loop above handles one chroma row per pair of
  // luma rows, rounded up, hence (yc_height + 1) >> 1 chroma rows.
  assert(dst_y == paint->dst + paint->yc_height * paint->dst_bpr);
  assert(dst_uv == (uint16_t*) (paint->dst_u + ((paint->yc_height + 1) >> 1) * paint->dst_u_bpr));
  assert(y == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(u == paint->base.cb + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
  assert(v == paint->base.cr + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
}

/**
 * Converts a YUV420 source into a NV21 destination, 100% zoom.
 * NV21: Plane 0 is 8bpp Y, Plane 1 is 16bpp &Cb.Cr, 2x2 sub-sampled
 *
 * Luma goes through ka_Y_Map and chroma through ka_UV_Map. Each pass of the
 * main loop emits two luma rows (a single one for the last row of an odd
 * height) and one interleaved chroma row, so (yc_height + 1) / 2 chroma rows
 * are consumed and produced.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_NV21(const ka_paint_t* paint)
{
  int height;
  uint8_t *dst_y;
  uint16_t *dst_uv;
  uint8_t *y, *u, *v;
  int uv_skip, dst_uv_skip;
  int i;

  dst_y = paint->dst;
  dst_uv = (uint16_t*) paint->dst_u;
  dst_uv_skip = (paint->dst_u_bpr >> 1) - (paint->yc_width >> 1); // cf. int16_t

  y = paint->base.y;                // luminance address
  u = paint->base.cb;               // chroma b source address
  v = paint->base.cr;               // chroma r source address
  uv_skip = paint->uv_skip;

  // Loop: copy 2 lines of luma, 1 half-line of uv chroma
  for (height = paint->yc_height; height > 0; height -= 2)
  {
    // Copy even luma row
    translate(dst_y, y, paint->yc_width, ka_Y_Map);
    y += paint->yc_bpr;
    dst_y += paint->dst_bpr;
    // Copy odd luma row (beware of uneven number of rows)
    if (height != 1)
    {
      translate(dst_y, y, paint->yc_width, ka_Y_Map);
      y += paint->yc_bpr;
      dst_y += paint->dst_bpr;
    }

    // Copy chroma, half-line of uv chroma, 8 Cr/Cb pairs per pass.
    // NOTE(review): the 32-bit stores assume dst_uv is 32-bit aligned at the
    // start of every chroma row - confirm against the caller.
    for (i = paint->yc_width >> 1; i >= 8; i -= 8)
    {
      uint32_t val1,val2,val3,val4;

      // Cr in the low byte of each 16-bit half. Values are widened to
      // uint32_t before shifting: a promoted int shifted by 24 would
      // overflow for map values >= 128.
      val1 = ka_UV_Map[*v++];
      val1 |= (uint32_t) ka_UV_Map[*v++] << 16;
      val2 = ka_UV_Map[*v++];
      val2 |= (uint32_t) ka_UV_Map[*v++] << 16;
      val3 = ka_UV_Map[*v++];
      val3 |= (uint32_t) ka_UV_Map[*v++] << 16;
      val4 = ka_UV_Map[*v++];
      val4 |= (uint32_t) ka_UV_Map[*v++] << 16;

      // Cb in the high byte of each 16-bit half
      val1 |= (uint32_t) ka_UV_Map[*u++] << 8;
      val1 |= (uint32_t) ka_UV_Map[*u++] << 24;
      val2 |= (uint32_t) ka_UV_Map[*u++] << 8;
      val2 |= (uint32_t) ka_UV_Map[*u++] << 24;
      val3 |= (uint32_t) ka_UV_Map[*u++] << 8;
      val3 |= (uint32_t) ka_UV_Map[*u++] << 24;
      val4 |= (uint32_t) ka_UV_Map[*u++] << 8;
      val4 |= (uint32_t) ka_UV_Map[*u++] << 24;

      ((uint32_t *)dst_uv)[0] = val1;
      ((uint32_t *)dst_uv)[1] = val2;
      ((uint32_t *)dst_uv)[2] = val3;
      ((uint32_t *)dst_uv)[3] = val4;
      dst_uv += 8;
    }

    // Remaining 0..7 Cr/Cb pairs
    while(i--)
    {
      *dst_uv++ = ka_UV_Map[*v++] | (ka_UV_Map[*u++] << 8);
    }

    u += uv_skip;
    v += uv_skip;
    dst_uv += dst_uv_skip;
  }

  // Consistency checks. The loop above handles one chroma row per pair of
  // luma rows, rounded up, hence (yc_height + 1) >> 1 chroma rows.
  assert(dst_y == paint->dst + paint->yc_height * paint->dst_bpr);
  assert(dst_uv == (uint16_t*) (paint->dst_u + ((paint->yc_height + 1) >> 1) * paint->dst_u_bpr));
  assert(y == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(u == paint->base.cb + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
  assert(v == paint->base.cr + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
}

/**
 * Converts a YUV420 source into a UYVY destination, 100% zoom.
 * UYVY: 32bpp words of &Y1.Cr.Y0.Cb, 2x1 sub-sampled
 *
 * Rows are handled in pairs: both rows of a pair reuse the same source
 * chroma row and only differ by their luma. A last unpaired row (odd height)
 * is converted on its own with the next chroma row. Pixels are handled 4,
 * then 2 at a time; a trailing odd column is not converted.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_UYVY(const ka_paint_t* paint)
{
  int height;
  uint32_t *dst1, *dst2;
  uint8_t *y1, *y2;
  uint8_t *u, *v;
  int dst_skip, yc_skip, uv_skip;
  int i;

  dst1 = (uint32_t*) paint->dst;    // even row screen address
  dst2 = dst1 + paint->dst_bpr/4;   // odd row screen address
  dst_skip = (paint->dst_bpr + paint->dst_skip) >> 2; // cf. int32_t

  y1 = paint->base.y;               // even row luminance address
  y2 = y1 + paint->yc_bpr;          // odd row luminance address
  yc_skip = paint->yc_bpr + paint->yc_skip;

  u = paint->base.cb;               // chroma b source address
  v = paint->base.cr;               // chroma r source address
  uv_skip = paint->uv_skip;

  // 2 rows at a time as source chroma is sub-sampled vertically
  for (height = paint->yc_height >> 1; height--;)
  {
    uint32_t yuv1, yuv2;

    // 4 pixels at a time
    for (i = paint->yc_width >> 2; i--;)
    {
      // Chroma for the first block of 2x2 pixels
      yuv1 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);

      // Chroma for the second block of 2x2 pixels
      yuv2 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);

      // Top row. Luma is widened to uint32_t before shifting: a promoted
      // int shifted by 24 would overflow for map values >= 128.
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 24;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 24;

      *dst1++ = yuv1;
      *dst1++ = yuv2;

      // Bottom row: keep the chroma bytes, replace the luma bytes
      yuv1 &= 0x00ff00ff;
      yuv2 &= 0x00ff00ff;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 24;
      yuv2 |= (uint32_t) ka_Y_Map[*y2++] << 8;
      yuv2 |= (uint32_t) ka_Y_Map[*y2++] << 24;

      *dst2++ = yuv1;
      *dst2++ = yuv2;
    }

    // 2 pixels at a time
    if (paint->yc_width & 2)
    {
      // Chroma for the block of 2x2 pixels
      yuv1 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);
      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 24;
      *dst1++ = yuv1;

      // Bottom row
      yuv1 &= 0x00ff00ff;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 24;
      *dst2++ = yuv1;
    }

    // Skip the end of the rows just done plus one full row
    u += uv_skip;
    v += uv_skip;
    y1 += yc_skip;
    y2 += yc_skip;
    dst1 += dst_skip;
    dst2 += dst_skip;
  }

  // Uneven number of displayed rows ?
  if (paint->yc_height & 1)
  {
    uint32_t yuv1, yuv2;

    // 4 pixels at a time
    for (i = paint->yc_width >> 2; i--;)
    {
      // Chroma for the first block of 2 pixels
      yuv1 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);

      // Chroma for the second block of 2 pixels
      yuv2 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);

      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 24;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 24;

      *dst1++ = yuv1;
      *dst1++ = yuv2;
    }

    // 2 pixels at a time
    if (paint->yc_width & 2)
    {
      // Chroma for the block of 2 pixels
      yuv1 = ka_UV_Map[*u++] | ((uint32_t) ka_UV_Map[*v++] << 16);
      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 8;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 24;
      *dst1++ = yuv1;
    }

    // Step over the end of this single row, as the main loop does for row
    // pairs, so that the checks below also hold for an odd height.
    u += uv_skip;
    v += uv_skip;
    y1 += paint->yc_skip;
    dst1 += paint->dst_skip >> 2;
  }

  // Consistency checks. One chroma row is consumed per pair of rows, plus
  // one for a last unpaired row, hence (yc_height + 1) >> 1 chroma rows.
  assert(dst1 == (uint32_t*) (paint->dst + paint->yc_height * paint->dst_bpr));
  assert(y1 == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(u == paint->base.cb + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
  assert(v == paint->base.cr + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
}

/**
 * Converts a YUV420 source into a YUY2 destination, 100% zoom.
 * YUY2: 32bpp words of &Cr.Y1.Cb.Y0, 2x1 sub-sampled
 *
 * Rows are handled in pairs: both rows of a pair reuse the same source
 * chroma row and only differ by their luma. A last unpaired row (odd height)
 * is converted on its own with the next chroma row. Pixels are handled 4,
 * then 2 at a time; a trailing odd column is not converted.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_YUY2(const ka_paint_t* paint)
{
  int height;
  uint32_t *dst1, *dst2;
  uint8_t *y1, *y2;
  uint8_t *u, *v;
  int dst_skip, yc_skip, uv_skip;
  int i;

  dst1 = (uint32_t*) paint->dst;    // even row screen address
  dst2 = dst1 + paint->dst_bpr/4;   // odd row screen address
  dst_skip = (paint->dst_bpr + paint->dst_skip) >> 2; // cf. int32_t

  y1 = paint->base.y;               // even row luminance address
  y2 = y1 + paint->yc_bpr;          // odd row luminance address
  yc_skip = paint->yc_bpr + paint->yc_skip;

  u = paint->base.cb;               // chroma b source address
  v = paint->base.cr;               // chroma r source address
  uv_skip = paint->uv_skip;

  // 2 rows at a time as source chroma is sub-sampled vertically
  for (height = paint->yc_height >> 1; height--;)
  {
    uint32_t yuv1, yuv2;

    // 4 pixels at a time
    for (i = paint->yc_width >> 2; i--;)
    {
      // Chroma for the first block of 2x2 pixels. Cr is widened to uint32_t
      // before shifting: a promoted int shifted by 24 would overflow for
      // map values >= 128.
      yuv1 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);

      // Chroma for the second block of 2x2 pixels
      yuv2 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);

      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 16;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 16;

      *dst1++ = yuv1;
      *dst1++ = yuv2;

      // Bottom row: keep the chroma bytes, replace the luma bytes
      yuv1 &= 0xff00ff00;
      yuv2 &= 0xff00ff00;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 16;
      yuv2 |= (uint32_t) ka_Y_Map[*y2++] << 0;
      yuv2 |= (uint32_t) ka_Y_Map[*y2++] << 16;

      *dst2++ = yuv1;
      *dst2++ = yuv2;
    }

    // 2 pixels at a time
    if (paint->yc_width & 2)
    {
      // Chroma for the block of 2x2 pixels
      yuv1 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);
      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 16;
      *dst1++ = yuv1;

      // Bottom row
      yuv1 &= 0xff00ff00;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y2++] << 16;
      *dst2++ = yuv1;
    }

    // Skip the end of the rows just done plus one full row
    u += uv_skip;
    v += uv_skip;
    y1 += yc_skip;
    y2 += yc_skip;
    dst1 += dst_skip;
    dst2 += dst_skip;
  }

  // Uneven number of displayed rows ?
  if (paint->yc_height & 1)
  {
    uint32_t yuv1, yuv2;

    // 4 pixels at a time
    for (i = paint->yc_width >> 2; i--;)
    {
      // Chroma for the first block of 2 pixels
      yuv1 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);

      // Chroma for the second block of 2 pixels
      yuv2 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);

      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 16;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv2 |= (uint32_t) ka_Y_Map[*y1++] << 16;

      *dst1++ = yuv1;
      *dst1++ = yuv2;
    }

    // 2 pixels at a time
    if (paint->yc_width & 2)
    {
      // Chroma for the block of 2 pixels
      yuv1 = ((uint32_t) ka_UV_Map[*u++] << 8) | ((uint32_t) ka_UV_Map[*v++] << 24);
      // Top row
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 0;
      yuv1 |= (uint32_t) ka_Y_Map[*y1++] << 16;
      *dst1++ = yuv1;
    }

    // Step over the end of this single row, as the main loop does for row
    // pairs, so that the checks below also hold for an odd height.
    u += uv_skip;
    v += uv_skip;
    y1 += paint->yc_skip;
    dst1 += paint->dst_skip >> 2;
  }

  // Consistency checks. One chroma row is consumed per pair of rows, plus
  // one for a last unpaired row, hence (yc_height + 1) >> 1 chroma rows.
  assert(dst1 == (uint32_t*) (paint->dst + paint->yc_height * paint->dst_bpr));
  assert(y1 == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(u == paint->base.cb + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
  assert(v == paint->base.cr + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
}

/**
 * Converts a YUV420 source into a YV12 destination, 100% zoom.
 * YV12: Plane 0 is 8bpp Y, Plane 1 Cb, Plane 2 Cr, 2x2 sub-sampled
 *
 * Luma goes through ka_Y_Map to paint->dst; Cb and Cr go through ka_UV_Map
 * to paint->dst_u and paint->dst_v. Each pass of the loop copies two luma
 * rows (a single one for the last row of an odd height) and one row of each
 * chroma plane, so (yc_height + 1) / 2 chroma rows are copied.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_YV12(const ka_paint_t* paint)
{
  int height;
  uint8_t *dst_y, *dst_u, *dst_v;
  uint8_t *y, *u, *v;
  int uv_delta;

  dst_y = paint->dst;
  dst_u = paint->dst_u;
  dst_v = paint->dst_v;

  y = paint->base.y;                // luminance address
  u = paint->base.cb;               // chroma b source address
  v = paint->base.cr;               // chroma r source address
  uv_delta = (paint->yc_width >> 1) + paint->uv_skip; // 1 u/v line

  // Loop: copy 2 lines of luma, 1 half-line of u/v chroma
  for (height = paint->yc_height; height > 0; height -= 2)
  {
    // Copy even luma row
    translate(dst_y, y, paint->yc_width, ka_Y_Map);
    y += paint->yc_bpr;
    dst_y += paint->dst_bpr;
    // Copy odd luma row (beware of uneven number of rows)
    if (height != 1)
    {
      translate(dst_y, y, paint->yc_width, ka_Y_Map);
      y += paint->yc_bpr;
      dst_y += paint->dst_bpr;
    }

    // Copy chroma, half-line of u chroma, half-line of v chroma
    // source sub-sampling = destination sub-sampling
    translate(dst_u, u, paint->yc_width >> 1, ka_UV_Map);
    translate(dst_v, v, paint->yc_width >> 1, ka_UV_Map);
    dst_u += paint->dst_u_bpr;
    dst_v += paint->dst_v_bpr;
    u += uv_delta;
    v += uv_delta;
  }

  // Consistency checks. The loop above copies one chroma row per pair of
  // luma rows, rounded up, hence (yc_height + 1) >> 1 chroma rows.
  assert(dst_y == paint->dst + paint->yc_height * paint->dst_bpr);
  assert(dst_u == paint->dst_u + ((paint->yc_height + 1) >> 1) * paint->dst_u_bpr);
  assert(dst_v == paint->dst_v + ((paint->yc_height + 1) >> 1) * paint->dst_v_bpr);
  assert(y == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(u == paint->base.cb + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
  assert(v == paint->base.cr + ((paint->yc_height + 1) >> 1) * (paint->yc_bpr >> 1));
}

/**
 * Converts a YUV420 source into a YV16 destination, 100% zoom.
 * YV16: Plane 0 is 8bpp Y, Plane 1 Cb, Plane 2 Cr, 2x1 sub-sampled
 *
 * Every destination row receives one luma row and one row of each chroma
 * plane. The source chroma only advances after every second row, so each
 * source chroma row is written out for two consecutive destination rows.
 *
 * @param  paint  draw parameters
 */
void ka_drawyuv420_z1_YV16(const ka_paint_t* paint)
{
  uint8_t *out_y = paint->dst;
  uint8_t *out_u = paint->dst_u;
  uint8_t *out_v = paint->dst_v;
  uint8_t *in_y = paint->base.y;    // luminance source
  uint8_t *in_u = paint->base.cb;   // chroma b source
  uint8_t *in_v = paint->base.cr;   // chroma r source
  // Distance between two consecutive source chroma rows
  int chroma_step = (paint->yc_width >> 1) + paint->uv_skip;
  int row = 0;

  while (row < paint->yc_height)
  {
    // One full-width luma row
    translate(out_y, in_y, paint->yc_width, ka_Y_Map);
    in_y += paint->yc_bpr;
    out_y += paint->dst_bpr;

    // One half-width row for each chroma plane
    translate(out_u, in_u, paint->yc_width >> 1, ka_UV_Map);
    translate(out_v, in_v, paint->yc_width >> 1, ka_UV_Map);
    out_u += paint->dst_u_bpr;
    out_v += paint->dst_v_bpr;

    // Source chroma is sub-sampled vertically, the destination is not:
    // move to the next source chroma row only once an odd row is done
    if ((row & 1) != 0)
    {
      in_u += chroma_step;
      in_v += chroma_step;
    }

    row++;
  }

  // Consistency checks on the final pointer positions
  assert(out_y == paint->dst + paint->yc_height * paint->dst_bpr);
  assert(out_u == paint->dst_u + paint->yc_height * paint->dst_u_bpr);
  assert(out_v == paint->dst_v + paint->yc_height * paint->dst_v_bpr);
  assert(in_y == paint->base.y + paint->yc_height * paint->yc_bpr);
  assert(in_u == paint->base.cb + (paint->yc_height >> 1) * (paint->yc_bpr >> 1));
  assert(in_v == paint->base.cr + (paint->yc_height >> 1) * (paint->yc_bpr >> 1));
}
