#include "ka_scalers.h"
#include "ka_mem.h"
#include <stdint.h>
#include <string.h>

#define FN(a, b) FNC(a, b) // extra expansion level: forces substitution of LDRH before it is pasted onto the name
#define FNC(a, b) a ## b

/*
 * Merge word aligned lines with 16-bit pixels with 5-6-5 bits per component(TRGB16, TBGR16)
 * given a given linear interpolation factor [0-31].
 */
static void FN(merge_lines16_565, LDRH)(uint32_t* pdst, const uint32_t* line0, const uint32_t* line1, uint32_t sc, int width)
{
	const uint32_t mask = 0x07e0f81f;
	uint32_t val0a, val0b, val1a, val1b, b1, b2;
	int x;
	const uint32_t sc2 = 32 - sc;

	for (x = width; x & 3; x--)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;
	}

	for (; x > 0; x-= 4)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;
	}
}

/*
 * Scale line with 16-bit pixels (Halfword aligned) with 5-6-5 bits per component(TRGB16, TBGR16).
 * Use linear interpolation when zooming in, no interpolation when zooming out.
 *
 * pdst   destination line, receives 'width' pixels
 * width  number of destination pixels to produce
 * psrc   start of the source line
 * x0     16.16 fixed point source position of the first destination pixel
 * x_mag  16.16 fixed point source step per destination pixel
 *        (below 1 << 16 interpolates, equal copies, above picks pixels)
 */
static void FN(scale_line16_565, LDRH)(uint16_t* pdst, uint32_t width, const uint16_t* psrc, int x0, int x_mag)
{
	// the two outer colour fields; ~mask leaves the 6-bit field in the middle
	const uint32_t mask = 0xf81f;

	if (x_mag < (1 << 16))
	{
		uint32_t px, sel0, rest0, sel1, rest1, sc, sel, rest;
		int x;

		if (width == 0)
			return;

		psrc += x0 >> 16;
		x0 &= 0xffff;

		// both interpolation end points start on the first source pixel, the
		// second one moves on each time x0 crosses a source pixel boundary
		px = *psrc++;
		sel0 = sel1 = px & mask;
		rest0 = rest1 = px & ~mask;

		// all pixels but the last: write one, then step and read ahead
		for (x = width - 1; x & 3; x--)
		{
			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		for (; x > 0; x -= 4)
		{
			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		// last pixel: written without the read ahead, no pixel would use that
		// source value and it can lie beyond the end of the source line
		sc = x0 >> 11;
		rest = (rest0 << 5) + (rest1 - rest0) * sc;
		sel = sel1 * sc + sel0 * (32 - sc);
		*pdst = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);
	}
	else if (x_mag == (1 << 16))
	{
		// one source pixel per destination pixel: plain copy
		memcpy(pdst, psrc + (x0 >> 16), 2 * width);
	}
	else
	{
		int x;

		// zooming out: pick the source pixel under each destination pixel
		for (x = width; x & 3; x--)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
		for (; x > 0; x -= 4)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 5-6-5 bits per component(TRGB16, TBGR16).
 * Use linear interpolation (on the horizontal) when zooming in, no interpolation when zooming out.
 *
 * When zooming in vertically the scaled line is kept in the work buffer
 * (scale->wb_ptr, replaced by a ka_mem_alloc() block when too small) and
 * copied to every destination row until the source row changes. If the
 * buffer cannot be obtained, every destination row is scaled on its own.
 */
void FN(ka_scale_linear_16bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* out = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy = scale->y_mag * scale->wdst_y0;
	int left;

	if (scale->y_mag < (1 << 16))
	{
		// assume increased mem usage is compensated by less computation
		int needed = 2 * scale->wdst_width;
		int buffered = 1;

		if (scale->wb_size < needed)
		{
			void* fresh = ka_mem_alloc(needed);
			if (fresh)
			{
				scale->wb_ptr = fresh;
				scale->wb_size = needed;
			}
			else
			{
				buffered = 0; // scale row by row below
			}
		}

		if (buffered)
		{
			const uint16_t* src_row = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy >> 16);
			uint16_t* cached = scale->wb_ptr;
			int frac = sy & 0xffff;
			int row = sy >> 16;

			FN(scale_line16_565, LDRH)(cached, scale->wdst_width, src_row, sx0, scale->x_mag);

			for (left = scale->wdst_height; left > 0; left--)
			{
				memcpy(out, cached, 2 * scale->wdst_width);
				out += (scale->wdst_bpr / 2);

				frac += scale->y_mag;
				if (frac < (1<<16))
					continue;

				// stepped onto the next source row
				row++;
				frac &= 0xffff;
				// NOTE(review): the line is only refreshed while the new row index is
				// below src_height - 1, so that row itself is never stepped onto here;
				// confirm this is intended
				if (row < scale->src_height - 1)
				{
					src_row += scale->src_pix_width;
					FN(scale_line16_565, LDRH)(cached, scale->wdst_width, src_row, sx0, scale->x_mag);
				}
			}
			return;
		}
	}

	// no vertical zoom in, or no work buffer: scale every row directly
	for (left = scale->wdst_height; left > 0; left--)
	{
		const uint16_t* src_row = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy >> 16);

		FN(scale_line16_565, LDRH)(out, scale->wdst_width, src_row, sx0, scale->x_mag);
		out += (scale->wdst_bpr / 2);
		sy += scale->y_mag;
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 5-6-5 bits per component(TRGB16, TBGR16).
 * Use linear interpolation (on horiz. & vert.) when zooming in, no interpolation when zooming out.
 * Precondition: source byte width is word aligned.
 *
 * When zooming in vertically two horizontally scaled lines are kept in the
 * work buffer (scale->wb_ptr, replaced by a ka_mem_alloc() block when too
 * small) and merged word by word into each destination row. If the buffer
 * cannot be obtained the work is handed to ka_scale_linear_16bpp.
 */
void FN(ka_scale_bilinear_16bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* pdst = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy0 = scale->y_mag * scale->wdst_y0;

	if (scale->y_mag >= (1 << 16))
	{
		// no vertical zoom in: every destination row comes from one source row
		for (int y = scale->wdst_height; y > 0; y--)
		{
			const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
			FN(scale_line16_565, LDRH)(pdst, scale->wdst_width, psrc, sx0, scale->x_mag);

			sy0 += scale->y_mag;
			pdst += (scale->wdst_bpr / 2);
		}
		return;
	}

	const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
	// the merge works on whole 32-bit words: skipb is 1 when the destination
	// does not start on a word boundary, skipe is 1 when one more pixel is
	// needed to make the pixel count even
	const int skipb = (int) ((((uintptr_t) scale->wdst) >> 1) & 1);
	int width = skipb + scale->wdst_width;
	const int skipe = width & 1;
	pdst -= skipb;
	width += skipe;

	// room for two lines of 'width' 16-bit pixels
	if (scale->wb_size < 4 * width)
	{
		void* ptr = ka_mem_alloc(4 * width);
		if (ptr)
		{
			scale->wb_ptr = ptr;
			scale->wb_size = 4 * width;
		}
		else
		{
			FN(ka_scale_linear_16bpp, LDRH)(scale);
			return;
		}
	}
	uint16_t* line0 = scale->wb_ptr;
	uint16_t* line1 = line0 + width;

	// both lines start out as the first source row
	FN(scale_line16_565, LDRH)(line0 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
	psrc += scale->src_pix_width;
	memcpy(line1, line0, 2*width);
	sy0 &= 0xffff;

	for (int y = scale->wdst_height; y > 0; y--)
	{
		int x;

		// padding pixels are taken from the destination, so that the word-wise
		// merge writes back what was there
		for (x = 0; x < skipb; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}
		for (x = width - skipe; x < width; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}

		FN(merge_lines16_565, LDRH)((uint32_t*) pdst, (uint32_t*) line0, (uint32_t*) line1, sy0 >> 11, width / 2);

		// last row is out: do not fetch another source row, nothing would use
		// it and it may lie beyond the source image
		if (y == 1)
			break;

		sy0 += scale->y_mag;
		if (sy0 >= (1<<16))
		{
			// next source row: the newer line becomes line0, the older is refilled
			uint16_t* linex = line0;
			line0 = line1;
			line1 = linex;
			FN(scale_line16_565, LDRH)(line1 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
			psrc += scale->src_pix_width;
			sy0 &= 0xffff;
		}

		pdst += (scale->wdst_bpr / 2);
	}
}

/*
 * Merge word aligned lines with 16-bit pixels with 5-5-5 bits per component(TRGB15, TBGR15)
 * given a given linear interpolation factor [0-31].
 */
static void FN(merge_lines16_555, LDRH)(uint32_t* pdst, const uint32_t* line0, const uint32_t* line1, uint32_t sc, int width)
{
	const uint32_t mask = 0x03e07c1f;
	uint32_t val0a, val0b, val1a, val1b, b1, b2;
	int x;
	const uint32_t sc2 = 32 - sc;

	for (x = width; x & 3; x--)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;
	}

	for (; x > 0; x-= 4)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 5;
		val0a = (val0a & ~mask) >> 5;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 5) & mask;
		*pdst++ = b1;
	}
}

/*
 * Scale line with 16-bit pixels (Halfword aligned) with 5-5-5 bits per component(TRGB15, TBGR15).
 * Use linear interpolation when zooming in, no interpolation when zooming out.
 *
 * pdst   destination line, receives 'width' pixels
 * width  number of destination pixels to produce
 * psrc   start of the source line
 * x0     16.16 fixed point source position of the first destination pixel
 * x_mag  16.16 fixed point source step per destination pixel
 *        (below 1 << 16 interpolates, equal copies, above picks pixels)
 */
static void FN(scale_line16_555, LDRH)(uint16_t* pdst, uint32_t width, const uint16_t* psrc, int x0, int x_mag)
{
	// the two outer 5-bit colour fields; ~mask leaves the middle field and bit 15
	const uint32_t mask = 0x7c1f;

	if (x_mag < (1 << 16))
	{
		uint32_t px, sel0, rest0, sel1, rest1, sc, sel, rest;
		int x;

		if (width == 0)
			return;

		psrc += x0 >> 16;
		x0 &= 0xffff;

		// both interpolation end points start on the first source pixel, the
		// second one moves on each time x0 crosses a source pixel boundary
		px = *psrc++;
		sel0 = sel1 = px & mask;
		rest0 = rest1 = px & ~mask;

		// all pixels but the last: write one, then step and read ahead
		for (x = width - 1; x & 3; x--)
		{
			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		for (; x > 0; x -= 4)
		{
			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 11;
			rest = (rest0 << 5) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (32 - sc);
			*pdst++ = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		// last pixel: written without the read ahead, no pixel would use that
		// source value and it can lie beyond the end of the source line
		sc = x0 >> 11;
		rest = (rest0 << 5) + (rest1 - rest0) * sc;
		sel = sel1 * sc + sel0 * (32 - sc);
		*pdst = ((rest >> 5) & ~mask) | ((sel >> 5) & mask);
	}
	else if (x_mag == (1 << 16))
	{
		// one source pixel per destination pixel: plain copy
		memcpy(pdst, psrc + (x0 >> 16), 2 * width);
	}
	else
	{
		int x;

		// zooming out: pick the source pixel under each destination pixel
		for (x = width; x & 3; x--)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
		for (; x > 0; x -= 4)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 5-5-5 bits per component(TRGB15, TBGR15).
 * Use linear interpolation (on the horizontal) when zooming in, no interpolation when zooming out.
 *
 * When zooming in vertically the scaled line is kept in the work buffer
 * (scale->wb_ptr, replaced by a ka_mem_alloc() block when too small) and
 * copied to every destination row until the source row changes. If the
 * buffer cannot be obtained, every destination row is scaled on its own.
 */
void FN(ka_scale_linear_15bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* out = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy = scale->y_mag * scale->wdst_y0;
	int left;

	if (scale->y_mag < (1 << 16))
	{
		// assume increased mem usage is compensated by less computation
		int needed = 2 * scale->wdst_width;
		int buffered = 1;

		if (scale->wb_size < needed)
		{
			void* fresh = ka_mem_alloc(needed);
			if (fresh)
			{
				scale->wb_ptr = fresh;
				scale->wb_size = needed;
			}
			else
			{
				buffered = 0; // scale row by row below
			}
		}

		if (buffered)
		{
			const uint16_t* src_row = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy >> 16);
			uint16_t* cached = scale->wb_ptr;
			int frac = sy & 0xffff;
			int row = sy >> 16;

			FN(scale_line16_555, LDRH)(cached, scale->wdst_width, src_row, sx0, scale->x_mag);

			for (left = scale->wdst_height; left > 0; left--)
			{
				memcpy(out, cached, 2 * scale->wdst_width);
				out += (scale->wdst_bpr / 2);

				frac += scale->y_mag;
				if (frac < (1<<16))
					continue;

				// stepped onto the next source row
				row++;
				frac &= 0xffff;
				// NOTE(review): the line is only refreshed while the new row index is
				// below src_height - 1, so that row itself is never stepped onto here;
				// confirm this is intended
				if (row < scale->src_height - 1)
				{
					src_row += scale->src_pix_width;
					FN(scale_line16_555, LDRH)(cached, scale->wdst_width, src_row, sx0, scale->x_mag);
				}
			}
			return;
		}
	}

	// no vertical zoom in, or no work buffer: scale every row directly
	for (left = scale->wdst_height; left > 0; left--)
	{
		const uint16_t* src_row = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy >> 16);

		FN(scale_line16_555, LDRH)(out, scale->wdst_width, src_row, sx0, scale->x_mag);
		out += (scale->wdst_bpr / 2);
		sy += scale->y_mag;
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 5-5-5 bits per component(TRGB15, TBGR15).
 * Use linear interpolation (on horiz. & vert.) when zooming in, no interpolation when zooming out.
 * Precondition: source byte width is word aligned.
 *
 * When zooming in vertically two horizontally scaled lines are kept in the
 * work buffer (scale->wb_ptr, replaced by a ka_mem_alloc() block when too
 * small) and merged word by word into each destination row. If the buffer
 * cannot be obtained the work is handed to ka_scale_linear_15bpp.
 */
void FN(ka_scale_bilinear_15bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* pdst = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy0 = scale->y_mag * scale->wdst_y0;

	if (scale->y_mag >= (1 << 16))
	{
		// no vertical zoom in: every destination row comes from one source row
		for (int y = scale->wdst_height; y > 0; y--)
		{
			const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
			FN(scale_line16_555, LDRH)(pdst, scale->wdst_width, psrc, sx0, scale->x_mag);

			sy0 += scale->y_mag;
			pdst += (scale->wdst_bpr / 2);
		}
		return;
	}

	const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
	// the merge works on whole 32-bit words: skipb is 1 when the destination
	// does not start on a word boundary, skipe is 1 when one more pixel is
	// needed to make the pixel count even
	const int skipb = (int) ((((uintptr_t) scale->wdst) >> 1) & 1);
	int width = skipb + scale->wdst_width;
	const int skipe = width & 1;
	pdst -= skipb;
	width += skipe;

	// room for two lines of 'width' 16-bit pixels
	if (scale->wb_size < 4 * width)
	{
		void* ptr = ka_mem_alloc(4 * width);
		if (ptr)
		{
			scale->wb_ptr = ptr;
			scale->wb_size = 4 * width;
		}
		else
		{
			FN(ka_scale_linear_15bpp, LDRH)(scale);
			return;
		}
	}
	uint16_t* line0 = scale->wb_ptr;
	uint16_t* line1 = line0 + width;

	// both lines start out as the first source row
	FN(scale_line16_555, LDRH)(line0 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
	psrc += scale->src_pix_width;
	memcpy(line1, line0, 2*width);
	sy0 &= 0xffff;

	for (int y = scale->wdst_height; y > 0; y--)
	{
		int x;

		// padding pixels are taken from the destination, so that the word-wise
		// merge writes back what was there
		for (x = 0; x < skipb; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}
		for (x = width - skipe; x < width; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}

		FN(merge_lines16_555, LDRH)((uint32_t*) pdst, (uint32_t*) line0, (uint32_t*) line1, sy0 >> 11, width / 2);

		// last row is out: do not fetch another source row, nothing would use
		// it and it may lie beyond the source image
		if (y == 1)
			break;

		sy0 += scale->y_mag;
		if (sy0 >= (1<<16))
		{
			// next source row: the newer line becomes line0, the older is refilled
			uint16_t* linex = line0;
			line0 = line1;
			line1 = linex;
			FN(scale_line16_555, LDRH)(line1 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
			psrc += scale->src_pix_width;
			sy0 &= 0xffff;
		}

		pdst += (scale->wdst_bpr / 2);
	}
}

/*
 * Merge word aligned lines with 16-bit pixels with 4-4-4 bits per component(TRGB12, TBGR12)
 * given a given linear interpolation factor [0-15].
 */
static void FN(merge_lines16_444, LDRH)(uint32_t* pdst, const uint32_t* line0, const uint32_t* line1, uint32_t sc, int width)
{
	const uint32_t mask = 0x0f0f0f0f;
	uint32_t val0a, val0b, val1a, val1b, b1, b2;
	int x;
	const uint32_t sc2 = 16 - sc;

	for (x = width; x & 3; x--)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 4;
		val0a = (val0a & ~mask) >> 4;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 4) & mask;
		*pdst++ = b1;
	}

	for (; x > 0; x-= 4)
	{
		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 4;
		val0a = (val0a & ~mask) >> 4;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 4) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 4;
		val0a = (val0a & ~mask) >> 4;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 4) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 4;
		val0a = (val0a & ~mask) >> 4;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 4) & mask;
		*pdst++ = b1;

		val1a = *line1++;
		val0a = *line0++;
		val1b = val1a & mask;
		val0b = val0a & mask;
		val1a = (val1a & ~mask) >> 4;
		val0a = (val0a & ~mask) >> 4;
		b1 = val1a * sc + val0a * sc2;
		b2 = val1b * sc + val0b * sc2;
		b1 = b1 & ~mask;
		b1 |= (b2 >> 4) & mask;
		*pdst++ = b1;
	}
}

/*
 * Scale line with 16-bit pixels (Halfword aligned) with 4-4-4 bits per component(TRGB12, TBGR12).
 * Use linear interpolation when zooming in, no interpolation when zooming out.
 *
 * pdst   destination line, receives 'width' pixels
 * width  number of destination pixels to produce
 * psrc   start of the source line
 * x0     16.16 fixed point source position of the first destination pixel
 * x_mag  16.16 fixed point source step per destination pixel
 *        (below 1 << 16 interpolates, equal copies, above picks pixels)
 */
static void FN(scale_line16_444, LDRH)(uint16_t* pdst, uint32_t width, const uint16_t* psrc, int x0, int x_mag)
{
	// bits 11-8 and 3-0 of a pixel; ~mask leaves bits 15-12 and 7-4
	const uint32_t mask = 0x0f0f;

	if (x_mag < (1 << 16))
	{
		uint32_t px, sel0, rest0, sel1, rest1, sc, sel, rest;
		int x;

		if (width == 0)
			return;

		psrc += x0 >> 16;
		x0 &= 0xffff;

		// both interpolation end points start on the first source pixel, the
		// second one moves on each time x0 crosses a source pixel boundary
		px = *psrc++;
		sel0 = sel1 = px & mask;
		rest0 = rest1 = px & ~mask;

		// all pixels but the last: write one, then step and read ahead
		for (x = width - 1; x & 3; x--)
		{
			sc = x0 >> 12;
			rest = (rest0 << 4) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (16 - sc);
			*pdst++ = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		for (; x > 0; x -= 4)
		{
			sc = x0 >> 12;
			rest = (rest0 << 4) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (16 - sc);
			*pdst++ = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 12;
			rest = (rest0 << 4) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (16 - sc);
			*pdst++ = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 12;
			rest = (rest0 << 4) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (16 - sc);
			*pdst++ = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}

			sc = x0 >> 12;
			rest = (rest0 << 4) + (rest1 - rest0) * sc;
			sel = sel1 * sc + sel0 * (16 - sc);
			*pdst++ = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);

			x0 += x_mag;
			if (x0 >= (1<<16))
			{
				x0 &= 0xffff;
				px = *psrc++;
				sel0 = sel1;
				rest0 = rest1;
				sel1 = px & mask;
				rest1 = px & ~mask;
			}
		}

		// last pixel: written without the read ahead, no pixel would use that
		// source value and it can lie beyond the end of the source line
		sc = x0 >> 12;
		rest = (rest0 << 4) + (rest1 - rest0) * sc;
		sel = sel1 * sc + sel0 * (16 - sc);
		*pdst = ((rest >> 4) & ~mask) | ((sel >> 4) & mask);
	}
	else if (x_mag == (1 << 16))
	{
		// one source pixel per destination pixel: plain copy
		memcpy(pdst, psrc + (x0 >> 16), 2 * width);
	}
	else
	{
		int x;

		// zooming out: pick the source pixel under each destination pixel
		for (x = width; x & 3; x--)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
		for (; x > 0; x -= 4)
		{
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
			*pdst++ = psrc[x0 >> 16];
			x0 += x_mag;
		}
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 4-4-4 bits per component(TRGB12, TBGR12).
 * Use linear interpolation (on the horizontal) when zooming in, no interpolation when zooming out.
 *
 * When zooming in vertically the scaled line is kept in the work buffer
 * (scale->wb_ptr, replaced by a ka_mem_alloc() block when too small) and
 * copied to every destination row until the source row changes. If the
 * buffer cannot be obtained, every destination row is scaled on its own.
 */
void FN(ka_scale_linear_12bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* pdst = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy0 = scale->y_mag * scale->wdst_y0;

	if (scale->y_mag < (1 << 16))
	{
		// assume increased mem usage is compensated by less computation
		const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);

		int bsize = 2 * scale->wdst_width;
		if (scale->wb_size < bsize)
		{
			void* ptr = ka_mem_alloc(bsize);
			if (!ptr)
				goto fallback; // use fallback method

			scale->wb_ptr = ptr;
			scale->wb_size = bsize;
		}
		uint16_t* line = scale->wb_ptr;
		FN(scale_line16_444, LDRH)(line, scale->wdst_width, psrc, sx0, scale->x_mag);
		int sy16 = sy0 & 0xffff; // fraction of the vertical source position
		sy0 >>= 16;              // from here on: source row index

		for (int y = scale->wdst_height; y > 0; y--)
		{
			memcpy(pdst, line, 2 * scale->wdst_width);

			sy16 += scale->y_mag;
			if (sy16 >= (1<<16))
			{
				sy0 += 1;
				sy16 &= 0xffff;
				// rescale only when the source row really moved; while psrc
				// stays put the buffered line is still the right one
				// NOTE(review): the bound leaves out the row with index
				// src_height - 1; confirm this is intended
				if (sy0 < scale->src_height - 1)
				{
					psrc += scale->src_pix_width;
					FN(scale_line16_444, LDRH)(line, scale->wdst_width, psrc, sx0, scale->x_mag);
				}
			}

			pdst += (scale->wdst_bpr / 2);
		}
		return;
	}

fallback:

	// no vertical zoom in, or no work buffer: scale every row directly
	for (int y = scale->wdst_height; y > 0; y--)
	{
		const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);

		FN(scale_line16_444, LDRH)(pdst, scale->wdst_width, psrc, sx0, scale->x_mag);

		sy0 += scale->y_mag;
		pdst += (scale->wdst_bpr / 2);
	}
}

/*
 * Scale image with 16-bit pixels (Halfword aligned) with 4-4-4 bits per component(TRGB12, TBGR12).
 * Use linear interpolation (on horiz. & vert.) when zooming in, no interpolation when zooming out.
 * Precondition: source byte width is word aligned.
 *
 * When zooming in vertically two horizontally scaled lines are kept in the
 * work buffer (scale->wb_ptr, replaced by a ka_mem_alloc() block when too
 * small) and merged word by word into each destination row. If the buffer
 * cannot be obtained the work is handed to ka_scale_linear_12bpp.
 */
void FN(ka_scale_bilinear_12bpp, LDRH)(ka_scale_t* scale)
{
	uint16_t* pdst = (uint16_t*) scale->wdst;
	const int sx0 = scale->x_mag * scale->wdst_x0;
	int sy0 = scale->y_mag * scale->wdst_y0;

	if (scale->y_mag >= (1 << 16))
	{
		// no vertical zoom in: every destination row comes from one source row
		for (int y = scale->wdst_height; y > 0; y--)
		{
			const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
			FN(scale_line16_444, LDRH)(pdst, scale->wdst_width, psrc, sx0, scale->x_mag);

			sy0 += scale->y_mag;
			pdst += (scale->wdst_bpr / 2);
		}
		return;
	}

	const uint16_t* psrc = ((const uint16_t*) scale->src) + scale->src_pix_width * (sy0 >> 16);
	// the merge works on whole 32-bit words: skipb is 1 when the destination
	// does not start on a word boundary, skipe is 1 when one more pixel is
	// needed to make the pixel count even
	const int skipb = (int) ((((uintptr_t) scale->wdst) >> 1) & 1);
	int width = skipb + scale->wdst_width;
	const int skipe = width & 1;
	pdst -= skipb;
	width += skipe;

	// room for two lines of 'width' 16-bit pixels
	if (scale->wb_size < 4 * width)
	{
		void* ptr = ka_mem_alloc(4 * width);
		if (ptr)
		{
			scale->wb_ptr = ptr;
			scale->wb_size = 4 * width;
		}
		else
		{
			FN(ka_scale_linear_12bpp, LDRH)(scale);
			return;
		}
	}
	uint16_t* line0 = scale->wb_ptr;
	uint16_t* line1 = line0 + width;

	// both lines start out as the first source row
	FN(scale_line16_444, LDRH)(line0 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
	psrc += scale->src_pix_width;
	memcpy(line1, line0, 2*width);
	sy0 &= 0xffff;

	for (int y = scale->wdst_height; y > 0; y--)
	{
		int x;

		// padding pixels are taken from the destination, so that the word-wise
		// merge writes back what was there
		for (x = 0; x < skipb; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}
		for (x = width - skipe; x < width; x++)
		{
			line0[x] = line1[x] = pdst[x];
		}

		FN(merge_lines16_444, LDRH)((uint32_t*) pdst, (uint32_t*) line0, (uint32_t*) line1, sy0 >> 12, width / 2);

		// last row is out: do not fetch another source row, nothing would use
		// it and it may lie beyond the source image
		if (y == 1)
			break;

		sy0 += scale->y_mag;
		if (sy0 >= (1<<16))
		{
			// next source row: the newer line becomes line0, the older is refilled
			uint16_t* linex = line0;
			line0 = line1;
			line1 = linex;
			FN(scale_line16_444, LDRH)(line1 + skipb, scale->wdst_width, psrc, sx0, scale->x_mag);
			psrc += scale->src_pix_width;
			sy0 &= 0xffff;
		}

		pdst += (scale->wdst_bpr / 2);
	}
}
