;
; plot.s
;
    AREA |C$$code|, CODE, READONLY

    EXPORT avg_fast8
    EXPORT dup_fast8
    EXPORT mix_fast8_neon

;-------------------------
; dup_fast8 - copy a block of bytes from one source into two destinations.
;
; The pointers are post-incremented while a row is copied, so 'stride' is
; added on top of the 'width' bytes each pointer has already moved:
;
; do
; {
;   for (i=0; i<width; i++)
;   {
;     *dst1++ = *src;
;     *dst2++ = *src++;
;   }
;   dst1 += stride;
;   dst2 += stride;
;   src += stride;
; }
; while(--height > 0);
;
; In:   a1 = src, a2 = dst1, a3 = dst2, a4 = width, on stack = stride, height
; Uses: v1-v4 = data being copied, v5 = stride, v6 = rows left,
;       ip = bytes left in the current row (a4 itself is never modified)
;
; NOTE(review): the 16-byte fast path uses ldmia/stmia on the pointers as
; given; these presumably have to be word aligned whenever width >= 16 -
; confirm against the callers.

    ALIGN 32
dup_fast8
    ; 9 words are pushed, so the first stack argument is at [sp, #9*4].
    ; sl and fp are not used here; they stay in the list so the frame
    ; layout is unchanged.
    stmfd sp!, {v1-v6, sl, fp, lr}
    ldr   v6, [sp, #10*4]               ; v6 = height
    ldr   v5, [sp, #9*4]                ; v5 = stride

dup_fast8_vloop
    ; Count the row in ip instead of saving/restoring a4 on the stack.
    subs  ip, a4, #16                   ; ip = width - 16
    blt   dup_fast8_hb                  ; fewer than 16 bytes: bytes only
dup_fast8_hloop
    ; 16 bytes per pass: one load, two stores.
    ldmia a1!, {v1, v2, v3, v4}
    stmia a2!, {v1, v2, v3, v4}
    stmia a3!, {v1, v2, v3, v4}

    subs  ip, ip, #16
    bge   dup_fast8_hloop

dup_fast8_hb
    ; Undo the last subtraction: ip = bytes still to copy (0..15 when
    ; width was positive).
    add   ip, ip, #16
    cmp   ip,#0
    ble   dup_fast8_hbend

dup_fast8_hbloop
    ; Tail of the row, one byte at a time.
    ldrb  v1, [a1],#1
    strb  v1, [a2],#1
    strb  v1, [a3],#1
    subs  ip, ip, #1
    bgt   dup_fast8_hbloop
dup_fast8_hbend

    ; Step all three pointers to the next row.
    add   a1, a1, v5
    add   a2, a2, v5
    add   a3, a3, v5
    subs  v6, v6, #1
    bgt   dup_fast8_vloop
    ldmfd sp!, {v1-v6, sl, fp, pc}      ; restore and return

;-------------------------
; avg_fast8 - rounded byte-wise average of two sources.
;
; The pointers are post-incremented while a row is processed, so 'stride'
; is added on top of the bytes each pointer has already moved:
;
; do
; {
;   for (i=0; i<width; i++)
;     *dst++ = (*src1++ + *src2++ + 1) / 2;
;   dst += stride;
;   src1 += stride;
;   src2 += stride;
; }
; while(--height > 0);
;
; The row loop consumes 8 bytes per pass and tests the counter afterwards,
; so it always handles at least 8 bytes and a width that is not a multiple
; of 8 is effectively rounded up.
;
; In:   a1 = dst, a2 = src1, a3 = src2, a4 = width, on stack: stride, height
; Uses: v1-v4 = data, v5 = stride, v6 = rows left, sl = bytes left in the
;       current row, lr = avgmask1, ip = avgmask2
;
; NOTE(review): data is moved with word ldr/str, so the pointers are
; presumably expected to be word aligned - confirm against the callers.

; These constants are used when processing 4 bytes at a time.
; Relies on (a + b + 1) / 2 = a / 2 + b / 2 + ((a & 1) + (b & 1) + 1)/2
;                           = a / 2 + b / 2 + ((a | b) & 1)
; avgmask1 keeps the low bit of each byte; avgmask2 drops the bit that a
; whole-word shift right by one moves in from the neighbouring byte.
avgmask1  DCD 0x01010101
avgmask2  DCD 0x7f7f7f7f

    ALIGN 32
avg_fast8
    ; 9 words are pushed, so the first stack argument is at [sp, #9*4].
    ; lr is reused for a mask below; the return address is popped
    ; straight into pc at the end.
    stmfd sp!, {v1-v6, sl, fp, lr}
    ldr   lr, avgmask1
    ldr   ip, avgmask2
    ldr   v6, [sp, #10*4]               ; v6 = height
    ldr   v5, [sp, #9*4]                ; v5 = stride

avg_fast8_vloop
    ; Count the row in sl (already saved above) instead of saving and
    ; restoring a4 on the stack for every row.
    mov   sl, a4
avg_fast8_hloop
    ldr   v1, [a2],#4                   ; v1 = src1 bytes 0..3
    ldr   v2, [a2],#4                   ; v2 = src1 bytes 4..7

    ldr   v4, [a3],#4                   ; v4 = src2 bytes 0..3
    orr   v3, v1, v4
    and   v3, v3, lr                    ; v3 = (a | b) & 1 per byte
    and   v1, ip, v1, lsr #1            ; v1 = a / 2 per byte
    and   v4, ip, v4, lsr #1            ; v4 = b / 2 per byte
    add   v1, v1, v3
    add   v1, v1, v4                    ; v1 = (a + b + 1) / 2 per byte

    ldr   v4, [a3],#4                   ; same again for bytes 4..7
    orr   v3, v2, v4
    and   v3, v3, lr
    and   v2, ip, v2, lsr #1
    and   v4, ip, v4, lsr #1
    add   v2, v2, v3
    add   v2, v2, v4

    str   v1, [a1],#4
    str   v2, [a1],#4

    subs  sl, sl, #8
    bgt   avg_fast8_hloop

    ; Step all three pointers to the next row.
    add   a1, a1, v5
    add   a2, a2, v5
    add   a3, a3, v5
    subs  v6, v6, #1
    bgt   avg_fast8_vloop
    ldmfd sp!, {v1-v6, sl, fp, pc}      ; restore and return

;-------------------------
; mix_fast8_neon - blend each row with the rows before and after it using
; weights (1, 2, 1) / 4, 8 bytes per NEON step.
;
; Reference implementation:
;
;  const uint8_t* srcp = src1;
;  const uint8_t* srcc = src1;
;  const uint8_t* srcn = src2;
;  int val;
;
;  do
;  {
;    for (int i = width; i > 0; i--)
;    {
;      val = *srcc++;
;      val<<=1;
;      val += *srcp++;
;      val += *srcn++;
;      val += 2;
;      *dst++ = val >> 2;
;    }
;    srcp = srcc - width;
;    srcc = srcn - width;
;    srcn = (height <= 2) ? srcc : srcp + ((width + stride) << 1);
;    dst += stride;
;  }
;  while(--height);
;
; The row loop consumes 8 bytes per pass and tests the counter afterwards,
; so it always handles at least 8 bytes, while the pointer rewinds below
; subtract exactly 'width'.
;
; In:   a1 = dst, a2 = src1, a3 = src2, a4 = width, on stack: stride, height
; Uses: a2 = srcp, v4 = srcc, a3 = srcn, v3 = bytes left in the current
;       row / scratch, v5 = stride, v6 = rows left, d0-d3 = pixel data

    ALIGN 32
mix_fast8_neon
    ; 5 words are pushed, so the first stack argument is at [sp, #5*4].
    stmfd sp!, {v3-v6, lr}
    ldr   v6, [sp, #6*4]                ; v6 = height
    ldr   v5, [sp, #5*4]                ; v5 = stride
    mov   v4, a2                        ; srcc = srcp = src1

mix_fast8_neon_vloop
    mov   v3, a4                        ; v3 = bytes left in this row

mix_fast8_neon_hloop
    vld1.8 {d0}, [a2]!                  ; d0 = 8 bytes of srcp
    vld1.8 {d1}, [a3]!                  ; d1 = 8 bytes of srcn
    vld1.8 {d2}, [v4]!                  ; d2 = 8 bytes of srcc
    ; Truncating halving add first, rounding halving add second:
    ;   ((p + n) >> 1) + c + 1) >> 1  ==  (2*c + p + n + 2) >> 2
    ; for all byte values.  Rounding in both steps would give a result one
    ; higher than the reference for some inputs (e.g. p=0, n=1, c=0).
    vhadd.u8  d3, d0, d1
    vrhadd.u8 d3, d3, d2
    vst1.8 {d3}, [a1]!
    subs  v3, v3, #8
    bgt   mix_fast8_neon_hloop

    sub   a2, v4, a4                    ; srcp = srcc - width
    sub   v4, a3, a4                    ; srcc = srcn - width
    cmp   v6,#2
    movle a3, v4                        ; height <= 2: srcn = srcc
    addgt v3, a4, v5                    ; otherwise:
    addgt a3, a2, v3, lsl #1            ;   srcn = srcp + ((width + stride) << 1)
    add   a1, a1, v5                    ; dst += stride
    subs  v6, v6, #1
    bgt   mix_fast8_neon_vloop
    ldmfd sp!, {v3-v6, pc}              ; restore and return

;-----------------------------------------------------------
    ALIGN 32
    END
