;
; motion.s
; Copyright (C) 2002 P.Everett <peter@everett9981.freeserve.co.uk>
;
; This file is part of KinoAMP, a free RISCOS MPEG program stream decoder.
;
; KinoAMP is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; KinoAMP is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
;

; !!!! WARNING !!!!:
;   Assumes height is multiple of 4.
;   We use this to buffer multiple lines for routines averaging lines.


; Motion Compensation functions.
;
; On entry,
; a1 = *dst
; a2 = *src
; a3 = stride
; a4 = loop count
;
; These functions consume about 30% of the total time. The inner loops are
; expanded. The destination addresses are ALWAYS word aligned but source
; may not be. So read and write destination as words but read source as
; bytes. It also helps if the frame buffer is quad word aligned.
;
; Some functions compensate for unaligned source, and nearly all functions
; process 4 bytes at a time.
; Thanks to Andre Timmermans for some improvements here.
;
; Most of the code now makes use of the following idea:
;
; mean = (a + b + 1) / 2
;      = a/2 + b/2 + ((a & 1) + (b & 1) + 1)/2
;
; The third term is 1 if either the last bit of a or b is 1
;               is 0 if both the last bits of a and b are 0
;
; so mean = a/2 + b/2 + ((a | b) & 1)
;

  AREA |A$$code|, CODE, READONLY

;-------------------------
; mc_function structure.
;
; typedef struct mc_functions_s
; {
;   void(*put[8])(uint8_t *dst, uint8_t *src, int32_t stride, int32_t count);
;   void(*avg[8])(uint8_t *dst, uint8_t *src, int32_t stride, int32_t count);
; } mc_functions_t;
;
; mc_functions_t mc_functions;
;
; Each group of eight is ordered: 16-wide {plain, x, y, xy} followed by
; 8-wide {plain, x, y, xy}.
; NOTE: the table below holds 24 words.  The last eight point at the
; test_16 / test_8 fill routines and are not described by the typedef above.

  EXPORT mc_functions_neon
mc_functions_neon
  ; put[0..7]
  DCD put_16,  put_x16, put_y16, put_xy16
  DCD put_8,   put_x8,  put_y8,  put_xy8
  ; avg[0..7]
  DCD avg_16,  avg_x16, avg_y16, avg_xy16
  DCD avg_8,   avg_x8,  avg_y8,  avg_xy8
  ; test fill routines: four 16-wide entries, then four 8-wide entries
  DCD test_16, test_16, test_16, test_16
  DCD test_8,  test_8,  test_8,  test_8


;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = &FF
;   dst += stride;
; }
; while(--count);
;
; Test fill: writes 16 bytes of &FF to each of 'count' rows.
;   a1 = dst     (written with STMIA; the file header states dst is always
;                 word aligned)
;   a2 = src     (ignored)
;   a3 = stride
;   a4 = count   (do/while loop: the body runs before the count is tested)

  ALIGN 32
test_16
  stmfd sp!, {v1-v4, lr}
  mvn   v1, #0                  ; v1 = &FFFFFFFF
  mov   v2, v1
  mov   v3, v1
  mov   v4, v1

test_16_loop
  stmia a1, {v1-v4}             ; one 16 byte row, no writeback
  add   a1, a1, a3              ; dst += stride
  subs  a4, a4, #1
  bne   test_16_loop
  ldmfd sp!, {v1-v4, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = &FF;
;   dst += stride;
; }
; while(--count);
;
; Test fill: writes 8 bytes of &FF (two words) to each of 'count' rows.
;   a1 = dst, a3 = stride, a4 = count.  a2 (src) is not used.

  ALIGN 32
test_8
  stmfd sp!, {v1, lr}
  mvn   v1, #0                  ; v1 = &FFFFFFFF

test_8_row
  str   v1, [a1]                ; bytes 0..3
  str   v1, [a1, #4]            ; bytes 4..7
  add   a1, a1, a3              ; dst += stride
  subs  a4, a4, #1
  bne   test_8_row
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = src[i];
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Straight copy of a 16 byte wide block, four rows per loop pass.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)

  ALIGN 32
put_16
  stmfd sp!, {v1, lr}           ; v1 is saved/restored but not used here

put_16_rows
  ; fetch four source rows ...
  vld1.8 {q8},  [a2], a3
  vld1.8 {q9},  [a2], a3
  vld1.8 {q10}, [a2], a3
  vld1.8 {q11}, [a2], a3
  subs  a4, a4, #4              ; the NEON stores below leave the flags alone
  ; ... and write them out
  vst1.8 {q8},  [a1], a3
  vst1.8 {q9},  [a1], a3
  vst1.8 {q10}, [a1], a3
  vst1.8 {q11}, [a1], a3
  bgt   put_16_rows
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = src[i];
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Straight copy of an 8 byte wide block, four rows per loop pass.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)

  ALIGN 32
put_8
  stmfd sp!, {v1, lr}           ; v1 is saved/restored but not used here

put_8_rows
  ; fetch four source rows ...
  vld1.8 {d16}, [a2], a3
  vld1.8 {d17}, [a2], a3
  vld1.8 {d18}, [a2], a3
  vld1.8 {d19}, [a2], a3
  subs  a4, a4, #4              ; the NEON stores below leave the flags alone
  ; ... and write them out
  vst1.8 {d16}, [a1], a3
  vst1.8 {d17}, [a1], a3
  vst1.8 {d18}, [a1], a3
  vst1.8 {d19}, [a1], a3
  bgt   put_8_rows
  ldmfd sp!, {v1, pc}

;-------------------------
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (dst[i] + src[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Rounded average of a 16 byte wide source block with the destination,
; four rows per loop pass.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;
; Register use: q0-q3 = source rows, q8-q11 = destination rows.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
avg_16
  stmfd sp!, {v1, lr}

avg_16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {q1}, [a2], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {q3}, [a2], a3

  vld1.8 {q8}, [a1]
  vrhadd.u8 q8, q0, q8          ; (src + dst + 1) >> 1 per byte
  vst1.8 {q8}, [a1], a3

  vld1.8 {q9}, [a1]
  vrhadd.u8 q9, q1, q9
  vst1.8 {q9}, [a1], a3

  vld1.8 {q10}, [a1]
  vrhadd.u8 q10, q2, q10
  vst1.8 {q10}, [a1], a3

  vld1.8 {q11}, [a1]
  vrhadd.u8 q11, q3, q11
  vst1.8 {q11}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (dst[i] + src[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Rounded average of an 8 byte wide source block with the destination,
; four rows per loop pass.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;
; Register use: d0-d3 = source rows, d16-d19 = destination rows.

  ALIGN 32
avg_8
  stmfd sp!, {v1, lr}           ; v1 is saved/restored but not used here

avg_8_rows
  ; four source rows
  vld1.8 {d0}, [a2], a3
  vld1.8 {d1}, [a2], a3
  vld1.8 {d2}, [a2], a3
  vld1.8 {d3}, [a2], a3
  subs  a4, a4, #4              ; NEON instructions below leave the flags alone

  ; row 0
  vld1.8 {d16}, [a1]
  vrhadd.u8 d16, d16, d0
  vst1.8 {d16}, [a1], a3

  ; row 1
  vld1.8 {d17}, [a1]
  vrhadd.u8 d17, d17, d1
  vst1.8 {d17}, [a1], a3

  ; row 2
  vld1.8 {d18}, [a1]
  vrhadd.u8 d18, d18, d2
  vst1.8 {d18}, [a1], a3

  ; row 3
  vld1.8 {d19}, [a1]
  vrhadd.u8 d19, d19, d3
  vst1.8 {d19}, [a1], a3

  bgt   avg_8_rows
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Horizontal half-pel interpolation of a 16 byte wide block, four rows per
; loop pass.  Reads 17 source bytes per row.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 16 (pointer to the 17th byte of the current row)
;
; Register use: q0-q3 = src[0..15] of each row, q8-q11 = src[16] in lane 0,
; then src[1..16] after the VEXT.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
put_x16
  stmfd sp!, {v1, lr}
  add   v1, a2, #16

put_x16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {d16[0]}, [v1], a3     ; only lane 0 of q8 is consumed by the VEXT
  vld1.8 {q1}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {d20[0]}, [v1], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {d22[0]}, [v1], a3

  vext.8    q8, q0, q8, #1      ; q8 = src[1..16]
  vrhadd.u8 q0, q0, q8
  vext.8    q9, q1, q9, #1
  vrhadd.u8 q1, q1, q9
  vst1.8 {q0}, [a1], a3
  vst1.8 {q1}, [a1], a3
  vext.8    q10, q2, q10, #1
  vrhadd.u8 q2, q2, q10
  vext.8    q11, q3, q11, #1
  vrhadd.u8 q3, q3, q11
  vst1.8 {q2}, [a1], a3
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   put_x16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+1] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Horizontal half-pel interpolation of an 8 byte wide block, four rows per
; loop pass.  Reads 9 source bytes per row.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 8 (pointer to the 9th byte of the current row)
;
; Register use: d0-d3 = src[0..7] of each row, d4-d7 = src[8] in lane 0,
; then src[1..8] after the VEXT.
; d8-d15 are deliberately avoided: the AAPCS makes them callee-saved and
; this routine does not save any NEON registers.

  ALIGN 32
put_x8
  stmfd sp!, {v1, lr}
  add   v1, a2, #8

put_x8_loop
  vld1.8 {d0}, [a2], a3
  vld1.8 {d4[0]}, [v1], a3      ; only lane 0 of d4 is consumed by the VEXT
  vld1.8 {d1}, [a2], a3
  vld1.8 {d5[0]}, [v1], a3
  vld1.8 {d2}, [a2], a3
  vld1.8 {d6[0]}, [v1], a3
  vld1.8 {d3}, [a2], a3
  vld1.8 {d7[0]}, [v1], a3

  vext.8    d4, d0, d4, #1      ; d4 = src[1..8]
  vrhadd.u8 d0, d0, d4
  vext.8    d5, d1, d5, #1
  vrhadd.u8 d1, d1, d5
  vst1.8 {d0}, [a1], a3
  vst1.8 {d1}, [a1], a3
  vext.8    d6, d2, d6, #1
  vrhadd.u8 d2, d2, d6
  vext.8    d7, d3, d7, #1
  vrhadd.u8 d3, d3, d7
  vst1.8 {d2}, [a1], a3
  vst1.8 {d3}, [a1], a3

  subs  a4, a4, #4
  bgt   put_x8_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = ((src[i] + src[i+1] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Horizontal half-pel interpolation of a 16 byte wide block, averaged with
; the destination; four rows per loop pass.  Reads 17 source bytes per row.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 16 (pointer to the 17th byte of the current row)
;
; Register use: q0-q3 = src[0..15], q8-q11 = src[16] in lane 0 then
; src[1..16], q12-q15 = destination rows.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
avg_x16
  stmfd sp!, {v1, lr}
  add   v1, a2, #16

avg_x16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {d16[0]}, [v1], a3     ; only lane 0 of q8 is consumed by the VEXT
  vld1.8 {q1}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {d20[0]}, [v1], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {d22[0]}, [v1], a3

  vext.8 q8,  q0, q8,  #1       ; q8..q11 = src[1..16] of rows 0..3
  vext.8 q9,  q1, q9,  #1
  vext.8 q10, q2, q10, #1
  vext.8 q11, q3, q11, #1

  vld1.8 {q12}, [a1]
  vrhadd.u8 q0, q0, q8          ; horizontal average
  vrhadd.u8 q0, q0, q12         ; average with dst
  vst1.8 {q0}, [a1], a3

  vld1.8 {q13}, [a1]
  vrhadd.u8 q1, q1, q9
  vrhadd.u8 q1, q1, q13
  vst1.8 {q1}, [a1], a3

  vld1.8 {q14}, [a1]
  vrhadd.u8 q2, q2, q10
  vrhadd.u8 q2, q2, q14
  vst1.8 {q2}, [a1], a3

  vld1.8 {q15}, [a1]
  vrhadd.u8 q3, q3, q11
  vrhadd.u8 q3, q3, q15
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_x16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+1] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Horizontal half-pel interpolation of an 8 byte wide block, averaged with
; the destination; four rows per loop pass.  Reads 9 source bytes per row.
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 8 (pointer to the 9th byte of the current row)
;
; Register use: d0/d2/d4/d6 = src[0..7], d1/d3/d5/d7 = src[8] in lane 0 then
; src[1..8], d16-d19 = destination rows.
; d8-d15 are deliberately avoided: the AAPCS makes them callee-saved and
; this routine does not save any NEON registers.

  ALIGN 32
avg_x8
  stmfd sp!, {v1, lr}
  add   v1, a2, #8

avg_x8_loop
  vld1.8 {d0}, [a2], a3
  vld1.8 {d1[0]}, [v1], a3      ; only lane 0 of d1 is consumed by the VEXT
  vld1.8 {d2}, [a2], a3
  vld1.8 {d3[0]}, [v1], a3
  vld1.8 {d4}, [a2], a3
  vld1.8 {d5[0]}, [v1], a3
  vld1.8 {d6}, [a2], a3
  vld1.8 {d7[0]}, [v1], a3

  vext.8 d1, d0, d1, #1         ; d1/d3/d5/d7 = src[1..8] of rows 0..3
  vext.8 d3, d2, d3, #1
  vext.8 d5, d4, d5, #1
  vext.8 d7, d6, d7, #1

  vld1.8 {d16}, [a1]
  vrhadd.u8 d0, d0, d1          ; horizontal average
  vrhadd.u8 d0, d0, d16         ; average with dst
  vst1.8 {d0}, [a1], a3

  vld1.8 {d17}, [a1]
  vrhadd.u8 d2, d2, d3
  vrhadd.u8 d2, d2, d17
  vst1.8 {d2}, [a1], a3

  vld1.8 {d18}, [a1]
  vrhadd.u8 d4, d4, d5
  vrhadd.u8 d4, d4, d18
  vst1.8 {d4}, [a1], a3

  vld1.8 {d19}, [a1]
  vrhadd.u8 d6, d6, d7
  vrhadd.u8 d6, d6, d19
  vst1.8 {d6}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_x8_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+stride] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Vertical half-pel interpolation of a 16 byte wide block, four rows per
; loop pass.  Each pass reads five source rows (the fifth is read again as
; the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;
; Register use: q0-q3 and q8 = the five source rows.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
put_y16
  stmfd sp!, {v1, lr}

put_y16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {q1}, [a2], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {q8}, [a2]             ; row 4: no writeback, a2 stays on this row

  vrhadd.u8 q0, q0, q1
  vrhadd.u8 q1, q1, q2
  vst1.8 {q0}, [a1], a3
  vst1.8 {q1}, [a1], a3
  vrhadd.u8 q2, q2, q3
  vrhadd.u8 q3, q3, q8
  vst1.8 {q2}, [a1], a3
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   put_y16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+stride] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Vertical half-pel interpolation of an 8 byte wide block, four rows per
; loop pass.  Each pass reads five source rows (the fifth is read again as
; the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)

  ALIGN 32
put_y8
  stmfd sp!, {v1, lr}           ; v1 is saved/restored but not used here

put_y8_rows
  vld1.8 {d16}, [a2], a3
  vld1.8 {d17}, [a2], a3
  vld1.8 {d18}, [a2], a3
  vld1.8 {d19}, [a2], a3
  vld1.8 {d20}, [a2]            ; row 4: no writeback, a2 stays on this row
  subs  a4, a4, #4              ; NEON instructions below leave the flags alone

  vrhadd.u8 d16, d16, d17       ; row n averaged with row n+1
  vrhadd.u8 d17, d17, d18
  vrhadd.u8 d18, d18, d19
  vrhadd.u8 d19, d19, d20

  vst1.8 {d16}, [a1], a3
  vst1.8 {d17}, [a1], a3
  vst1.8 {d18}, [a1], a3
  vst1.8 {d19}, [a1], a3

  bgt   put_y8_rows
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = ((src[i] + src[i+stride] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Vertical half-pel interpolation of a 16 byte wide block, averaged with the
; destination; four rows per loop pass.  Each pass reads five source rows
; (the fifth is read again as the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;
; Register use: q0-q3 and q8 = source rows, q9-q12 = destination rows.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
avg_y16
  stmfd sp!, {v1, lr}

avg_y16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {q1}, [a2], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {q8}, [a2]             ; row 4: no writeback, a2 stays on this row

  vrhadd.u8 q0, q0, q1          ; row n averaged with row n+1
  vrhadd.u8 q1, q1, q2
  vrhadd.u8 q2, q2, q3
  vrhadd.u8 q3, q3, q8

  vld1.8 {q9}, [a1]
  vrhadd.u8 q0, q0, q9          ; average with dst
  vst1.8 {q0}, [a1], a3

  vld1.8 {q10}, [a1]
  vrhadd.u8 q1, q1, q10
  vst1.8 {q1}, [a1], a3

  vld1.8 {q11}, [a1]
  vrhadd.u8 q2, q2, q11
  vst1.8 {q2}, [a1], a3

  vld1.8 {q12}, [a1]
  vrhadd.u8 q3, q3, q12
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_y16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+stride] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Vertical half-pel interpolation of an 8 byte wide block, averaged with the
; destination; four rows per loop pass.  Each pass reads five source rows
; (the fifth is read again as the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;
; Register use: d0-d4 = source rows, d16-d19 = destination rows.
; d8-d15 are deliberately avoided: the AAPCS makes them callee-saved and
; this routine does not save any NEON registers.

  ALIGN 32
avg_y8
  stmfd sp!, {v1, lr}

avg_y8_loop
  vld1.8 {d0}, [a2], a3
  vld1.8 {d1}, [a2], a3
  vld1.8 {d2}, [a2], a3
  vld1.8 {d3}, [a2], a3
  vld1.8 {d4}, [a2]             ; row 4: no writeback, a2 stays on this row

  vrhadd.u8 d0, d0, d1          ; row n averaged with row n+1
  vrhadd.u8 d1, d1, d2
  vrhadd.u8 d2, d2, d3
  vrhadd.u8 d3, d3, d4

  vld1.8 {d16}, [a1]
  vrhadd.u8 d0, d0, d16         ; average with dst
  vst1.8 {d0}, [a1], a3

  vld1.8 {d17}, [a1]
  vrhadd.u8 d1, d1, d17
  vst1.8 {d1}, [a1], a3

  vld1.8 {d18}, [a1]
  vrhadd.u8 d2, d2, d18
  vst1.8 {d2}, [a1], a3

  vld1.8 {d19}, [a1]
  vrhadd.u8 d3, d3, d19
  vst1.8 {d3}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_y8_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 2) / 4;
; ; nearly the same (and what the code below computes) is:
; ;    dst[i] = ((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Diagonal (x+y) half-pel interpolation of a 16 byte wide block, four rows
; per loop pass.  Each pass reads five source rows of 17 bytes (the fifth is
; read again as the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 16 (pointer to the 17th byte of the current row)
;
; Register use: q0-q3 and q8 = src[0..15] of rows 0..4, q9-q13 = src[16] in
; lane 0, then src[1..16] after the VEXT.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
put_xy16
  stmfd sp!, {v1, lr}
  add   v1, a2, #16

put_xy16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3     ; only lane 0 of q9 is consumed by the VEXT
  vld1.8 {q1}, [a2], a3
  vld1.8 {d20[0]}, [v1], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {d22[0]}, [v1], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {d24[0]}, [v1], a3
  vld1.8 {q8}, [a2]             ; row 4: no writeback, a2/v1 stay on this row
  vld1.8 {d26[0]}, [v1]

  vext.8 q9,  q0, q9,  #1       ; q9..q13 = src[1..16] of rows 0..4
  vext.8 q10, q1, q10, #1
  vext.8 q11, q2, q11, #1
  vext.8 q12, q3, q12, #1
  vext.8 q13, q8, q13, #1

  vrhadd.u8 q0, q0, q9          ; horizontal average of each row
  vrhadd.u8 q1, q1, q10
  vrhadd.u8 q2, q2, q11
  vrhadd.u8 q3, q3, q12
  vrhadd.u8 q8, q8, q13

  vrhadd.u8 q0, q0, q1          ; vertical average of adjacent rows
  vst1.8 {q0}, [a1], a3
  vrhadd.u8 q1, q1, q2
  vst1.8 {q1}, [a1], a3
  vrhadd.u8 q2, q2, q3
  vst1.8 {q2}, [a1], a3
  vrhadd.u8 q3, q3, q8
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   put_xy16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 2) / 4;
; ; nearly the same (and what the code below computes) is:
; ;    dst[i] = ((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Diagonal (x+y) half-pel interpolation of an 8 byte wide block, four rows
; per loop pass.  Each pass reads five source rows of 9 bytes (the fifth is
; read again as the first row of the next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 8 (pointer to the 9th byte of the current row)
;
; Register use: d0-d4 = src[0..7] of rows 0..4, d16-d20 = src[8] in lane 0,
; then src[1..8] after the VEXT.
; d8-d15 are deliberately avoided: the AAPCS makes them callee-saved and
; this routine does not save any NEON registers.

  ALIGN 32
put_xy8
  stmfd sp!, {v1, lr}
  add   v1, a2, #8

put_xy8_loop
  vld1.8 {d0}, [a2], a3
  vld1.8 {d16[0]}, [v1], a3     ; only lane 0 of d16 is consumed by the VEXT
  vld1.8 {d1}, [a2], a3
  vld1.8 {d17[0]}, [v1], a3
  vld1.8 {d2}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3
  vld1.8 {d3}, [a2], a3
  vld1.8 {d19[0]}, [v1], a3
  vld1.8 {d4}, [a2]             ; row 4: no writeback, a2/v1 stay on this row
  vld1.8 {d20[0]}, [v1]

  vext.8 d16, d0, d16, #1       ; d16..d20 = src[1..8] of rows 0..4
  vext.8 d17, d1, d17, #1
  vext.8 d18, d2, d18, #1
  vext.8 d19, d3, d19, #1
  vext.8 d20, d4, d20, #1

  vrhadd.u8 d0, d0, d16         ; horizontal average of each row
  vrhadd.u8 d1, d1, d17
  vrhadd.u8 d2, d2, d18
  vrhadd.u8 d3, d3, d19
  vrhadd.u8 d4, d4, d20

  vrhadd.u8 d0, d0, d1          ; vertical average of adjacent rows
  vst1.8 {d0}, [a1], a3
  vrhadd.u8 d1, d1, d2
  vst1.8 {d1}, [a1], a3
  vrhadd.u8 d2, d2, d3
  vst1.8 {d2}, [a1], a3
  vrhadd.u8 d3, d3, d4
  vst1.8 {d3}, [a1], a3

  subs  a4, a4, #4
  bgt   put_xy8_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 4*dst[i] + 4) / 8;
; ; nearly the same (and what the code below computes) is:
; ;   dst[i] = (((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1
; ;             ) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Diagonal (x+y) half-pel interpolation of a 16 byte wide block, averaged
; with the destination; four rows per loop pass.  Each pass reads five
; source rows of 17 bytes (the fifth is read again as the first row of the
; next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 16 (pointer to the 17th byte of the current row)
;
; Register use: q0-q3 and q8 = src[0..15] of rows 0..4, q9-q13 = src[16] in
; lane 0, then src[1..16] after the VEXT; q9-q12 are reused afterwards for
; the destination rows.
; q4-q7 (d8-d15) are deliberately avoided: the AAPCS makes them callee-saved
; and this routine does not save any NEON registers.

  ALIGN 32
avg_xy16
  stmfd sp!, {v1, lr}
  add   v1, a2, #16

avg_xy16_loop
  vld1.8 {q0}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3     ; only lane 0 of q9 is consumed by the VEXT
  vld1.8 {q1}, [a2], a3
  vld1.8 {d20[0]}, [v1], a3
  vld1.8 {q2}, [a2], a3
  vld1.8 {d22[0]}, [v1], a3
  vld1.8 {q3}, [a2], a3
  vld1.8 {d24[0]}, [v1], a3
  vld1.8 {q8}, [a2]             ; row 4: no writeback, a2/v1 stay on this row
  vld1.8 {d26[0]}, [v1]

  vext.8 q9,  q0, q9,  #1       ; q9..q13 = src[1..16] of rows 0..4
  vext.8 q10, q1, q10, #1
  vext.8 q11, q2, q11, #1
  vext.8 q12, q3, q12, #1
  vext.8 q13, q8, q13, #1

  vrhadd.u8 q0, q0, q9          ; horizontal average of each row
  vrhadd.u8 q1, q1, q10
  vrhadd.u8 q2, q2, q11
  vrhadd.u8 q3, q3, q12
  vrhadd.u8 q8, q8, q13

  vld1.8 {q9}, [a1]             ; q9..q12 now free: reuse for dst rows
  vrhadd.u8 q0, q0, q1          ; vertical average of adjacent rows
  vrhadd.u8 q0, q0, q9          ; average with dst
  vst1.8 {q0}, [a1], a3

  vld1.8 {q10}, [a1]
  vrhadd.u8 q1, q1, q2
  vrhadd.u8 q1, q1, q10
  vst1.8 {q1}, [a1], a3

  vld1.8 {q11}, [a1]
  vrhadd.u8 q2, q2, q3
  vrhadd.u8 q2, q2, q11
  vst1.8 {q2}, [a1], a3

  vld1.8 {q12}, [a1]
  vrhadd.u8 q3, q3, q8
  vrhadd.u8 q3, q3, q12
  vst1.8 {q3}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_xy16_loop
  ldmfd sp!, {v1, pc}

;-------------------------

; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+1] +
;                src[i+stride] + src[i+stride+1] + 2) / 4 + dst[i] + 1) / 2;
; ; nearly the same (and what the code below computes) is:
; ;   dst[i] = (((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1
; ;             ) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; Diagonal (x+y) half-pel interpolation of an 8 byte wide block, averaged
; with the destination; four rows per loop pass.  Each pass reads five
; source rows of 9 bytes (the fifth is read again as the first row of the
; next pass).
;   a1 = dst, a2 = src, a3 = stride, a4 = count (stepped down by 4 per pass)
;   v1 = src + 8 (pointer to the 9th byte of the current row)
;
; Register use: d0-d4 = src[0..7] of rows 0..4, d16-d20 = src[8] in lane 0,
; then src[1..8] after the VEXT, d21-d24 = destination rows.
; d8-d15 are deliberately avoided: the AAPCS makes them callee-saved and
; this routine does not save any NEON registers.

  ALIGN 32
avg_xy8
  stmfd sp!, {v1, lr}
  add   v1, a2, #8

avg_xy8_loop
  vld1.8 {d0}, [a2], a3
  vld1.8 {d16[0]}, [v1], a3     ; only lane 0 of d16 is consumed by the VEXT
  vld1.8 {d1}, [a2], a3
  vld1.8 {d17[0]}, [v1], a3
  vld1.8 {d2}, [a2], a3
  vld1.8 {d18[0]}, [v1], a3
  vld1.8 {d3}, [a2], a3
  vld1.8 {d19[0]}, [v1], a3
  vld1.8 {d4}, [a2]             ; row 4: no writeback, a2/v1 stay on this row
  vld1.8 {d20[0]}, [v1]

  vext.8 d16, d0, d16, #1       ; d16..d20 = src[1..8] of rows 0..4
  vext.8 d17, d1, d17, #1
  vext.8 d18, d2, d18, #1
  vext.8 d19, d3, d19, #1
  vext.8 d20, d4, d20, #1

  vrhadd.u8 d0, d0, d16         ; horizontal average of each row
  vrhadd.u8 d1, d1, d17
  vrhadd.u8 d2, d2, d18
  vrhadd.u8 d3, d3, d19
  vrhadd.u8 d4, d4, d20

  vld1.8 {d21}, [a1]
  vrhadd.u8 d0, d0, d1          ; vertical average of adjacent rows
  vrhadd.u8 d0, d0, d21         ; average with dst
  vst1.8 {d0}, [a1], a3

  vld1.8 {d22}, [a1]
  vrhadd.u8 d1, d1, d2
  vrhadd.u8 d1, d1, d22
  vst1.8 {d1}, [a1], a3

  vld1.8 {d23}, [a1]
  vrhadd.u8 d2, d2, d3
  vrhadd.u8 d2, d2, d23
  vst1.8 {d2}, [a1], a3

  vld1.8 {d24}, [a1]
  vrhadd.u8 d3, d3, d4
  vrhadd.u8 d3, d3, d24
  vst1.8 {d3}, [a1], a3

  subs  a4, a4, #4
  bgt   avg_xy8_loop
  ldmfd sp!, {v1, pc}

;-------------------------

  ALIGN 32 ; to ensure next file starts on correct boundary

  END
