;
; motion.s
; Copyright (C) 2002 P.Everett <peter@everett9981.freeserve.co.uk>
;
; This file is part of KinoAMP, a free RISCOS MPEG program stream decoder.
;
; KinoAMP is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; KinoAMP is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
;



; Motion Compensation functions.
;
; On entry,
; a1 = *dst
; a2 = *src
; a3 = stride
; a4 = loop count
;
; These functions consume about 30% of the total time. The inner loops are
; expanded. The destination addresses are ALWAYS word aligned but source
; may not be. So read and write destination as words but read source as
; bytes. It also helps if the frame buffer is quad word aligned.
;
; Some functions compensate for unaligned source, and nearly all functions
; process 4 bytes at a time.
; Thanks to André Timmermans for some improvements here.
;
; Most of the code now makes use of the following idea:
;
; mean = (a + b + 1) / 2
;      = a/2 + b/2 + ((a & 1) + (b & 1) + 1)/2
;
; The third term is 1 if either the last bit of a or b is 1
;               is 0 if both the last bits of a and b are 0
;
; so mean = a/2 + b/2 + ((a | b) & 1)
;

  AREA |A$$code|, CODE, READONLY

;-------------------------
; mc_function structure.
;
; typedef struct mc_functions_s
; {
;   void(*put[8])(uint8_t *dst, uint8_t *src, int32_t stride, int32_t count);
;   void(*avg[8])(uint8_t *dst, uint8_t *src, int32_t stride, int32_t count);
; } mc_functions_t;
;
; mc_functions_t mc_functions;

  EXPORT mc_functions_arm
; Table of entry points, one word each. Every row below holds the four
; variants of one operation in the order: plain, x, y, xy.
; Words 0-7 are the put functions, words 8-15 the avg functions and
; words 16-23 the test fill functions.
mc_functions_arm
  DCD put_16,  put_x16, put_y16, put_xy16   ; put, 16 bytes wide
  DCD put_8,   put_x8,  put_y8,  put_xy8    ; put,  8 bytes wide
  DCD avg_16,  avg_x16, avg_y16, avg_xy16   ; avg, 16 bytes wide
  DCD avg_8,   avg_x8,  avg_y8,  avg_xy8    ; avg,  8 bytes wide
  DCD test_16, test_16, test_16, test_16    ; test fill, 16 bytes wide
  DCD test_8,  test_8,  test_8,  test_8     ; test fill,  8 bytes wide


;-------------------------

; test_16 - fill a 16 byte wide block with &FF (src is ignored).
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = &FF;
;   dst += stride;
; }
; while(--count);
;
; On entry: a1 = dst (word aligned), a2 = src (unused), a3 = stride,
;           a4 = row count. The count is tested after the first row has
;           been written, so it must be at least 1.
; Only v1-v4 are used, so only those (and lr) are stacked.

  ALIGN 32
test_16
  stmfd sp!, {v1-v4, lr}
  mvn   v1, #0                  ; v1 = &FFFFFFFF, i.e. four &FF bytes
  mov   v2, v1
  mov   v3, v1
  mov   v4, v1

test_16_loop
  stmia a1, {v1-v4}             ; write one 16 byte row

  add   a1, a1, a3              ; dst += stride
  subs  a4, a4, #1
  bne   test_16_loop
  ldmfd sp!, {v1-v4, pc}

;-------------------------

; test_8 - fill an 8 byte wide block with &FF (src is ignored).
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = &FF;
;   dst += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src (unused), a3 = stride, a4 = row count.

  ALIGN 32
test_8
  stmfd sp!, {v1-v5,lr}
  mvn   ip, #0                  ; ip = &FFFFFFFF, i.e. four &FF bytes

test_8_row
  str   ip, [a1, #0]            ; bytes 0..3 of the row
  str   ip, [a1, #4]            ; bytes 4..7 of the row
  subs  a4, a4, #1
  add   a1, a1, a3              ; dst += stride (flags are not touched)
  bne   test_8_row
  ldmfd sp!, {v1-v5,pc}

;-------------------------

; put_16 - copy a 16 byte wide block from src to dst.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = src[i];
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; A word aligned src is copied with plain ldm/stm. Otherwise src is rounded
; down to a word boundary, five words are read per row and the 16 wanted
; bytes are rebuilt with shifts (ip = right shift, lr = 32 - ip).

  ALIGN 32
put_16
  stmfd sp!, {v1-v6, sl, lr}
  ands  ip, a2, #3              ; ip = byte offset of src inside its word
  bne   put_16_unaligned

put_16_copy_aligned
  ldmia a2, {v1-v4}
  stmia a1, {v1-v4}
  subs  a4, a4, #1
  add   a1, a1, a3              ; add without S keeps the flags from subs
  add   a2, a2, a3
  bne   put_16_copy_aligned
  ldmfd sp!, {v1-v6, sl, pc}

put_16_unaligned
  bic   a2, a2, ip              ; round src down to a word boundary
  mov   ip, ip, lsl #3          ; byte offset -> bit shift (8, 16 or 24)
  rsb   lr, ip, #32             ; complementary shift

put_16_copy_unaligned
  ldmia a2, {v1-v5}
  ; each output word = low part of this word | high part from the next word
  mov   v1, v1, lsr ip
  orr   v1, v1, v2, lsl lr
  mov   v2, v2, lsr ip
  orr   v2, v2, v3, lsl lr
  mov   v3, v3, lsr ip
  orr   v3, v3, v4, lsl lr
  mov   v4, v4, lsr ip
  orr   v4, v4, v5, lsl lr
  stmia a1, {v1-v4}

  subs  a4, a4, #1
  add   a1, a1, a3
  add   a2, a2, a3
  bne   put_16_copy_unaligned
  ldmfd sp!, {v1-v6, sl, pc}

;-------------------------

; put_8 - copy an 8 byte wide block from src to dst.
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = src[i];
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; A word aligned src is copied one word at a time. Otherwise src is rounded
; down to a word boundary, three words are read per row and the 8 wanted
; bytes are rebuilt with shifts (ip = right shift, lr = 32 - ip).

  ALIGN 32
put_8
  stmfd sp!, {v1-v5,lr}
  ands  ip, a2, #3              ; ip = byte offset of src inside its word
  bne   put_8_unaligned

put_8_copy_aligned
  ldr   v1, [a2, #4]
  str   v1, [a1, #4]
  ldr   v1, [a2], a3            ; post-indexed: src += stride
  str   v1, [a1], a3            ; post-indexed: dst += stride
  subs  a4, a4, #1
  bne   put_8_copy_aligned
  ldmfd sp!, {v1-v5,pc}

put_8_unaligned
  bic   a2, a2, ip              ; round src down to a word boundary
  mov   ip, ip, lsl #3          ; byte offset -> bit shift (8, 16 or 24)
  rsb   lr, ip, #32             ; complementary shift

put_8_copy_unaligned
  ldmia a2, {v1-v3}
  mov   v1, v1, lsr ip
  orr   v1, v1, v2, lsl lr
  mov   v2, v2, lsr ip
  orr   v2, v2, v3, lsl lr
  stmia a1, {v1, v2}            ; bytes 0..3 and 4..7 of the row

  subs  a4, a4, #1
  add   a1, a1, a3              ; add without S keeps the flags from subs
  add   a2, a2, a3
  bne   put_8_copy_unaligned
  ldmfd sp!, {v1-v5,pc}

;-------------------------
; avg_16 - average a 16 byte wide block of src into dst (rounding up).
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (dst[i] + src[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment (bytes, then
; bits), sl = 32 - fp, v1-v4 = source words, v5/v6 = scratch.
; Four bytes are averaged per word with the mean trick described at the top
; of this file: (x | y) & mask1 is the rounding bit of each byte and
; (x >> 1) & mask2 halves each byte without pulling in a bit from the
; byte above it.

  ALIGN 32
avg_16
  stmfd sp!, {v1-v6, sl, fp, lr}
  ands  fp, a2, #3              ; Z is set when src is word aligned
  ldr   lr, mask1               ; the loads leave the flags from ands intact
  ldr   ip, mask2
  bne   avg_16_slow

avg_16_fast_loop
  ldmia a2, {v1, v2, v3, v4}

  ; bytes 12..15
  ldr   v6, [a1, #12]
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6

  ; bytes 8..11
  ldr   v6, [a1, #8]
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6

  ; bytes 4..7
  ldr   v6, [a1, #4]
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6

  ; bytes 0..3
  ldr   v6, [a1, #0]
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6

  stmia a1, {v1, v2, v3, v4}

  add   a1, a1, a3
  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_16_fast_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

avg_16_slow
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

avg_16_loop
  ; read five words and rebuild the 16 unaligned source bytes in v1-v4
  ldmia a2, {v1, v2, v3, v4, v5}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl

  ; bytes 12..15
  ldr   v6, [a1, #12]
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6

  ; bytes 8..11
  ldr   v6, [a1, #8]
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6

  ; bytes 4..7
  ldr   v6, [a1, #4]
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6

  ; bytes 0..3
  ldr   v6, [a1, #0]
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6

  stmia a1, {v1, v2, v3, v4}

  add   a1, a1, a3
  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_16_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; avg_8 - average an 8 byte wide block of src into dst (rounding up).
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (dst[i] + src[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment (bytes, then
; bits), sl = 32 - fp, v1/v2 = source words, v3/v4 = scratch.
; Each word is averaged with the packed mean described at the top of this
; file: ((x | y) & mask1) + ((x >> 1) & mask2) + ((y >> 1) & mask2).

  ALIGN 32
avg_8
  stmfd sp!, {v1-v5, sl, fp, lr}
  ands  fp, a2, #3              ; Z is set when src is word aligned
  ldr   lr, mask1               ; the loads leave the flags from ands intact
  ldr   ip, mask2
  bne   avg_8_slow

avg_8_fast_loop
  ldmia a2, {v1, v2}

  ; bytes 4..7
  ldr   v4, [a1, #4]
  orr   v3, v2, v4
  and   v3, v3, lr
  and   v2, ip, v2, lsr #1
  and   v4, ip, v4, lsr #1
  add   v2, v2, v3
  add   v2, v2, v4
  str   v2, [a1, #4]

  ; bytes 0..3
  ldr   v4, [a1, #0]
  orr   v3, v1, v4
  and   v3, v3, lr
  and   v1, ip, v1, lsr #1
  and   v4, ip, v4, lsr #1
  add   v1, v1, v3
  add   v1, v1, v4
  str   v1, [a1], a3            ; post-indexed: dst += stride

  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_8_fast_loop
  ldmfd sp!, {v1-v5, sl, fp, pc}

avg_8_slow
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

avg_8_loop
  ; read three words and rebuild the 8 unaligned source bytes in v1/v2
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl

  ; bytes 4..7
  ldr   v4, [a1, #4]
  orr   v3, v2, v4
  and   v3, v3, lr
  and   v2, ip, v2, lsr #1
  and   v4, ip, v4, lsr #1
  add   v2, v2, v3
  add   v2, v2, v4
  str   v2, [a1, #4]

  ; bytes 0..3
  ldr   v4, [a1, #0]
  orr   v3, v1, v4
  and   v3, v3, lr
  and   v1, ip, v1, lsr #1
  and   v4, ip, v4, lsr #1
  add   v1, v1, v3
  add   v1, v1, v4
  str   v1, [a1], a3            ; post-indexed: dst += stride

  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_8_loop
  ldmfd sp!, {v1-v5, sl, fp, pc}

;-------------------------

; put_x16 - horizontal interpolation of a 16 byte wide block.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; v1-v4 = src[0..15], v5/v6 = scratch, sl = 32 - fp while realigning and
; afterwards the byte carried from the word above.
; There is no separate aligned path: src is always rounded down to a word
; boundary and five words are read per row.

  ALIGN 32
put_x16
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift

put_x16_loop
  ldmia a2, {v1, v2, v3, v4, v5}
  rsb   sl, fp, #32             ; recomputed every row because sl is reused below
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl
  mov   v5, v5, lsr fp          ; low byte of v5 = src[16]

  mov   sl, v5, lsl #24         ; src[16] moved to the top byte

  ; dst[12..15]: v6 = src[13..16], averaged with v4 = src[12..15]
  orr   v6, sl, v4, lsr #8
  mov   sl, v4, lsl #24         ; src[12] carried down for the next word
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6

  ; dst[8..11]
  orr   v6, sl, v3, lsr #8
  mov   sl, v3, lsl #24
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6

  ; dst[4..7]
  orr   v6, sl, v2, lsr #8
  mov   sl, v2, lsl #24
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6

  ; dst[0..3]
  orr   v6, sl, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6

  stmia a1, {v1, v2, v3, v4}

  add   a1, a1, a3
  add   a2, a2, a3
  subs  a4, a4, #1
  bne   put_x16_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; put_x8 - horizontal interpolation of an 8 byte wide block.
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+1] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; sl = 32 - fp, v1/v2 = src[0..7], v3 holds src[8] in its low byte,
; v4 = byte carried from the word above, v5/v6 = scratch.
; src is always rounded down to a word boundary and three words are read
; per row.

  ALIGN 32
put_x8
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

put_x8_loop
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp          ; low byte of v3 = src[8]

  mov   v4, v3, lsl #24         ; src[8] moved to the top byte

  ; dst[4..7]: v6 = src[5..8], averaged with v2 = src[4..7]
  orr   v6, v4, v2, lsr #8
  mov   v4, v2, lsl #24         ; src[4] carried down for the low word
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6
  str   v2, [a1, #4]

  ; dst[0..3]: v6 = src[1..4], averaged with v1 = src[0..3]
  orr   v6, v4, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6
  str   v1, [a1], a3            ; post-indexed: dst += stride

  add   a2, a2, a3
  subs  a4, a4, #1
  bne   put_x8_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; avg_x16 - horizontal interpolation of a 16 byte wide block, averaged
; into dst.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = ((src[i] + src[i+1] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; v1-v4 = src[0..15], v5/v6 = scratch, sl = 32 - fp while realigning and
; afterwards the byte carried from the word above.
; Each word goes through two packed averages: first with the source
; shifted by one byte, then with the existing dst word.

  ALIGN 32
avg_x16
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift

avg_x16_loop
  ldmia a2, {v1, v2, v3, v4, v5}
  rsb   sl, fp, #32             ; recomputed every row because sl is reused below
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl
  mov   v5, v5, lsr fp          ; low byte of v5 = src[16]

  mov   sl, v5, lsl #24         ; src[16] moved to the top byte

  ; dst[12..15]: horizontal average ...
  orr   v6, sl, v4, lsr #8
  mov   sl, v4, lsl #24
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6
  ; ... then average with dst
  ldr   v6, [a1, #12]
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6

  ; dst[8..11]
  orr   v6, sl, v3, lsr #8
  mov   sl, v3, lsl #24
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6
  ldr   v6, [a1, #8]
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6

  ; dst[4..7]
  orr   v6, sl, v2, lsr #8
  mov   sl, v2, lsl #24
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6
  ldr   v6, [a1, #4]
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6

  ; dst[0..3]
  orr   v6, sl, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6
  ldr   v6, [a1, #0]
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6

  stmia a1, {v1, v2, v3, v4}

  add   a1, a1, a3
  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_x16_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; avg_x8 - horizontal interpolation of an 8 byte wide block, averaged
; into dst.
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+1] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; sl = 32 - fp, v1/v2 = src[0..7], v3 holds src[8] in its low byte,
; v4 = byte carried from the word above, v5/v6 = scratch.

  ALIGN 32
avg_x8
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

avg_x8_loop
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp          ; low byte of v3 = src[8]

  mov   v4, v3, lsl #24         ; src[8] moved to the top byte

  ; dst[4..7]: horizontal average ...
  orr   v6, v4, v2, lsr #8
  mov   v4, v2, lsl #24         ; src[4] carried down for the low word
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6
  ; ... then average with dst
  ldr   v6, [a1, #4]
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v5
  add   v2, v2, v6
  str   v2, [a1, #4]

  ; dst[0..3]
  orr   v6, v4, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6
  ldr   v6, [a1, #0]
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v5
  add   v1, v1, v6
  str   v1, [a1], a3            ; post-indexed: dst += stride

  add   a2, a2, a3
  subs  a4, a4, #1
  bne   avg_x8_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; These constants are used when processing 4 bytes at a time.
; They are loaded with pc-relative ldr by the functions in this file.
; mask1 keeps bit 0 of every byte: applied to (x | y) it gives the
; per-byte rounding term of the packed mean.
; mask2 clears bit 7 of every byte: applied after "lsr #1" it removes the
; bit shifted in from the byte above.
mask1  DCD 0x01010101
mask2  DCD 0x7f7f7f7f

;-------------------------

; put_y16 - vertical interpolation of a 16 byte wide block.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+stride] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, v1-v4 = the previous source row,
; v6 = one word of the next row, v5 = scratch / result word,
; fp = src misalignment (bytes, then bits), sl = scratch in the slow path.
; Each source row is read once: after a word has been averaged, the new
; row's word replaces the old one in v1-v4 for the next iteration.

  ALIGN 32
put_y16
  stmfd sp!, {v1-v6, sl, fp, lr}
  ands  fp, a2, #3              ; Z is set when src is word aligned
  ldr   lr, mask1               ; the loads leave the flags from ands intact
  ldr   ip, mask2
  bne   put_y16_slow

  ldmia a2, {v1, v2, v3, v4}    ; first source row

put_y16_fast_loop
  add   a2, a2, a3              ; a2 -> next source row

  ; dst[12..15]
  ldr   v6, [a2, #12]
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  add   v5, v5, v4
  and   v4, ip, v6, lsr #1
  add   v5, v5, v4
  mov   v4, v6                  ; keep the new row for the next iteration
  str   v5, [a1, #12]

  ; dst[8..11]
  ldr   v6, [a2, #8]
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  add   v5, v5, v3
  and   v3, ip, v6, lsr #1
  add   v5, v5, v3
  mov   v3, v6
  str   v5, [a1, #8]

  ; dst[4..7]
  ldr   v6, [a2, #4]
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v6, lsr #1
  add   v5, v5, v2
  mov   v2, v6
  str   v5, [a1, #4]

  ; dst[0..3]
  ldr   v6, [a2, #0]
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v6, lsr #1
  add   v5, v5, v1
  mov   v1, v6
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   put_y16_fast_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

put_y16_slow
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift

  ; first source row: five words realigned into v1-v4
  ldmia a2, {v1, v2, v3, v4, v5}
  rsb   sl, fp, #32
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl

put_y16_loop
  add   a2, a2, a3              ; a2 -> next source row (word aligned base)

  ldr   sl, [a2, #16]           ; fifth word, supplies the top of bytes 12..15

  ; dst[12..15]: v6 = next row bytes 12..15, rebuilt from two words.
  ; 32 - fp is recomputed in v5 each time because no register is free.
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #12]
  orr   v6, v6, sl, lsr fp
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  add   v5, v5, v4
  and   v4, ip, v6, lsr #1
  add   v5, v5, v4
  mov   v4, v6                  ; keep the new row for the next iteration
  str   v5, [a1, #12]

  ; dst[8..11]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #8]
  orr   v6, v6, sl, lsr fp
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  add   v5, v5, v3
  and   v3, ip, v6, lsr #1
  add   v5, v5, v3
  mov   v3, v6
  str   v5, [a1, #8]

  ; dst[4..7]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #4]
  orr   v6, v6, sl, lsr fp
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v6, lsr #1
  add   v5, v5, v2
  mov   v2, v6
  str   v5, [a1, #4]

  ; dst[0..3]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #0]
  orr   v6, v6, sl, lsr fp
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v6, lsr #1
  add   v5, v5, v1
  mov   v1, v6
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   put_y16_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; put_y8 - vertical interpolation of an 8 byte wide block.
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+stride] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, v1/v2 = the previous source row,
; v3/v4 = the next source row, v5 = scratch / result word,
; fp = src misalignment (bytes, then bits), sl = 32 - fp.
; Each source row is read once: after a word has been averaged, the new
; row's word replaces the old one in v1/v2 for the next iteration.
;
; The aligned path reads exactly the two words (8 bytes) it uses per row.
; Only the unaligned path needs a third word to rebuild the row.

  ALIGN 32
put_y8
  stmfd sp!, {v1-v5, sl, fp, lr}
  ands  fp, a2, #3              ; Z is set when src is word aligned
  ldr   lr, mask1               ; the loads leave the flags from ands intact
  ldr   ip, mask2
  bne   put_y8_slow

  ldmia a2, {v1, v2}            ; first source row

put_y8_fast_loop
  add   a2, a2, a3              ; a2 -> next source row
  ldmia a2, {v3, v4}

  ; dst[4..7]
  orr   v5, v2, v4
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v4, lsr #1
  add   v5, v5, v2
  mov   v2, v4                  ; keep the new row for the next iteration
  str   v5, [a1, #4]

  ; dst[0..3]
  orr   v5, v1, v3
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v3, lsr #1
  add   v5, v5, v1
  mov   v1, v3
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   put_y8_fast_loop
  ldmfd sp!, {v1-v5, sl, fp, pc}

put_y8_slow
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

  ; first source row: three words realigned into v1/v2
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl

put_y8_loop
  add   a2, a2, a3              ; a2 -> next source row (word aligned base)
  ldmia a2, {v3, v4, v5}
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl

  ; dst[4..7]
  orr   v5, v2, v4
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v4, lsr #1
  add   v5, v5, v2
  mov   v2, v4                  ; keep the new row for the next iteration
  str   v5, [a1, #4]

  ; dst[0..3]
  orr   v5, v1, v3
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v3, lsr #1
  add   v5, v5, v1
  mov   v1, v3
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   put_y8_loop
  ldmfd sp!, {v1-v5, sl, fp, pc}

;-------------------------

; avg_y16 - vertical interpolation of a 16 byte wide block, averaged
; into dst.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = ((src[i] + src[i+stride] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; v1-v4 = the previous source row, v6 = next-row word / dst word,
; v5 = scratch / result, sl = scratch, a4 = scratch inside the loop.
; To free a4, stride and count are packed into a3 as
; (stride << 16) + count. This only works while both values fit in
; 16 bits.
; There is no separate aligned path: src is always rounded down to a word
; boundary and five words are read per row.

  ALIGN 32
avg_y16
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift

  ; regroup a3 and a4 to get an extra register
  add   a3, a4, a3, lsl #16     ; a3 = (stride << 16) + count

  ; first source row: five words realigned into v1-v4
  ldmia a2, {v1, v2, v3, v4, v5}
  rsb   sl, fp, #32
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl

avg_y16_loop
  add   a2, a2, a3, lsr #16     ; src += stride

  ldr   sl, [a2, #16]           ; fifth word, supplies the top of bytes 12..15

  ; dst[12..15]: rebuild the next row's word in v6, average it with the
  ; previous row (v4), then average that result with dst.
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #12]
  orr   v6, v6, sl, lsr fp
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  add   v5, v5, v4
  and   v4, ip, v6, lsr #1
  add   v5, v5, v4
  mov   v4, v6                  ; keep the new row for the next iteration
  ldr   v6, [a1, #12]
  orr   a4, v5, v6
  and   a4, a4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, a4
  str   v5, [a1, #12]

  ; dst[8..11]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #8]
  orr   v6, v6, sl, lsr fp
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  add   v5, v5, v3
  and   v3, ip, v6, lsr #1
  add   v5, v5, v3
  mov   v3, v6
  ldr   v6, [a1, #8]
  orr   a4, v5, v6
  and   a4, a4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, a4
  str   v5, [a1, #8]

  ; dst[4..7]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #4]
  orr   v6, v6, sl, lsr fp
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v6, lsr #1
  add   v5, v5, v2
  mov   v2, v6
  ldr   v6, [a1, #4]
  orr   a4, v5, v6
  and   a4, a4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, a4
  str   v5, [a1, #4]

  ; dst[0..3]
  rsb   v5, fp, #32
  mov   v6, sl, lsl v5
  ldr   sl, [a2, #0]
  orr   v6, v6, sl, lsr fp
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v6, lsr #1
  add   v5, v5, v1
  mov   v1, v6
  ldr   v6, [a1, #0]
  orr   a4, v5, v6
  and   a4, a4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, a4
  str   v5, [a1, #0]
  add   a1, a1, a3, lsr #16     ; dst += stride

  sub   a3, a3, #1              ; decrement the count held in the low half
  movs  a4, a3, lsl #16         ; Z is set when the count has reached zero
  bne   avg_y16_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; avg_y8 - vertical interpolation of an 8 byte wide block, averaged
; into dst.
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+stride] + 1) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; sl = 32 - fp, v1/v2 = the previous source row, v3/v4 = the next source
; row (reused as scratch once consumed), v5 = result, v6 = dst word.
; There is no separate aligned path: src is always rounded down to a word
; boundary and three words are read per row.

  ALIGN 32
avg_y8
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

  ; first source row: three words realigned into v1/v2
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl

avg_y8_loop
  add   a2, a2, a3              ; a2 -> next source row (word aligned base)
  ldmia a2, {v3, v4, v5}
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl

  ; dst[4..7]: average the two source rows ...
  orr   v5, v2, v4
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  and   v2, ip, v4, lsr #1
  add   v5, v5, v2
  mov   v2, v4                  ; keep the new row; v4 is now free as scratch
  ; ... then average with dst
  ldr   v6, [a1, #4]
  orr   v4, v5, v6
  and   v4, v4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, v4
  str   v5, [a1, #4]

  ; dst[0..3]
  orr   v5, v1, v3
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  and   v1, ip, v3, lsr #1
  add   v5, v5, v1
  mov   v1, v3                  ; keep the new row; v3 is now free as scratch
  ldr   v6, [a1, #0]
  orr   v3, v5, v6
  and   v3, v3, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v5, v5, v6
  add   v5, v5, v3
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   avg_y8_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; put_xy16 - horizontal and vertical interpolation of a 16 byte wide block.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 2) / 4;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; This version works one byte at a time and gives the exact result.
; Register use: v1 = src + stride (the row below), v2/v3 = alternate
; columns of the upper row, v4/ip = alternate columns of the lower row,
; lr = pixel sum, v5 = output word being assembled.
; Every pixel needs only two new byte loads because the previous column
; is still held in registers.

  ALIGN 32
put_xy16
  stmfd sp!, {v1-v5, lr}
  add   v1, a2, a3              ; v1 -> row below src
put_xy16_loop
  ; dst[0]
  ldrb  v2, [a2, #0]
  ldrb  v3, [a2, #1]
  ldrb  v4, [v1, #0]
  ldrb  ip, [v1, #1]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   v5, lr, lsr #2

  ; dst[1]
  ldrb  v2, [a2, #2]
  ldrb  v4, [v1, #2]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #8

  ; dst[2]
  ldrb  v3, [a2, #3]
  ldrb  ip, [v1, #3]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #16

  ; dst[3]
  ldrb  v2, [a2, #4]
  ldrb  v4, [v1, #4]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #24

  str   v5, [a1, #0]

  ; dst[4]
  ldrb  v3, [a2, #5]
  ldrb  ip, [v1, #5]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   v5, lr, lsr #2

  ; dst[5]
  ldrb  v2, [a2, #6]
  ldrb  v4, [v1, #6]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #8

  ; dst[6]
  ldrb  v3, [a2, #7]
  ldrb  ip, [v1, #7]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #16

  ; dst[7]
  ldrb  v2, [a2, #8]
  ldrb  v4, [v1, #8]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #24

  str   v5, [a1, #4]

  ; dst[8]
  ldrb  v3, [a2, #9]
  ldrb  ip, [v1, #9]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   v5, lr, lsr #2

  ; dst[9]
  ldrb  v2, [a2, #10]
  ldrb  v4, [v1, #10]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #8

  ; dst[10]
  ldrb  v3, [a2, #11]
  ldrb  ip, [v1, #11]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #16

  ; dst[11]
  ldrb  v2, [a2, #12]
  ldrb  v4, [v1, #12]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #24

  str   v5, [a1, #8]

  ; dst[12]
  ldrb  v3, [a2, #13]
  ldrb  ip, [v1, #13]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   v5, lr, lsr #2

  ; dst[13]
  ldrb  v2, [a2, #14]
  ldrb  v4, [v1, #14]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #8

  ; dst[14]
  ldrb  v3, [a2, #15]
  ldrb  ip, [v1, #15]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #16

  ; dst[15]
  ldrb  v2, [a2, #16]
  ldrb  v4, [v1, #16]
  add   lr, v2, v3
  add   lr, lr, v4
  add   lr, lr, ip
  add   lr, lr, #2
  mov   lr, lr, lsr #2
  orr   v5, v5, lr, lsl #24

  str   v5, [a1, #12]

  add   a1, a1, a3
  add   a2, a2, a3
  add   v1, v1, a3
  subs  a4, a4, #1
  bne   put_xy16_loop
  ldmfd sp!, {v1-v5, pc}

;-------------------------

; put_xy8 - horizontal and vertical interpolation of an 8 byte wide block
; (approximate: built from two levels of rounded-up means).
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 2) / 4;
; ; nearly the same is:
; ;    dst[i] = ((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; The code below implements the "nearly the same" form, four bytes per
; word, using the packed mean described at the top of this file.
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; v1/v2 = horizontal average of the previous row, v3/v4 = next row and
; then its horizontal average, v5/v6 = scratch, sl = 32 - fp while
; realigning and afterwards the byte carried from the word above.

  ALIGN 32
put_xy8
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

  ; keep average of src[i] + src[i+1]
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp          ; low byte of v3 = src[8]

  mov   v4, v3, lsl #24         ; src[8] moved to the top byte

  ; v2 = horizontal average of bytes 4..7 of the first row
  orr   v6, v4, v2, lsr #8
  mov   v4, v2, lsl #24         ; src[4] carried down for the low word
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v6
  add   v2, v2, v5

  ; v1 = horizontal average of bytes 0..3 of the first row
  orr   v6, v4, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v6
  add   v1, v1, v5

put_xy8_loop
  add   a2, a2, a3              ; a2 -> next source row (word aligned base)

  ldmia a2, {v3, v4, v5}
  rsb   sl, fp, #32             ; recomputed every row because sl is reused below
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl
  mov   v5, v5, lsr fp          ; low byte of v5 = byte 8 of this row

  mov   sl, v5, lsl #24

  ; dst[4..7]: horizontal average of the new row into v4 ...
  orr   v6, sl, v4, lsr #8
  mov   sl, v4, lsl #24
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v6
  add   v4, v4, v5
  ; ... then average it with the previous row's horizontal average (v2)
  orr   v5, v4, v2
  and   v5, v5, lr
  and   v6, ip, v4, lsr #1
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  add   v5, v5, v6
  mov   v2, v4                  ; keep this row's average for the next iteration
  str   v5, [a1, #4]

  ; dst[0..3]
  orr   v6, sl, v3, lsr #8
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v6
  add   v3, v3, v5
  orr   v5, v3, v1
  and   v5, v5, lr
  and   v6, ip, v3, lsr #1
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  add   v5, v5, v6
  mov   v1, v3                  ; keep this row's average for the next iteration
  str   v5, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   put_xy8_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

; avg_xy16 - horizontal and vertical interpolation of a 16 byte wide
; block, averaged into dst.
;
; do
; {
;   for (i=0; i<16; i++)
;     dst[i] = (src[i] + src[i+1] +
;               src[i+stride] + src[i+stride+1] + 4*dst[i] + 4) / 8;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; This version works one byte at a time.
; Register use: v1 = src + stride (the row below), v2/v3 = alternate
; columns of the upper row, v4/v5 = alternate columns of the lower row,
; v6 = current dst word, ip = sum of the four source bytes,
; lr = 4 * dst byte + sum, fp = output word being assembled.
; The dst byte is isolated with a mask and shifted so that it lands as
; 4 * dst[i]: lsl #2 for byte 0, lsr #6 for byte 1, lsr #14 for byte 2
; and lsr #24 followed by lsl #2 for byte 3.

  ALIGN 32
avg_xy16
  stmfd sp!, {v1-v6, fp, lr}
  add   v1, a2, a3              ; v1 -> row below src
avg_xy16_loop
  ldr   v6, [a1, #0]            ; dst[0..3]

  ; dst[0]
  ldrb  v2, [a2, #0]
  ldrb  v3, [a2, #1]
  ldrb  v4, [v1, #0]
  ldrb  v5, [v1, #1]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   fp, lr, lsr #3

  ; dst[1]
  ldrb  v2, [a2, #2]
  ldrb  v4, [v1, #2]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<8
  add   lr, ip, lr, lsr #6
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #8

  ; dst[2]
  ldrb  v3, [a2, #3]
  ldrb  v5, [v1, #3]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<16
  add   lr, ip, lr, lsr #14
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #16

  ; dst[3]
  ldrb  v2, [a2, #4]
  ldrb  v4, [v1, #4]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  mov   lr, v6, lsr #24
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #24

  str   fp, [a1, #0]

  ldr   v6, [a1, #4]            ; dst[4..7]

  ; dst[4]
  ldrb  v3, [a2, #5]
  ldrb  v5, [v1, #5]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   fp, lr, lsr #3

  ; dst[5]
  ldrb  v2, [a2, #6]
  ldrb  v4, [v1, #6]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<8
  add   lr, ip, lr, lsr #6
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #8

  ; dst[6]
  ldrb  v3, [a2, #7]
  ldrb  v5, [v1, #7]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<16
  add   lr, ip, lr, lsr #14
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #16

  ; dst[7]
  ldrb  v2, [a2, #8]
  ldrb  v4, [v1, #8]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  mov   lr, v6, lsr #24
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #24

  str   fp, [a1, #4]

  ldr   v6, [a1, #8]            ; dst[8..11]

  ; dst[8]
  ldrb  v3, [a2, #9]
  ldrb  v5, [v1, #9]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   fp, lr, lsr #3

  ; dst[9]
  ldrb  v2, [a2, #10]
  ldrb  v4, [v1, #10]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<8
  add   lr, ip, lr, lsr #6
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #8

  ; dst[10]
  ldrb  v3, [a2, #11]
  ldrb  v5, [v1, #11]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<16
  add   lr, ip, lr, lsr #14
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #16

  ; dst[11]
  ldrb  v2, [a2, #12]
  ldrb  v4, [v1, #12]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  mov   lr, v6, lsr #24
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #24

  str   fp, [a1, #8]

  ldr   v6, [a1, #12]           ; dst[12..15]

  ; dst[12]
  ldrb  v3, [a2, #13]
  ldrb  v5, [v1, #13]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   fp, lr, lsr #3

  ; dst[13]
  ldrb  v2, [a2, #14]
  ldrb  v4, [v1, #14]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<8
  add   lr, ip, lr, lsr #6
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #8

  ; dst[14]
  ldrb  v3, [a2, #15]
  ldrb  v5, [v1, #15]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  and   lr, v6, #255<<16
  add   lr, ip, lr, lsr #14
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #16

  ; dst[15]
  ldrb  v2, [a2, #16]
  ldrb  v4, [v1, #16]
  add   ip, v2, v3
  add   ip, ip, v4
  add   ip, ip, v5
  mov   lr, v6, lsr #24
  add   lr, ip, lr, lsl #2
  add   lr, lr, #4
  mov   lr, lr, lsr #3
  orr   fp, fp, lr, lsl #24

  str   fp, [a1, #12]

  add   a1, a1, a3
  add   a2, a2, a3
  add   v1, v1, a3
  subs  a4, a4, #1
  bne   avg_xy16_loop
  ldmfd sp!, {v1-v6, fp, pc}

;-------------------------

; avg_xy8 - horizontal and vertical interpolation of an 8 byte wide
; block, averaged into dst (approximate: built from three levels of
; rounded-up means).
;
; do
; {
;   for (i=0; i<8; i++)
;     dst[i] = ((src[i] + src[i+1] +
;                src[i+stride] + src[i+stride+1] + 2) / 4 + dst[i] + 1) / 2;
; ; nearly the same is:
; ;   dst[i] = (((src[i] + src[i+1] + 1) / 2 +
; ;              (src[i+stride] + src[i+stride+1] + 1) / 2 + 1
; ;             ) / 2 + dst[i] + 1) / 2;
;   dst += stride;
;   src += stride;
; }
; while(--count);
;
; The code below implements the "nearly the same" form, four bytes per
; word, using the packed mean described at the top of this file.
;
; On entry: a1 = dst, a2 = src, a3 = stride, a4 = row count.
; Register use: lr = mask1, ip = mask2, fp = src misalignment in bits,
; v1/v2 = horizontal average of the previous row, v3/v4 = next row, then
; its horizontal average, then the result word, v5/v6 = scratch,
; sl = 32 - fp while realigning and afterwards the byte carried from the
; word above.

  ALIGN 32
avg_xy8
  stmfd sp!, {v1-v6, sl, fp, lr}
  and   fp, a2, #3
  ldr   lr, mask1
  ldr   ip, mask2
  bic   a2, a2, fp              ; round src down to a word boundary
  mov   fp, fp, lsl #3          ; byte offset -> bit shift
  rsb   sl, fp, #32             ; complementary shift

  ; keep average of src[i] + src[i+1]
  ldmia a2, {v1, v2, v3}
  mov   v1, v1, lsr fp
  orr   v1, v1, v2, lsl sl
  mov   v2, v2, lsr fp
  orr   v2, v2, v3, lsl sl
  mov   v3, v3, lsr fp          ; low byte of v3 = src[8]

  mov   v4, v3, lsl #24         ; src[8] moved to the top byte

  ; v2 = horizontal average of bytes 4..7 of the first row
  orr   v6, v4, v2, lsr #8
  mov   v4, v2, lsl #24         ; src[4] carried down for the low word
  orr   v5, v2, v6
  and   v5, v5, lr
  and   v2, ip, v2, lsr #1
  and   v6, ip, v6, lsr #1
  add   v2, v2, v6
  add   v2, v2, v5

  ; v1 = horizontal average of bytes 0..3 of the first row
  orr   v6, v4, v1, lsr #8
  orr   v5, v1, v6
  and   v5, v5, lr
  and   v1, ip, v1, lsr #1
  and   v6, ip, v6, lsr #1
  add   v1, v1, v6
  add   v1, v1, v5

avg_xy8_loop
  add   a2, a2, a3              ; a2 -> next source row (word aligned base)

  ldmia a2, {v3, v4, v5}
  rsb   sl, fp, #32             ; recomputed every row because sl is reused below
  mov   v3, v3, lsr fp
  orr   v3, v3, v4, lsl sl
  mov   v4, v4, lsr fp
  orr   v4, v4, v5, lsl sl
  mov   v5, v5, lsr fp          ; low byte of v5 = byte 8 of this row

  mov   sl, v5, lsl #24

  ; dst[4..7]: horizontal average of the new row into v4 ...
  orr   v6, sl, v4, lsr #8
  mov   sl, v4, lsl #24
  orr   v5, v4, v6
  and   v5, v5, lr
  and   v4, ip, v4, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v6
  add   v4, v4, v5
  ; ... averaged with the previous row's horizontal average (v2) ...
  orr   v5, v4, v2
  and   v5, v5, lr
  and   v6, ip, v4, lsr #1
  and   v2, ip, v2, lsr #1
  add   v5, v5, v2
  add   v5, v5, v6
  mov   v2, v4                  ; keep this row's average for the next iteration
  ; ... and finally averaged with dst
  ldr   v6, [a1, #4]
  orr   v4, v5, v6
  and   v4, v4, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v4, v4, v5
  add   v4, v4, v6
  str   v4, [a1, #4]

  ; dst[0..3]
  orr   v6, sl, v3, lsr #8
  orr   v5, v3, v6
  and   v5, v5, lr
  and   v3, ip, v3, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v6
  add   v3, v3, v5
  orr   v5, v3, v1
  and   v5, v5, lr
  and   v6, ip, v3, lsr #1
  and   v1, ip, v1, lsr #1
  add   v5, v5, v1
  add   v5, v5, v6
  mov   v1, v3                  ; keep this row's average for the next iteration
  ldr   v6, [a1, #0]
  orr   v3, v5, v6
  and   v3, v3, lr
  and   v5, ip, v5, lsr #1
  and   v6, ip, v6, lsr #1
  add   v3, v3, v5
  add   v3, v3, v6
  str   v3, [a1], a3            ; post-indexed: dst += stride

  subs  a4, a4, #1
  bne   avg_xy8_loop
  ldmfd sp!, {v1-v6, sl, fp, pc}

;-------------------------

  ALIGN 32 ; to ensure next file starts on correct boundary

  END

