;
; Scaled sprite plot
; ------------------
;
; Displays a rectangle of the supplied sprite data. Scales the display up
; or down independently in x and y axis. Magnification factors are fixed
; point fractional.
;   output_size = input_size / magnification
; Functions provided for colour depths of 8, 16, 32 bpp. Addresses need not
; be word aligned for 8bpp and 16bpp. The sprite must have the same colour
; depth as the display and the supplied sprite top left address should point
; to the sprite data, not the header. Does not work for negative mags.
;
; inputs, r0 = &blk
;   blk+0  -> r0 = x magnification
;   blk+4  -> r1 = y magnification
;   blk+8  -> r2 = dest rectangle x offset in pixels
;   blk+12 -> r3 = dest rectangle y offset
;   blk+16 -> r4 = dest rectangle width in pixels (x loop count)
;   blk+20 -> r5 = dest rectangle height (y loop count)
;   blk+24 -> r6 = dest rectangle top left address
;   blk+28 -> r7 = screen width in bytes
;   blk+32 -> r8 = sprite top left address
;   blk+36 -> r9 = sprite width in bytes
;
; dest offsets are relative to the display window.
;
; note. r5 is used for both x & y counts, (upper half for x)
;

frac  EQU 16 ; fractional bits in the fixed point x/y magnifications and offsets
split EQU 16 ; low bits of r5 that hold the y count; the x count sits above them

    AREA |C$$code|, CODE, READONLY

    EXPORT ka_scale_raw_8bpp

; colour depth = 8bpp
;
; Plots the scaled rectangle one byte per pixel.
;
; In:  r0 = &blk, ten words loaded into r0-r9 (layout at the top of the file)
; On exit r4-r9, fp and sp are reloaded from the frame built in the prologue.
;
; Register use inside the loops:
;   r0  = x magnification, added to Xoffset after every destination pixel
;   r1  = y magnification, added to Yoffset after every destination line
;   r2  = Xoffset, fixed point with 'frac' fractional bits
;   r3  = Yoffset, fixed point with 'frac' fractional bits
;   r4  = destination width in pixels
;   r5  = y count in the low 'split' bits; while a line is plotted the
;         width << split has been subtracted, so r5 is negative until the
;         line is complete
;   r6  = destination line start, r7 = destination line pitch in bytes
;   r8, r9 = pixel scratch in the x loop (sprite address / sprite width are
;         reloaded from the stack at the start of every line)
;   r12 = destination write pointer, r14 = source line address
;
; Assumes the y count is positive and fits in 'split' bits, and that the
; width is at least 1 - TODO confirm with callers.
ka_scale_raw_8bpp
    mov     ip, sp
    stmdb   sp!, {r0, r4-r9, fp, ip, lr, pc}
    sub     fp, ip, #4
    ldmia   r0, {r0-r9}             ; load registers from blk
    mul     r2, r0, r2              ; initial Xoffset
    mul     r3, r1, r3              ; initial Yoffset
    stmdb   sp!, {r2,r8,r9}         ; keep initial Xoffset, sprite address, sprite width
yloop8
    ldmfd   sp, {r2,r8,r9}          ; reload per-line values from stack (no writeback)
    sub     r5, r5, r4, asl #split  ; width count
    mov     r14, r3, asr #frac      ; (int)Yoffset
    mla     r14, r9, r14, r8        ; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 ; dest line start
    tst     r12, #3                 ; check dest alignment
    beq     xloop8
; store up to 3 single pixels
xloop8_1
    ldrb    r9, [r14, r2, asr #frac]; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              ; increment Xoffset
    strb    r9, [r12], #1           ; display pixel
    adds    r5, r5, #1<<split       ; decrement x count by 1 pixel
    bgt     xend8                   ; r5 positive again: line finished
    tst     r12, #3                 ; check dest alignment
    bne     xloop8_1
xloop8
    cmn     r5, #3<<split           ; check for a width less than 4
    bge     xloop8_1                ; fewer than 4 left: finish with single pixels
; store a word of 4 pixels
    ldrb    r9, [r14, r2, asr #frac]; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              ; increment Xoffset
    ldrb    r8, [r14, r2, asr #frac]; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              ; increment Xoffset
    orr     r9, r9, r8, lsl #8      ; second pixel into bits 8-15
    ldrb    r8, [r14, r2, asr #frac]; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              ; increment Xoffset
    orr     r9, r9, r8, lsl #16     ; third pixel into bits 16-23
    ldrb    r8, [r14, r2, asr #frac]; pixel = *(src_addr + (int)Xoffset)
    add     r2, r2, r0              ; increment Xoffset
    orr     r9, r9, r8, lsl #24     ; fourth pixel into bits 24-31
    str     r9, [r12], #4           ; display 4 pixels
    adds    r5, r5, #4<<split       ; decrement x count by 4 pixels
    ble     xloop8                  ; still negative: more pixels on this line
xend8
    add     r3, r3, r1              ; increment Yoffset
    add     r6, r6, r7              ; next dest line start
    subs    r5, r5, #1              ; decrement y count by 1 pixel
    bgt     yloop8
    ldmdb   fp, {r4-r9, fp, sp, pc} ; return; sp comes from the frame, dropping the 3 pushed words

;-----------------------------------------------------------

    ALIGN 32
    EXPORT ka_scale_raw_16bpp
; colour depth = 15/16bpp
;
; Plots the scaled rectangle two bytes per pixel. Source pixels are fetched
; with word loads and written with byte or word stores only.
;
; In:  r0 = &blk, ten words loaded into r0-r9 (layout at the top of the file)
; On exit r4-r9, fp and sp are reloaded from the frame built in the prologue.
;
; Register use inside the loops:
;   r0/r1 = x/y magnification, r2/r3 = Xoffset/Yoffset ('frac' fractional bits)
;   r4  = destination width in pixels
;   r5  = y count in the low 'split' bits, minus width << split while a line
;         is being plotted
;   r6  = destination line start, r7 = destination line pitch in bytes
;   r8, r9 = pixel scratch (sprite address / width reloaded from stack per line)
;   r12 = destination write pointer, r14 = source line address
;
; NOTE(review): the ldr instructions below read a word from an address that
; is only guaranteed halfword aligned and keep just the low 16 bits. This
; relies on the CPU delivering the addressed halfword in bits 0-15 for such
; a load - confirm for the target core.
ka_scale_raw_16bpp
    mov     ip, sp
    stmdb   sp!, {r0, r4-r9, fp, ip, lr, pc}
    sub     fp, ip, #4
    ldmia   r0, {r0-r9}             ; load registers from blk
    bic     r6, r6, #1              ; ensure pixel alignment
    mul     r2, r0, r2              ; initial Xoffset
    mul     r3, r1, r3              ; initial Yoffset
    stmfd   sp!, {r2,r8,r9}         ; keep initial Xoffset, sprite address, sprite width
yloop16
    ldmfd   sp, {r2,r8,r9}          ; reload per-line values from stack (no writeback)
    sub     r5, r5, r4, asl #split  ; width count
    mov     r14, r3, asr #frac      ; (int)Yoffset
    mla     r14, r9, r14, r8        ; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 ; dest line start
    tst     r12, #3                 ; check dest alignment
    beq     xloop16
; store a single pixel
xloop16_1
    mov     r9, r2, asr #frac       ; (int)Xoffset
    ldr     r9, [r14, r9, asl #1]   ; pixel = *(src_addr + (int)Xoffset(half words))
    strb    r9, [r12], #1           ; store low byte of pixel
    add     r2, r2, r0              ; increment Xoffset
    mov     r9, r9, lsr #8
    strb    r9, [r12], #1           ; store high byte of pixel
    adds    r5, r5, #1<<split       ; decrement x count by 1 pixel
    bgt     xend16                  ; r5 positive again: line finished
xloop16
    cmn     r5, #1<<split           ; check for a width less than 2
    bge     xloop16_1               ; only one pixel left: store it singly
; store a word of 2 pixels
    mov     r9, r2, asr #frac       ; (int)Xoffset
    ldr     r9, [r14, r9, asl #1]   ; pixel = *(src_addr + (int)Xoffset(half words))
    add     r2, r2, r0              ; increment Xoffset
    mov     r9, r9, lsl #16         ; clear the top 16 bits of the loaded word,
    mov     r9, r9, lsr #16         ;   leaving the first pixel in bits 0-15
    mov     r8, r2, asr #frac       ; (int)Xoffset
    ldr     r8, [r14, r8, asl #1]   ; pixel = *(src_addr + (int)Xoffset(half words))
    add     r2, r2, r0              ; increment Xoffset
    orr     r9, r9, r8, lsl #16     ; second pixel into bits 16-31
    str     r9, [r12], #4           ; display 2 pixels
    adds    r5, r5, #2<<split       ; decrement x count by 2 pixels
    ble     xloop16                 ; still negative: more pixels on this line
xend16
    add     r3, r3, r1              ; increment Yoffset
    add     r6, r6, r7              ; next dest line start
    subs    r5, r5, #1              ; decrement y count by 1 pixel
    bgt     yloop16
    add     sp, sp, #12             ; drop the 3 words pushed before yloop16
    ldmdb   fp, {r4-r9, fp, sp, pc} ; return through the saved frame

;-----------------------------------------------------------

    ALIGN 32
    EXPORT ka_scale_raw_16bpp_sh
; colour depth = 15/16bpp
;
; Variant of the 16bpp plotter that fetches source pixels with ldrh and
; writes single pixels with strh instead of word loads and byte stores.
;
; In:  r0 = &blk, ten words loaded into r0-r9 (layout at the top of the file)
; On exit r4-r9, fp and sp are reloaded from the frame built in the prologue.
;
; Register use inside the loops:
;   r0/r1 = x/y magnification, r2/r3 = Xoffset/Yoffset ('frac' fractional bits)
;   r4  = destination width in pixels
;   r5  = y count in the low 'split' bits, minus width << split while a line
;         is being plotted
;   r6  = destination line start, r7 = destination line pitch in bytes
;   r8, r9 = pixel scratch (sprite address / width reloaded from stack per line)
;   r12 = destination write pointer, r14 = source line address
ka_scale_raw_16bpp_sh
    mov     ip, sp
    stmdb   sp!, {r0, r4-r9, fp, ip, lr, pc}
    sub     fp, ip, #4
    ldmia   r0, {r0-r9}             ; load registers from blk
    bic     r6, r6, #1              ; ensure pixel alignment
    mul     r2, r0, r2              ; initial Xoffset
    mul     r3, r1, r3              ; initial Yoffset
    stmfd   sp!, {r2,r8,r9}         ; keep initial Xoffset, sprite address, sprite width
yloop16_sh
    ldmfd   sp, {r2,r8,r9}          ; reload per-line values from stack (no writeback)
    sub     r5, r5, r4, asl #split  ; width count
    mov     r14, r3, asr #frac      ; (int)Yoffset
    mla     r14, r9, r14, r8        ; src_addr += src_width * (int)Yoffset
    mov     r12, r6                 ; dest line start
    tst     r12, #3                 ; check dest alignment
    beq     xloop16_sh
; store a single pixel
xloop16_1_sh
    mov     r9, r2, asr #frac       ; (int)Xoffset
    mov     r9, r9, asl #1          ; pixel index -> byte offset
    ldrh    r9, [r14, r9]           ; pixel = *(src_addr + (int)Xoffset(half words))
    add     r2, r2, r0              ; increment Xoffset
    strh    r9, [r12], #2           ; display pixel
    adds    r5, r5, #1<<split       ; decrement x count by 1 pixel
    bgt     xend16_sh               ; r5 positive again: line finished
xloop16_sh
    cmn     r5, #1<<split           ; check for a width less than 2
    bge     xloop16_1_sh            ; only one pixel left: store it singly
; store a word of 2 pixels
    mov     r9, r2, asr #frac       ; (int)Xoffset
    mov     r9, r9, asl #1          ; pixel index -> byte offset
    ldrh    r9, [r14, r9]           ; pixel = *(src_addr + (int)Xoffset(half words))
    add     r2, r2, r0              ; increment Xoffset
    mov     r8, r2, asr #frac       ; (int)Xoffset
    mov     r8, r8, asl #1          ; pixel index -> byte offset
    ldrh    r8, [r14, r8]           ; pixel = *(src_addr + (int)Xoffset(half words))
    add     r2, r2, r0              ; increment Xoffset
    orr     r9, r9, r8, lsl #16     ; second pixel into bits 16-31
    str     r9, [r12], #4           ; display 2 pixels
    adds    r5, r5, #2<<split       ; decrement x count by 2 pixels
    ble     xloop16_sh              ; still negative: more pixels on this line
xend16_sh
    add     r3, r3, r1              ; increment Yoffset
    add     r6, r6, r7              ; next dest line start
    subs    r5, r5, #1              ; decrement y count by 1 pixel
    bgt     yloop16_sh
    add     sp, sp, #12             ; drop the 3 words pushed before yloop16_sh
    ldmdb   fp, {r4-r9, fp, sp, pc} ; return through the saved frame

;-----------------------------------------------------------

    ALIGN 32
    EXPORT ka_scale_raw_32bpp
; colour depth = 32bpp
;
; Plots the scaled rectangle one word per pixel. The x loop is unrolled four
; times; each line enters the unrolled body part way through so that the
; first pass handles (width mod 4) pixels and later passes handle 4 each.
;
; In:  r0 = &blk, ten words loaded into r0-r9 (layout at the top of the file)
; On exit r4-r9, fp and sp are reloaded from the frame built in the prologue.
;
; Register use inside the loops:
;   r0/r1 = x/y magnification, r2/r3 = Xoffset/Yoffset ('frac' fractional bits)
;   r4  = destination width in pixels
;   r5  = y count in the low 'split' bits, minus width << split while a line
;         is being plotted
;   r6  = destination line start, r7 = destination line pitch in bytes
;   r8, r9 = pixel scratch (sprite address / width reloaded from stack per line)
;   r12 = destination write pointer, r14 = source line address
;
; The entry point calculation uses (xloop32_one - xloop32) as the size of
; one pixel step, so all four steps in the unrolled body must stay the same
; length.
ka_scale_raw_32bpp
    mov     ip, sp
    stmdb   sp!, {r0, r4-r9, fp, ip, lr, pc}
    sub     fp, ip, #4
    ldmia   r0, {r0-r9}             ; load registers from blk
    bic     r6, r6, #3              ; ensure pixel alignment
    mul     r2, r0, r2              ; initial Xoffset
    mul     r3, r1, r3              ; initial Yoffset
    stmfd   sp!, {r2,r8,r9}         ; keep initial Xoffset, sprite address, sprite width
yloop32
    ldmfd   sp, {r2,r8,r9}          ; reload per-line values from stack (no writeback)
    sub     r5, r5, r4, asl #split  ; width count
    mov     r14, r3, asr #frac      ; (int)Yoffset
    mla     r14, r9, r14, r8        ; src_addr += src_width * Yoffset
    mov     r12, r6                 ; dest line start
    ands    r9, r4, #3              ; width mod 4
    rsbne   r9, r9, #4              ; pixel steps to skip on the first pass (0-3)
    mov     r8, #(xloop32_one - xloop32) ; size in bytes of one pixel step
    mul     r9, r8, r9              ; byte offset of the entry point
    adr     r8, xloop32
    add     r8, r8, r9
    mov     pc, r8                  ; jump into the unrolled loop
xloop32
; note, do 4 pixels in line if possible to save a few cycles.
    mov     r9, r2, asr #frac       ; (int)Xoffset
    ldr     r9, [r14, r9, asl #2]   ; pixel = *(src_addr + (int)Xoffset(words))
    add     r2, r2, r0              ; increment Xoffset
    str     r9, [r12], #4           ; display pixel
xloop32_one
    mov     r8, r2, asr #frac       ; (int)Xoffset
    ldr     r8, [r14, r8, asl #2]   ; pixel = *(src_addr + (int)Xoffset(words))
    add     r2, r2, r0              ; increment Xoffset
    str     r8, [r12], #4           ; display pixel
    mov     r9, r2, asr #frac       ; (int)Xoffset
    ldr     r9, [r14, r9, asl #2]   ; pixel = *(src_addr + (int)Xoffset(words))
    add     r2, r2, r0              ; increment Xoffset
    str     r9, [r12], #4           ; display pixel
    mov     r8, r2, asr #frac       ; (int)Xoffset
    ldr     r8, [r14, r8, asl #2]   ; pixel = *(src_addr + (int)Xoffset(words))
    add     r2, r2, r0              ; increment Xoffset
    str     r8, [r12], #4           ; display pixel
    adds    r5, r5, #4<<split       ; decrement x count by 4 pixels
    blt     xloop32                 ; still negative: more pixels on this line
xend32
    add     r3, r3, r1              ; increment Yoffset
    add     r6, r6, r7              ; next dest line start
    mov     r5,r5, lsl #split       ; get rid of any x count > 0
    mov     r5,r5, lsr #split
    subs    r5, r5, #1              ; decrement y count by 1 pixel
    bgt     yloop32
    add     sp, sp, #12             ; drop the 3 words pushed before yloop32
    ldmdb   fp, {r4-r9, fp, sp, pc} ; return through the saved frame

    ALIGN 32
;---------------
; merge_lines32_8_neon
;
; Blends two lines of 32-bit pixels into a destination line. Every byte of
; every pixel is computed as
;     dst = (line0 * (256 - sc) + line1 * sc + 128) >> 8
; using only the low 8 bits of sc and of (256 - sc) as weights. When sc is
; zero the blend is skipped and line0 is copied to pdst unchanged.
;
; The width is consumed as 1, 2, 4 and 8 pixel pieces according to its low
; four bits, followed by a loop that handles 16 pixels per pass.
;
; uint32_t* pdst          (r0) destination line, advanced as pixels are stored
; const uint32_t* line0   (r1) first source line
; const uint32_t* line1   (r2) second source line (not read when sc == 0)
; uint32_t sc             (r3) weight of line1
; int width               (first stack word) line length in pixels
;
; NOTE(review): q4-q7 (d8-d15) are written here without being saved. AAPCS
; treats d8-d15 as callee-saved - confirm that callers tolerate the clobber.
    EXPORT merge_lines32_8_neon
merge_lines32_8_neon
    stmfd      sp!, {lr}
    cmp        r3, #0
    beq        merge_lines32_8_neon_dup ; sc == 0: plain copy of line0

    vdup.8     d0, r3         ; d0 = 8 x sc
    rsb        r3, r3, #256
    vdup.8     d1, r3         ; d1 = 8 x (256 - sc)
    ldr        r3, [sp, #1*4] ; width is on stack

; 1 pixel if bit 0 of the width is set
merge_lines32_8_neon_1more
    tst        r3, #1
    beq        merge_lines32_8_neon_2more

    vld1.32    {d12[0]},[r1]!
    vld1.32    {d28[0]},[r2]!
    vmull.u8   q2, d12, d1    ; line0 * (256 - sc)
    vmlal.u8   q2, d28, d0    ;   + line1 * sc
    vrshrn.u16 d12, q2, #8    ; round and narrow back to bytes
    vst1.32    {d12[0]},[r0]!

; 2 pixels if bit 1 of the width is set
merge_lines32_8_neon_2more
    tst        r3, #2
    beq        merge_lines32_8_neon_4more

    vld1.32    {d12},[r1]!
    vld1.32    {d28},[r2]!
    vmull.u8   q2, d12, d1
    vmlal.u8   q2, d28, d0
    vrshrn.u16 d12, q2, #8
    vst1.32    {d12},[r0]!

; 4 pixels if bit 2 of the width is set
merge_lines32_8_neon_4more
    tst        r3, #4
    beq        merge_lines32_8_neon_8more

    vld1.32    {d12,d13},[r1]!
    vld1.32    {d28,d29},[r2]!
    vmull.u8   q2, d12, d1
    vmull.u8   q3, d13, d1
    vmlal.u8   q2, d28, d0
    vmlal.u8   q3, d29, d0
    vrshrn.u16 d12, q2, #8
    vrshrn.u16 d13, q3, #8
    vst1.32    {d12,d13},[r0]!

; 8 pixels if bit 3 of the width is set
merge_lines32_8_neon_8more
    tst        r3, #8
    beq        merge_lines32_8_neon_16more

    vld1.32    {d12,d13,d14,d15},[r1]!
    vld1.32    {d28,d29,d30,d31},[r2]!
    vmull.u8   q2, d12, d1
    vmull.u8   q3, d13, d1
    vmull.u8   q4, d14, d1
    vmull.u8   q5, d15, d1
    vmlal.u8   q2, d28, d0
    vmlal.u8   q3, d29, d0
    vmlal.u8   q4, d30, d0
    vmlal.u8   q5, d31, d0
    vrshrn.u16 d12, q2, #8
    vrshrn.u16 d13, q3, #8
    vrshrn.u16 d14, q4, #8
    vrshrn.u16 d15, q5, #8
    vst1.32    {d12,d13,d14,d15},[r0]!


; remaining width in whole groups of 16 pixels
merge_lines32_8_neon_16more
    bic        r3, r3, #15    ; the low four bits have been dealt with above
    cmp        r3, #0
    beq        merge_lines32_8_neon_end

merge_lines32_8_neon_loop16
    vld1.32    {d16,d17,d18,d19},[r1]!
    vld1.32    {d24,d25,d26,d27},[r2]!
    vld1.32    {d20,d21,d22,d23},[r1]!
    vld1.32    {d28,d29,d30,d31},[r2]!
    vmull.u8   q1, d16, d1
    vmull.u8   q2, d17, d1
    vmull.u8   q3, d18, d1
    vmull.u8   q4, d19, d1
    vmull.u8   q5, d20, d1
    vmull.u8   q6, d21, d1
    vmull.u8   q7, d22, d1
    vmull.u8   q8, d23, d1    ; q8 overlays d16/d17, which were consumed above
    vmlal.u8   q1, d24, d0
    vmlal.u8   q2, d25, d0
    vmlal.u8   q3, d26, d0
    vmlal.u8   q4, d27, d0
    vmlal.u8   q5, d28, d0
    vmlal.u8   q6, d29, d0
    vmlal.u8   q7, d30, d0
    vmlal.u8   q8, d31, d0
    vrshrn.u16 d24, q1, #8
    vrshrn.u16 d25, q2, #8
    vrshrn.u16 d26, q3, #8
    vrshrn.u16 d27, q4, #8
    vrshrn.u16 d28, q5, #8
    vrshrn.u16 d29, q6, #8
    vrshrn.u16 d30, q7, #8
    vrshrn.u16 d31, q8, #8
    vst1.32    {d24,d25,d26,d27},[r0]!
    vst1.32    {d28,d29,d30,d31},[r0]!
    subs       r3, r3, #16
    bgt        merge_lines32_8_neon_loop16

merge_lines32_8_neon_end
    ldmfd      sp!, {pc}


; sc == 0: copy line0 to pdst using the same 1/2/4/8/16 pixel split
merge_lines32_8_neon_dup
    ldr        r3, [sp, #1*4] ; width is on stack

merge_lines32_8_neon_dup_1more
    tst        r3, #1
    beq        merge_lines32_8_neon_dup_2more

    vld1.32    {d12[0]},[r1]!
    vst1.32    {d12[0]},[r0]!

merge_lines32_8_neon_dup_2more
    tst        r3, #2
    beq        merge_lines32_8_neon_dup_4more

    vld1.32    {d12},[r1]!
    vst1.32    {d12},[r0]!

merge_lines32_8_neon_dup_4more
    tst        r3, #4
    beq        merge_lines32_8_neon_dup_8more

    vld1.32    {d12,d13},[r1]!
    vst1.32    {d12,d13},[r0]!

merge_lines32_8_neon_dup_8more
    tst        r3, #8
    beq        merge_lines32_8_neon_dup_16more
    vld1.32    {d12,d13,d14,d15},[r1]!
    vst1.32    {d12,d13,d14,d15},[r0]!

merge_lines32_8_neon_dup_16more
    bic        r3, r3, #15    ; the low four bits have been dealt with above
    cmp        r3, #0
    beq        merge_lines32_8_neon_dup_end

merge_lines32_8_neon_dup_loop16
    vld1.32    {d12,d13,d14,d15},[r1]!
    vst1.32    {d12,d13,d14,d15},[r0]!
    vld1.32    {d12,d13,d14,d15},[r1]!
    vst1.32    {d12,d13,d14,d15},[r0]!
    subs       r3, r3, #16
    bgt        merge_lines32_8_neon_dup_loop16

merge_lines32_8_neon_dup_end
    ldmfd      sp!, {pc}

    ALIGN 32
;---------------
; scaleup_line32_8_neon
;
; Stretches one line of 32-bit pixels horizontally. Every destination pixel
; is a linear blend of two neighbouring source pixels (val0 and val1),
; applied to each of the four bytes of the pixel independently:
;     dst = (val0 * (128 - f) + val1 * f + 64) >> 7
; where f is the top 7 bits of the 16-bit fractional source position.
;
; The width is consumed as 1, 2 and 4 pixel pieces according to its low
; three bits, followed by a loop that handles 8 pixels per pass.
;
; uint32_t* pdst          (r0) destination line, advanced as pixels are stored
; uint32_t  width         (r1) number of destination pixels
; const uint32_t* psrc    (r2) source line
; int       x0            (r3) start position in the source, 16.16 fixed point
; int       x_mag < 1 << 16  (first stack word) 16.16 step per destination pixel
;
; Register use:
;   r3 = fractional source position (kept below 1 << 16)
;   r4 = x_mag, r5 = 1 << 16
;   r6 = val0, r7 = val1, r2 = address of the next source pixel to fetch
;   d31 = 0x80 in every byte
;
; Because x_mag < 1 << 16 the position crosses at most one source pixel per
; step, so a single conditional reload after each step is sufficient.
;
; NOTE(review): q4-q7 (d8-d15) are written here without being saved. AAPCS
; treats d8-d15 as callee-saved - confirm that callers tolerate the clobber.
    EXPORT scaleup_line32_8_neon
scaleup_line32_8_neon
    stmfd      sp!, {r4-r7, lr}
    ldr        r4, [sp, #5*4] ; x_mag is on stack

    mov        r5, r3, lsr #16 ; psrc += x0 >> 16
    add        r2, r2, r5, lsl #2
    sub        r3, r3, r5, lsl #16 ; x0 &= 0xffff

    mov        r5, #0x80
    vdup.8     d31, r5     ; 8 x 128, used to form (128 - frac)

    mov        r5, #0x10000
    ldr        r6, [r2]    ; val0 = first source pixel
    mov        r7, r6      ; val1 = val0
    ; r2 is not advanced by the load above, so the first reload below fetches
    ; this same pixel again.
    ; NOTE(review): confirm this one pixel lag is intended.

; 1 pixel if bit 0 of the width is set
scaleup_line32_8_neon_1more
    tst        r1, #1
    beq        scaleup_line32_8_neon_2more

    vdup.16    d0, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d16, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q4, q0, #9  ; 4 x frac, keep top 7-bits (d1 half is unused here)
    vtrn.32    d16, d24    ; lane 0: val0 in d16, val1 in d24 (lane 1 is unused)
    vmovn.u16  d0, q4      ; 4 x frac, narrow to 8-bit

    vsub.u8    d4, d31, d0 ; 4 x (128 - 7-bit frac)

    ; ((128 - frac) * val0 + frac * val1) >> 7
    ; on each r, g, b, a at the same time
    vmull.u8   q4, d16, d4 ; weight val0 by (128 - frac); d1 is never set up in this block
    vmlal.u8   q4, d24, d0
    vrshrn.u16 d24, q4, #7
    vst1.32    {d24[0]},[r0]!

; 2 pixels if bit 1 of the width is set
scaleup_line32_8_neon_2more
    tst        r1, #2
    beq        scaleup_line32_8_neon_4more

    vdup.16    d0, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d16, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d1, r3      ; replicate frac as 16-bit
    vmov.32    d24, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q4, q0, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d16, d24    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d0, q4      ; 4 x frac, narrow to 8-bit

    vsub.u8    d4, d31, d0 ; 4 x (128 - 7-bit frac)

    ; ((128 - frac) * val0 + frac * val1) >> 7
    ; on each r, g, b, a at the same time
    vmull.u8   q4, d16, d4
    vmlal.u8   q4, d24, d0
    vrshrn.u16 d24, q4, #7
    vst1.32    {d24},[r0]!

; 4 pixels if bit 2 of the width is set
scaleup_line32_8_neon_4more
    tst        r1, #4
    beq        scaleup_line32_8_neon_8more

    vdup.16    d0, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d16, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d1, r3      ; replicate frac as 16-bit
    vmov.32    d24, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q4, q0, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d16, d24    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d0, q4      ; 4 x frac, narrow to 8-bit

    vdup.16    d2, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d17, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d3, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d25, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q5, q1, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d17, d25    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d1, q5      ; 4 x frac, narrow to 8-bit

    vsub.u8    d4, d31, d0 ; 4 x (128 - 7-bit frac)
    vsub.u8    d5, d31, d1 ; 4 x (128 - 7-bit frac)

    ; ((128 - frac) * val0 + frac * val1) >> 7
    ; on each r, g, b, a at the same time
    vmull.u8   q4, d16, d4
    vmull.u8   q5, d17, d5
    vmlal.u8   q4, d24, d0
    vmlal.u8   q5, d25, d1
    vrshrn.u16 d24, q4, #7
    vrshrn.u16 d25, q5, #7
    vst1.32    {d24,d25},[r0]!

; remaining width in whole groups of 8 pixels
scaleup_line32_8_neon_8more
    bic        r1, r1, #7  ; the low three bits have been dealt with above
    cmp        r1, #0
    beq        scaleup_line32_8_neon_end

scaleup_line32_8_neon_loop8
    vdup.16    d0, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d16, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d1, r3      ; replicate frac as 16-bit
    vmov.32    d24, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q4, q0, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d16, d24    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d0, q4      ; 4 x frac, narrow to 8-bit

    vdup.16    d2, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d17, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d3, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d25, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q5, q1, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d17, d25    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d1, q5      ; 4 x frac, narrow to 8-bit

    vdup.16    d4, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d18, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d5, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d26, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q6, q2, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d18, d26    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d2, q6      ; 4 x frac, narrow to 8-bit

    vdup.16    d6, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d19, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vdup.16    d7, r3      ; replicate, 4 x frac as 16-bit
    vmov.32    d27, r6, r7 ; val0 & val1
    add        r3, r3, r4  ; x0 += x_mag
    cmp        r3, r5      ; x0 >= 1<<16 ?
    subge      r3, r3, r5  ;   x0 &= 0xffff
    movge      r6, r7      ;   val0 = val1
    ldrge      r7, [r2], #4;   load new val1

    vshr.u16   q7, q3, #9  ; 4 x frac, keep top 7-bits
    vtrn.32    d19, d27    ; rearrange, all val0 in one d and val1 in the other
    vmovn.u16  d3, q7      ; 4 x frac, narrow to 8-bit

    vsub.u8    d4, d31, d0 ; 4 x (128 - 7-bit frac)
    vsub.u8    d5, d31, d1 ; 4 x (128 - 7-bit frac)
    vsub.u8    d6, d31, d2 ; 4 x (128 - 7-bit frac)
    vsub.u8    d7, d31, d3 ; 4 x (128 - 7-bit frac)

    ; ((128 - frac) * val0 + frac * val1) >> 7
    ; on each r, g, b, a at the same time
    vmull.u8   q4, d16, d4
    vmull.u8   q5, d17, d5
    vmull.u8   q6, d18, d6
    vmull.u8   q7, d19, d7
    vmlal.u8   q4, d24, d0
    vmlal.u8   q5, d25, d1
    vmlal.u8   q6, d26, d2
    vmlal.u8   q7, d27, d3
    vrshrn.u16 d24, q4, #7
    vrshrn.u16 d25, q5, #7
    vrshrn.u16 d26, q6, #7
    vrshrn.u16 d27, q7, #7
    vst1.32    {d24,d25,d26,d27},[r0]!
    subs       r1, r1, #8
    bgt        scaleup_line32_8_neon_loop8

scaleup_line32_8_neon_end
    ldmfd      sp!, {r4-r7, pc}

;-----------------------------------------------------------
    ALIGN 32
    END
