;
; mono_tbgr.s
; Copyright (C) 2002 P.Everett <peter@everett9981.freeserve.co.uk>
;
; This file is part of KinoAMP, a free RISCOS MPEG program stream decoder.
;
; KinoAMP is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; KinoAMP is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
;

 GET hdr.ka_drawer

; This file contains the greyscale drivers for 4k colour, 32k colour (both
; stored as 16-bit pixels) and 16m colour (32bpp) displays.
; For each colour depth, there is a driver for each zoom magnification.
; These drivers output the Y component directly and ignore both u and v.
; yuv tables are bypassed so the brightness,contrast,colour controls don't work.
; There are 15 functions: 50%, 100%, 200%, 300% and 400% zoom for each depth.
;
; Register and stack usage for all functions,
;
; on input to loops,
; v4 = dst1          line screen address
; v5 = bytesperrow   vertical display step
; v6 = y1            line luminance input address
; lr = height        remaining lum lines
; a3, a4             not used inside the loops (no chroma input is read)
; [sp+0] = width
; [sp+4] = dst_skip
; [sp+8] = y_skip
; [sp+12] = uv_skip  (stored but never read back)
;
; within loops,
; ip = luminance lookup table address
; v2 = pix
; v1 = temp
; v3 = width
;
; note. The input arrays are byte arrays and the screen memory is a word array.
;

; Offsets from sp of the four words that every driver copies onto its stack
; frame with "stmia sp, {a3-v2}" (a3, a4, v1, v2 in that order).
yc_width            EQU  0 ; + sp        (paint + 28)
dst_skip            EQU  4 ; + sp        (paint + 32)
yc_skip             EQU  8 ; + sp        (paint + 36)
uv_skip             EQU 12 ; + sp        (paint + 40)

  AREA |A$$code|, CODE, READONLY

  IMPORT ka_Yc0_Table_4bit
  IMPORT ka_Yc0_Table_5bit
  IMPORT ka_Yc0_Table_8bit

  ALIGN 32
; Words holding the addresses of the imported luminance lookup tables.  Each
; driver fetches the one it needs with "ldr ip, <label>" and then indexes it
; with a luminance byte.
Yc0_Tab_4bit DCD ka_Yc0_Table_4bit
Yc0_Tab_5bit DCD ka_Yc0_Table_5bit
Yc0_Tab_8bit DCD ka_Yc0_Table_8bit

; ka_drawy_z05_tbgr12 : 4k cols 50% zoom greyscale
; ------------------
; Half-size greyscale plot: uses every second luminance sample of every
; second line and stores two 16-bit pixels in each screen word.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that a width that is not a
; multiple of 4, or a height that is not a multiple of 2, still terminates
; instead of stepping over zero.  Each loop body always runs at least once.
  EXPORT ka_drawy_z05_tbgr12
  ALIGN 32
ka_drawy_z05_tbgr12
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   v1, v1, v3          ; yc_skip += yc_bpr (steps over the unused line)
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_4bit    ; ip = luminance lookup table

height_05loop_t12
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_05loop_t12

  ldrb  v2, [v6], #2        ; NW1, then skip one sample
  ldrb  v1, [v6], #2        ; NW2, then skip one sample
  ldrb  v2, [ip,v2]
  ldrb  v1, [ip,v1]
  orr   v2, v2, v1, lsl #16 ; second pixel in the top halfword
  orr   v2, v2, v2, lsl #4  ; copy each level into the next 4-bit field
  orr   v2, v2, v2, lsl #4  ; and the one after (assumes table values 0-15)
; display double pixel
  str   v2, [v4], #4        ; *dst++ = pix

  subs  v3, v3, #4          ; width -= 4 (four input samples used)
  bgt   width_05loop_t12

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += yc_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #2          ; height -= 2 (two input lines used)
  bgt   height_05loop_t12

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z1_tbgr12 : 4k cols 100% zoom greyscale
; -----------------
; Full-size greyscale plot: each pair of luminance samples becomes one screen
; word holding two 16-bit pixels.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z1_tbgr12
  ALIGN 32
ka_drawy_z1_tbgr12
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_4bit    ; ip = luminance lookup table

height_loop_t12
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_loop_t12

  ldrb  v2, [v6], #1        ; left sample
  ldrb  v1, [v6], #1        ; right sample
  ldrb  v2, [ip,v2]
  ldrb  v1, [ip,v1]
  orr   v2, v2, v1, lsl #16 ; right pixel in the top halfword
  orr   v2, v2, v2, lsl #4  ; copy each level into the next 4-bit field
  orr   v2, v2, v2, lsl #4  ; and the one after (assumes table values 0-15)
; display double pixel
  str   v2, [v4], #4        ; *dst++ = pix

  subs  v3, v3, #2          ; width -= 2
  bgt   width_loop_t12

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += yc_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height_loop_t12

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z2_tbgr12 : 4k cols 200% zoom greyscale
; -----------------
; Double-size greyscale plot: each luminance sample fills one screen word
; (two 16-bit pixels) on two consecutive display rows.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z2_tbgr12
  ALIGN 32
ka_drawy_z2_tbgr12
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5          ; dst_skip += dst_bpr
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_4bit    ; ip = luminance lookup table

height2_loop_t12
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width2_loop_t12

; W pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #4  ; copy the level into the next 4-bit field
  orr   v2, v2, v2, lsl #4  ; and the one after (assumes table values 0-15)
; display west double pixel pair
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4], #4        ; *dst = pix, then right

; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #4
  orr   v2, v2, v2, lsl #4
; display east double pixel pair (drawn bottom row first, ends on the top row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4], #4        ; *dst = pix, then right

  subs  v3, v3, #2          ; width -= 2
  bgt   width2_loop_t12

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height2_loop_t12

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z3_tbgr12 : 4k cols 300% zoom greyscale
; -----------------
; Triple-size greyscale plot: each pair of luminance samples becomes three
; screen words (six 16-bit pixels) on three consecutive display rows.  The
; middle word holds one pixel of each sample.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a1-a4 and flags corrupted (a1 is reused as the centre word);
;      v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z3_tbgr12
  ALIGN 32
ka_drawy_z3_tbgr12
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5, lsl #1  ; dst_skip += 2 * dst_bpr
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_4bit    ; ip = luminance lookup table

height3_loop_t12
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width3_loop_t12

; W pixel luminance
  ldrb  v1, [v6], #1
  ldrb  v1, [ip,v1]
  orr   v1, v1, v1, lsl #4  ; copy the level into the next 4-bit field
  orr   v1, v1, v1, lsl #4  ; and the one after (assumes table values 0-15)
; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #4
  orr   v2, v2, v2, lsl #4
; create the 3 double pixels
  orr   a1, v1, v2, lsl #16 ; centre
  orr   v1, v1, v1, lsl #16 ; left
  orr   v2, v2, v2, lsl #16 ; right
; display the pixels
  str   a1, [v4, #4]        ; *(dst + 4) = centre
  str   v2, [v4, #8]        ; *(dst + 8) = right
  str   v1, [v4], v5        ; *dst = left, then down
  str   a1, [v4, #4]        ; *(dst + 4) = centre
  str   v2, [v4, #8]        ; *(dst + 8) = right
  str   v1, [v4], v5        ; *dst = left, then down
  str   v1, [v4], #4        ; *dst = left, then right
  str   a1, [v4], #4        ; *dst = centre, then right
  str   v2, [v4], #4        ; *dst = right, then right

; subtract 2*v5 from dst to position it for next
  sub   v4, v4, v5, lsl #1

  subs  v3, v3, #2          ; width -= 2
  bgt   width3_loop_t12

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height3_loop_t12

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z4_tbgr12 : 4k cols 400% zoom greyscale
; -----------------
; Quadruple-size greyscale plot: each luminance sample fills two screen words
; (four 16-bit pixels) on four consecutive display rows.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z4_tbgr12
  ALIGN 32
ka_drawy_z4_tbgr12
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5, lsl #1  ; dst_skip += 3 * dst_bpr
  add   a4, a4, v5
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_4bit    ; ip = luminance lookup table

height4_loop_t12
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width4_loop_t12

; W pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #4  ; copy the level into the next 4-bit field
  orr   v2, v2, v2, lsl #4  ; and the one after (assumes table values 0-15)
; display west double pixel pair (top row first, ends on the bottom row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4], #4        ; *dst = pix, then right
  str   v2, [v4], #4        ; *dst = pix, then right

; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #4
  orr   v2, v2, v2, lsl #4
; display east double pixel pair (bottom row first, ends on the top row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4], #4        ; *dst = pix, then right
  str   v2, [v4], #4        ; *dst = pix, then right

  subs  v3, v3, #2          ; width -= 2
  bgt   width4_loop_t12

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height4_loop_t12

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z05_tbgr15 : 32k cols 50% zoom greyscale
; ------------------
; Half-size greyscale plot: uses every second luminance sample of every
; second line and stores two 16-bit pixels in each screen word.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that a width that is not a
; multiple of 4, or a height that is not a multiple of 2, still terminates
; instead of stepping over zero.  Each loop body always runs at least once.
  EXPORT ka_drawy_z05_tbgr15
  ALIGN 32
ka_drawy_z05_tbgr15
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   v1, v1, v3          ; yc_skip += yc_bpr (steps over the unused line)
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_5bit    ; ip = luminance lookup table

height_05loop_t15
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_05loop_t15

  ldrb  v2, [v6], #2        ; NW1, then skip one sample
  ldrb  v1, [v6], #2        ; NW2, then skip one sample
  ldrb  v2, [ip,v2]
  ldrb  v1, [ip,v1]
  orr   v2, v2, v1, lsl #16 ; second pixel in the top halfword
  orr   v2, v2, v2, lsl #5  ; copy each level into the next 5-bit field
  orr   v2, v2, v2, lsl #5  ; and the one after (assumes table values 0-31)
; display double pixel
  str   v2, [v4], #4        ; *dst++ = pix

  subs  v3, v3, #4          ; width -= 4 (four input samples used)
  bgt   width_05loop_t15

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += yc_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #2          ; height -= 2 (two input lines used)
  bgt   height_05loop_t15

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z1_tbgr15 : 32k cols 100% zoom greyscale
; -----------------
; Full-size greyscale plot: each pair of luminance samples becomes one screen
; word holding two 16-bit pixels.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z1_tbgr15
  ALIGN 32
ka_drawy_z1_tbgr15
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_5bit    ; ip = luminance lookup table

height_loop_t15
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_loop_t15

  ldrb  v2, [v6], #1        ; left sample
  ldrb  v1, [v6], #1        ; right sample
  ldrb  v2, [ip,v2]
  ldrb  v1, [ip,v1]
  orr   v2, v2, v1, lsl #16 ; right pixel in the top halfword
  orr   v2, v2, v2, lsl #5  ; copy each level into the next 5-bit field
  orr   v2, v2, v2, lsl #5  ; and the one after (assumes table values 0-31)
; display double pixel
  str   v2, [v4], #4        ; *dst++ = pix

  subs  v3, v3, #2          ; width -= 2
  bgt   width_loop_t15

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += yc_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height_loop_t15

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z2_tbgr15 : 32k cols 200% zoom greyscale
; -----------------
; Double-size greyscale plot: each luminance sample fills one screen word
; (two 16-bit pixels) on two consecutive display rows.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z2_tbgr15
  ALIGN 32
ka_drawy_z2_tbgr15
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5          ; dst_skip += dst_bpr
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_5bit    ; ip = luminance lookup table

height2_loop_t15
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width2_loop_t15

; W pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #5  ; copy the level into the next 5-bit field
  orr   v2, v2, v2, lsl #5  ; and the one after (assumes table values 0-31)
; display west double pixel pair
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4], #4        ; *dst = pix, then right

; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #5
  orr   v2, v2, v2, lsl #5
; display east double pixel pair (drawn bottom row first, ends on the top row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4], #4        ; *dst = pix, then right

  subs  v3, v3, #2          ; width -= 2
  bgt   width2_loop_t15

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height2_loop_t15

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z3_tbgr15 : 32k cols 300% zoom greyscale
; -----------------
; Triple-size greyscale plot: each pair of luminance samples becomes three
; screen words (six 16-bit pixels) on three consecutive display rows.  The
; middle word holds one pixel of each sample.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a1-a4 and flags corrupted (a1 is reused as the centre word);
;      v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z3_tbgr15
  ALIGN 32
ka_drawy_z3_tbgr15
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5, lsl #1  ; dst_skip += 2 * dst_bpr
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_5bit    ; ip = luminance lookup table

height3_loop_t15
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width3_loop_t15

; W pixel luminance
  ldrb  v1, [v6], #1
  ldrb  v1, [ip,v1]
  orr   v1, v1, v1, lsl #5  ; copy the level into the next 5-bit field
  orr   v1, v1, v1, lsl #5  ; and the one after (assumes table values 0-31)
; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #5
  orr   v2, v2, v2, lsl #5
; create the 3 double pixels
  orr   a1, v1, v2, lsl #16 ; centre
  orr   v1, v1, v1, lsl #16 ; left
  orr   v2, v2, v2, lsl #16 ; right
; display the pixels
  str   a1, [v4, #4]        ; *(dst + 4) = centre
  str   v2, [v4, #8]        ; *(dst + 8) = right
  str   v1, [v4], v5        ; *dst = left, then down
  str   a1, [v4, #4]        ; *(dst + 4) = centre
  str   v2, [v4, #8]        ; *(dst + 8) = right
  str   v1, [v4], v5        ; *dst = left, then down
  str   v1, [v4], #4        ; *dst = left, then right
  str   a1, [v4], #4        ; *dst = centre, then right
  str   v2, [v4], #4        ; *dst = right, then right

; subtract 2*v5 from dst to position it for next
  sub   v4, v4, v5, lsl #1

  subs  v3, v3, #2          ; width -= 2
  bgt   width3_loop_t15

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height3_loop_t15

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}

; ka_drawy_z4_tbgr15 : 32k cols 400% zoom greyscale
; -----------------
; Quadruple-size greyscale plot: each luminance sample fills two screen words
; (four 16-bit pixels) on four consecutive display rows.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
  EXPORT ka_drawy_z4_tbgr15
  ALIGN 32
ka_drawy_z4_tbgr15
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   a4, a4, v5, lsl #1  ; dst_skip += 3 * dst_bpr
  add   a4, a4, v5
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_5bit    ; ip = luminance lookup table

height4_loop_t15
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width4_loop_t15

; W pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #5  ; copy the level into the next 5-bit field
  orr   v2, v2, v2, lsl #5  ; and the one after (assumes table values 0-31)
; display west double pixel pair (top row first, ends on the bottom row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], v5        ; *dst = pix, then down
  str   v2, [v4], #4        ; *dst = pix, then right
  str   v2, [v4], #4        ; *dst = pix, then right

; E pixel luminance
  ldrb  v2, [v6], #1
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #5
  orr   v2, v2, v2, lsl #5
; display east double pixel pair (bottom row first, ends on the top row)
  orr   v2, v2, v2, lsl #16 ; double pixel
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4, #4]        ; *(dst + 4) = pix
  str   v2, [v4], -v5       ; *dst = pix, then up
  str   v2, [v4], #4        ; *dst = pix, then right
  str   v2, [v4], #4        ; *dst = pix, then right

  subs  v3, v3, #2          ; width -= 2
  bgt   width4_loop_t15

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #1          ; height--
  bgt   height4_loop_t15

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}


; ka_drawy_z05_tbgr32 : 16m cols 50% zoom greyscale
; ------------------
; Half-size greyscale plot: uses every second luminance sample of every
; second line and stores one 32-bit pixel per sample used.
;
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or height
; still terminates instead of stepping over zero.  Each loop body always
; runs at least once.
  EXPORT ka_drawy_z05_tbgr32
  ALIGN 32
ka_drawy_z05_tbgr32
  stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
  sub   sp, sp, #16         ; room for the four words stored below

  add   a2, a1, #drawer_yc_bpr
  ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
  ldr   v6, [a1, #0]        ; y1 = frame->base[0]
  ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
  add   v1, v1, v3          ; yc_skip += yc_bpr (steps over the unused line)
  stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_8bit    ; ip = luminance lookup table

height_05loop_t32
  ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_05loop_t32

; pixel luminance
  ldrb  v2, [v6], #2        ; Yc = *y1, then skip one sample
  ldrb  v2, [ip,v2]
  orr   v2, v2, v2, lsl #8  ; copy the level into the second byte
  orr   v2, v2, v2, lsl #8  ; and the third byte
; display pixel
  str   v2, [v4], #4        ; *dst = pix, then right

  subs  v3, v3, #2          ; width -= 2 (two input samples used)
  bgt   width_05loop_t32

  ldr   v1, [sp, #yc_skip]
  add   v6, v6, v1          ; y1 += y_skip
  ldr   v1, [sp, #dst_skip]
  add   v4, v4, v1          ; dst += dst_skip

  subs  lr, lr, #2          ; height -= 2 (two input lines used)
  bgt   height_05loop_t32

  add   sp, sp, #16
  ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}


; Macro to render frames in T(BGR|RGB)32 at a variety of multiplication factors
;
; The label field supplies the exported function name and the parameter is
; the zoom factor (the ASSERT rejects factors above 8).  Each luminance sample
; becomes a square of factor x factor identical 32-bit pixels; the unrolled
; stores are generated at assembly time by the WHILE loops.
;
; The function name string and a length word with &FF in the top byte are
; emitted ahead of the entry point.
; NOTE(review): the ALIGN 32 between that length word and the entry label may
; insert padding between them - confirm whether anything expects the word to
; sit directly before the entry.
;
; Generated function,
; in:  a1 = drawer block (word 0 = luminance base address, plot parameters
;           are read from offset drawer_yc_bpr onwards)
; out: a2-a4 and flags corrupted; v1-v6, sl, fp, ip preserved
;
; Both loops finish on bgt rather than bne so that an odd width or a zero
; height still terminates instead of stepping over zero.  Each loop body
; always runs at least once.
       MACRO
$name  DRAWFRAME_T32_MONO  $mult
       EXPORT $name
       = "$name",0
       ALIGN
       DCD   &FF000000 :OR: (((:LEN: "$name")+4) :AND: :NOT: 3)
       ALIGN 32
$name
       stmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, lr}
       sub   sp, sp, #16         ; room for the four words stored below

       add   a2, a1, #drawer_yc_bpr
       ldmia a2!, {v3, v4, v5, lr} ; yc_bpr, dst1, dst_bpr, yc_height
       ldr   v6, [a1, #0]        ; y1 = frame->base[0]
       ldmia a2, {a3-v2}         ; yc_width, dst_skip, yc_skip, uv_skip
; dst_skip gains (factor - 1) display rows, added one binary digit at a time
; JRF: Note that this isn't the most optimal method for +7*dst_bpr [use +(8-1)*dst_bpr]
   [ (($mult-1) :AND: 1) = 1
       add   a4, a4, v5          ; dst_skip += dst_bpr
   ]
   [ (($mult-1) :AND: 2) = 2
       add   a4, a4, v5, lsl #1  ; dst_skip += 2 * dst_bpr
   ]
   [ (($mult-1) :AND: 4) = 4
       add   a4, a4, v5, lsl #2  ; dst_skip += 4 * dst_bpr
   ]
       ASSERT ($mult-1) < 8 ; the code isn't set up for *8 yet (WHILE-generate ?)
       stmia sp, {a3-v2}         ; saved at [sp, #yc_width] .. [sp, #uv_skip]

  ldr   ip, Yc0_Tab_8bit         ; ip = luminance lookup table

height_loop_t32_$mult
       ldr   v3, [sp, #yc_width] ; v3 = samples left on this line
width_loop_t32_$mult

; W pixel luminance
       ldrb  v2, [v6], #1        ; Yc = *y1++
       ldrb  v2, [ip,v2]
       orr   v2, v2, v2, lsl #8  ; copy the level into the second byte
       orr   v2, v2, v2, lsl #8  ; and the third byte
; display west 1/4/9/16 pixel(s), top row first, ending on the bottom row
           LCLA  ycount
           LCLA  xcount
ycount     SETA  0
           WHILE $ycount < $mult-1
xcount       SETA  0
             WHILE $xcount < $mult-1
               str   v2, [v4, #4 + 4 * $xcount]
xcount         SETA $xcount + 1
             WEND
             str   v2, [v4], v5        ; *dst = pix, then down
ycount       SETA $ycount + 1
           WEND
; final x loop
xcount     SETA  0
           WHILE $xcount < $mult-1
             str   v2, [v4], #4        ; *dst = pix, then right
xcount       SETA $xcount + 1
           WEND
       str   v2, [v4], #4        ; *dst = pix, then right


; E pixel luminance
       ldrb  v2, [v6], #1        ; Yc = *y1++
       ldrb  v2, [ip,v2]
       orr   v2, v2, v2, lsl #8
       orr   v2, v2, v2, lsl #8
; display east 1/4/9/16 pixel(s), bottom row first, ending on the top row
           LCLA  ycount
           LCLA  xcount
ycount     SETA  0
           WHILE $ycount < $mult-1
xcount       SETA  0
             WHILE $xcount < $mult-1
               str   v2, [v4, #4 + 4 * $xcount]
xcount         SETA $xcount + 1
             WEND
             str   v2, [v4], -v5        ; *dst = pix, then up
ycount       SETA $ycount + 1
           WEND
; final x loop
xcount     SETA  0
           WHILE $xcount < $mult-1
             str   v2, [v4], #4        ; *dst = pix, then right
xcount       SETA $xcount + 1
           WEND
       str   v2, [v4], #4        ; *dst = pix, then right

       subs  v3, v3, #2          ; width -= 2
       bgt   width_loop_t32_$mult

       ldr   v1, [sp, #yc_skip]
       add   v6, v6, v1          ; y1 += y_skip
       ldr   v1, [sp, #dst_skip]
       add   v4, v4, v1          ; dst += dst_skip

       subs  lr, lr, #1          ; height--
       bgt   height_loop_t32_$mult

       add   sp, sp, #16
       ldmfd sp!, {v1, v2, v3, v4, v5, v6, sl, fp, ip, pc}
       MEND

; Instantiate the 16m colour drivers for 100%, 200%, 300% and 400% zoom.
; Each line expands DRAWFRAME_T32_MONO into one exported function named by
; its label.
ka_drawy_z1_tbgr32  DRAWFRAME_T32_MONO 1
ka_drawy_z2_tbgr32  DRAWFRAME_T32_MONO 2
ka_drawy_z3_tbgr32  DRAWFRAME_T32_MONO 3
ka_drawy_z4_tbgr32  DRAWFRAME_T32_MONO 4

       ALIGN 32
  END
