;
; idcts.s
; Copyright (C) 2002 P.Everett <peter@everett9981.freeserve.co.uk>
;
; This file is part of KinoAMP, a free RISCOS MPEG program stream decoder.
;
; KinoAMP is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; KinoAMP is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
;

; Thanks to André Timmermans for this code.


  ; Code area that holds every routine in this file.
  AREA |A$$code|, CODE, READONLY
;  AREA    |mpeg$$idctscode|, CODE, READONLY

  ; Entry points made visible to other object files.  All four take
  ; r0 = block, r1 = dest, r2 = stride (see the header of each routine).
  EXPORT  idct_block_copy_dc_arm
  EXPORT  idct_block_add_dc_arm
  EXPORT  idcts_rows_add_arm
  EXPORT  idcts_rows_copy_arm

;-------------------------------------------------------------------------------
; idct_block_copy_dc_arm
; DC-only block: every pixel of the 8x8 area becomes (block[0] + 4) >> 3,
; saturated to 0..255.
; In:
; r0  block   64x32-bit (only the first word is read)
; r1  dest    8x 8x8-bit lines, written as two words per line
; r2  stride  bytes between start of 2 consecutive lines
;-------------------------------------------------------------------------------
 ALIGN 32
idct_block_copy_dc_arm
  stmfd r13!,{r9,r14}
  ldr   r0, [r0]
  mov   r9, #8                    ; lines left to fill
  add   r0, r0, #4                ; rounding term
  mov   r0, r0, asr #3
  bic   r0, r0, r0, asr #31       ; negative -> 0 (mask is all ones when r0 < 0)
  cmp   r0, #255
  movgt r0, #255                  ; too large -> 255
  orr   r0, r0, r0, lsl #8        ; replicate the pixel into all four bytes
  orr   r0, r0, r0, lsl #16

idct_block_copy_dc_line
  str   r0, [r1, #4]              ; pixels 4..7 of this line
  str   r0, [r1], r2              ; pixels 0..3, then step dest to the next line
  subs  r9, r9, #1
  bne   idct_block_copy_dc_line

  ldmfd r13!,{r9,pc}

;-------------------------------------------------------------------------------
; idct_block_add_dc_arm
; DC-only block: adds (block[0] + 4) >> 3 to each of the 8x8 pixels already
; in dest and saturates every sum to 0..255.
; In:
; r0  block   64x32-bit (only the first word is read)
; r1  dest    8x 8x8-bit lines, read and written as two words per line
; r2  stride  bytes between start of 2 consecutive lines
;
; Saturation idiom used for every pixel sum s:
;   cmp   s, #255          unsigned "higher" means s < 0 or s > 255
;   mvnhi s, s, asr#31     s < 0 -> 0,  s > 255 -> &FFFFFFFF
;   andhi s, s, #255       s < 0 -> 0,  s > 255 -> 255
;-------------------------------------------------------------------------------
 ALIGN 32
idct_block_add_dc_arm
  stmfd r13!,{r4-r9,r14}
  ldr   r0, [r0]
  ; asr#3 leaves bit 2 of the coefficient in the carry flag; adding the
  ; carry back gives (block[0] + 4) >> 3
  movs  r0, r0, asr#3
  adc   r0, r0, #0
  mov   r9, #8                    ; lines left to process

idct_block_add_dc_line
  ldmia r1, {r3, r4}              ; r3 = pixels 0..3, r4 = pixels 4..7

  ; ---- pixels 0..3, packed into r5 ----
  and   r5, r3, #255
  add   r5, r5, r0
  cmp   r5, #255
  mvnhi r5, r5, asr#31
  andhi r5, r5, #255

  and   r6, r3, #255<<8
  add   r6, r0, r6, lsr#8
  cmp   r6, #255
  mvnhi r6, r6, asr#31
  andhi r6, r6, #255
  orr   r5, r5, r6, lsl#8

  and   r6, r3, #255<<16
  add   r6, r0, r6, lsr#16
  cmp   r6, #255
  mvnhi r6, r6, asr#31
  andhi r6, r6, #255
  orr   r5, r5, r6, lsl#16

  add   r6, r0, r3, lsr#24
  cmp   r6, #255
  mvnhi r6, r6, asr#31
  andhi r6, r6, #255
  orr   r5, r5, r6, lsl#24

  ; ---- pixels 4..7, packed into r7 ----
  and   r7, r4, #255
  add   r7, r7, r0
  cmp   r7, #255
  mvnhi r7, r7, asr#31
  andhi r7, r7, #255

  and   r8, r4, #255<<8
  add   r8, r0, r8, lsr#8
  cmp   r8, #255
  mvnhi r8, r8, asr#31
  andhi r8, r8, #255
  orr   r7, r7, r8, lsl#8

  and   r8, r4, #255<<16
  add   r8, r0, r8, lsr#16
  cmp   r8, #255
  mvnhi r8, r8, asr#31
  andhi r8, r8, #255
  orr   r7, r7, r8, lsl#16

  add   r8, r0, r4, lsr#24
  cmp   r8, #255
  mvnhi r8, r8, asr#31
  andhi r8, r8, #255
  orr   r7, r7, r8, lsl#24

  stmia r1, {r5, r7}              ; write the line back
  add   r1, r1, r2                ; next line

  subs  r9, r9, #1
  bne   idct_block_add_dc_line

  ldmfd r13!,{r4-r9,pc}


; Register names for the IDCT working values.  idcts_row_add and
; idcts_row_copy fill r0-r7 from eight consecutive words with a single
; ldmia (lowest register from the lowest address), so each alias also fixes
; which input word the value starts out as.
x0     RN  0    ; input word 0
x1     RN  4    ; input word 4
x2     RN  6    ; input word 6
x3     RN  2    ; input word 2
x4     RN  1    ; input word 1
x5     RN  7    ; input word 7
x6     RN  5    ; input word 5
x7     RN  3    ; input word 3

x8     RN  8    ; temporary, not loaded from the input

;-------------------------------------------------------------------------------
; idcts_rows_copy_arm
; Transforms the 8 rows of a block with idcts_row_copy, one call per row.
; In:
; r0  block   64x32-bit
; r1  dest    8x 8x8-bit lines
; r2  stride  bytes between start of 2 consecutive lines
;
; idcts_row_copy takes its arguments in r10 and in a two-word frame at the
; top of the stack, and advances both r10 and the stored dest itself, so
; the eight calls need no set-up in between.
;-------------------------------------------------------------------------------
 ALIGN 32
idcts_rows_copy_arm
  stmfd r13!,{r4-r12,r14}
  mov   r10, r0                 ; r10 = current row of the block
  stmfd r13!,{r1,r2}            ; frame: [r13,#0] = dest, [r13,#4] = stride

  bl    idcts_row_copy          ; row 0
  bl    idcts_row_copy          ; row 1
  bl    idcts_row_copy          ; row 2
  bl    idcts_row_copy          ; row 3
  bl    idcts_row_copy          ; row 4
  bl    idcts_row_copy          ; row 5
  bl    idcts_row_copy          ; row 6
  bl    idcts_row_copy          ; row 7

  add   r13, r13, #8            ; drop the dest/stride frame
  ldmfd r13!,{r4-r12,pc}

;-------------------------------------------------------------------------------
; idcts_rows_add_arm
; Transforms the 8 rows of a block with idcts_row_add, one call per row.
; In:
; r0  block   64x32-bit
; r1  dest    8x 8x8-bit lines
; r2  stride  bytes between start of 2 consecutive lines
;
; idcts_row_add takes its arguments in r10 and in a two-word frame at the
; top of the stack, and advances both r10 and the stored dest itself, so
; the eight calls need no set-up in between.
;-------------------------------------------------------------------------------
 ALIGN 32
idcts_rows_add_arm
  stmfd r13!,{r4-r12,r14}
  mov   r10, r0                 ; r10 = current row of the block
  stmfd r13!,{r1,r2}            ; frame: [r13,#0] = dest, [r13,#4] = stride

  bl    idcts_row_add           ; row 0
  bl    idcts_row_add           ; row 1
  bl    idcts_row_add           ; row 2
  bl    idcts_row_add           ; row 3
  bl    idcts_row_add           ; row 4
  bl    idcts_row_add           ; row 5
  bl    idcts_row_add           ; row 6
  bl    idcts_row_add           ; row 7

  add   r13, r13, #8            ; drop the dest/stride frame
  ldmfd r13!,{r4-r12,pc}

;-------------------------------------------------------------------------------
; idcts_row_add
; Runs the 8-point inverse DCT on one row of 8 words and adds the result to
; the 8 pixels of the current dest line, saturating every pixel to 0..255.
; In:
; r10    row     8 words
; r13+0  dest    8x8-bit line
; r13+4  stride  bytes between start of 2 consecutive lines
; Out:
; r10, r13+0 updated on exit
; r0-r9, r11 and r12 are used as work registers; returns with mov pc, r14.
;
; Three paths, selected by which input words are zero:
;   full  general case
;   half  words 1, 3, 5 and 7 are all zero; outputs are mirrored (0=7, 1=6,
;         2=5, 3=4)
;   none  words 1..7 are all zero; all 8 outputs are equal
; Every path arrives at idcts_row_add_avg with outputs 0..7, still scaled by
; 2^14, in r3, r4, r5, r6, r7, r9, r11, r12.
;-------------------------------------------------------------------------------
 ALIGN 32
idcts_row_add
  ldmia r10!,{x0,x4,x3,x7,x1,x6,x2,x5}   ; r0-r7 = the 8 input words, r10 += 32
  mov   x1, x1, lsl#8

  orr   r9, x6, x7              ; r9  = 0 when words 5 and 3 are both zero
  orrs  r11, x4, x5             ; r11 = 0 when words 1 and 7 are both zero
  cmpeq r9,#0
  beq   idcts_row_add_half      ; all four are zero

; W1 2841
; W2 2676
; W3 2408
; W5 1609
; W6 1108
; W7 565

  cmp   r11, #0
  beq   idcts_row_add_67        ; x4 = x5 = 0: their first stage result is 0

; first stage
;    x8 = W7 * (x4 + x5) + 4;
; W7 565
  add   r11, x4, x5
  rsb   r12, r11, r11, lsl#4    ; 15 * r11
  rsb   r12, r12, r11, lsl#7    ; 113 * r11
  add   x8, r12, r12, lsl#2     ; 565 * r11
  add   x8, x8, #4

;    x4 = (x8 + (W1 - W7) * x4)>>3;
; W1 2841
; W7 565
  add   r12, x4, x4, lsl#5      ; 33 * x4
  add   r12, r12, r12, lsl#4    ; 561 * x4
  add   r12, r12, x4, lsl#3     ; 569 * x4
  add   x4, x8, r12, lsl#2      ; x8 + 2276 * x4
  mov   x4, x4, asr#3

;    x5 = (x8 - (W1 + W7) * x5)>>3;
; W1 2841
; W7 565
  mov   r12, #&4e
  add   r12, r12, #&d00         ; r12 = &d4e = 3406
  mul   r11, r12, x5
  sub   x5, x8, r11
  mov   x5, x5, asr#3

idcts_row_add_67
  cmp   r9, #0
  beq   idcts_row_add_second    ; x6 = x7 = 0: their first stage result is 0

;    x8 = W3 * (x6 + x7) + 4;
; W3 2408
  add   r11, x6, x7
  add   r12, r11, r11, lsl#5    ; 33 * r11
  add   r12, r12, r12, lsl#3    ; 297 * r11
  add   r12, r12, r11, lsl#2    ; 301 * r11
  mov   x8, r12, lsl#3          ; 2408 * r11
  add   x8, x8, #4

;    x6 = (x8 - (W3 - W5) * x6)>>3;
; W3 2408
; W5 1609
  rsb   r11, x6, x6, lsl#4      ; 15 * x6
  add   r11, r11, x6, lsl#5     ; 47 * x6
  add   r11, r11, r11, lsl#4    ; 799 * x6
  sub   x6, x8, r11
  mov   x6, x6, asr#3

;    x7 = (x8 - (W3 + W5) * x7)>>3;
; W3 2408
; W5 1609
  rsb   r11, x7, x7, lsl#6      ; 63 * x7
  rsb   r11, r11, x7, lsl#12    ; 4033 * x7
  sub   r11, r11, x7, lsl#4     ; 4017 * x7
  sub   x7, x8, r11
  mov   x7, x7, asr#3

idcts_row_add_second
; second stage
  mov   x0, x0, lsl#8
  add   x0, x0, #8192           ; rounding term for the final >> 14
;    x8 = x0 + x1;
  add   x8, x0, x1

;    x0 -= x1;
  sub   x0, x0, x1

;    x1 = W6 * (x3 + x2) +4;
; W6 1108
  add   r11, x3, x2
  add   r12, r11, r11, lsl#4    ; 17 * r11
  add   r12, r12, r11, lsl#8    ; 273 * r11
  add   r12, r12, r11, lsl#2    ; 277 * r11
  mov   x1, r12, lsl#2          ; 1108 * r11
  add   x1, x1, #4

;    x2 = (x1 - (W2 + W6) * x2)>>3;
; W2 2676
; W6 1108
  rsb   r12, x2, x2, lsl#5      ; 31 * x2
  rsb   r12, r12, r12, lsl#4    ; 465 * x2
  add   r12, r12, x2, lsl#3     ; 473 * x2
  sub   x2, x1, r12, lsl#3      ; x1 - 3784 * x2
  mov   x2, x2, asr#3

;    x3 = (x1 + (W2 - W6) * x3)>>3;
; W2 2676
; W6 1108
  add   r12, x3, x3, lsl#4      ; 17 * x3
  add   r12, r12, x3, lsl#5     ; 49 * x3
  add   x3, x1, r12, lsl#5      ; x1 + 1568 * x3
  mov   x3, x3, asr#3

;    x1 = x4 + x6;
  add   x1, x4, x6

;    x4 -= x6;
  sub   x4, x4, x6

;    x6 = x5 + x7;
  add   x6, x5, x7

;    x5 -= x7;
  sub   x5, x5, x7

; third stage
;    x7 = x8 + x3;
  add   x7, x8, x3

;    x8 -= x3;
  sub   x8, x8, x3

;    x3 = x0 + x2;
  add   x3, x0, x2

;    x0 -= x2;
  sub   x0, x0, x2

;    x2 = (181 * (x4 + x5) + 128) >> 8;
  add   x2, x4, x5
  rsb   r12, x2, x2, lsl#4      ; 15 * x2
  add   r12, r12, r12, lsl#1    ; 45 * x2
  add   x2, x2, r12, lsl#2      ; 181 * x2
  add   x2, x2, #128
  mov   x2, x2, asr#8

;    x4 = (181 * (x4 - x5) + 128) >> 8;
  sub   x4, x4, x5
  rsb   r12, x4, x4, lsl#4      ; 15 * x4
  add   r12, r12, r12, lsl#1    ; 45 * x4
  add   x4, x4, r12, lsl#2      ; 181 * x4
  add   x4, x4, #128
  mov   x4, x4, asr#8

; fourth stage
; The >> 14 is applied later, in idcts_row_add_avg.  Results go to plain
; r-numbered registers; the order below is chosen so that an x-register is
; only overwritten after its last use.

;    block[7] = (x7 - x1) >> 14;
  sub   r12, x7, x1

;    block[6] = (x3 - x2) >> 14;
  sub   r11, x3, x2

;    block[5] = (x0 - x4) >> 14;
  sub   r9, x0, x4

;    block[0] = (x7 + x1) >> 14;
  add   r3, x7, x1        ; done with r3,r4

;    block[1] = (x3 + x2) >> 14;
  add   r4, x3, x2        ; done with r2, r6

;    block[3] = (x8 + x6) >> 14;
  add   r6, x8, x6

;    block[4] = (x8 - x6) >> 14;
  sub   r7, x8, x6        ; done with r5,r8

;    block[2] = (x0 + x4) >> 14;
  add   r5, x0, x4        ; done with r0,r1

  b     idcts_row_add_avg

 ALIGN 32
idcts_row_add_half
; Here x4 = x5 = x6 = x7 = 0, so the first and third stage terms built from
; them vanish and only the second stage is needed.
  orrs  r9, x2, x3
  cmpeq x1,#0
  beq   idcts_row_add_none      ; words 2, 4 and 6 are zero as well

; second stage
  mov   x0, x0, lsl#8
  add   x0, x0, #8192           ; rounding term for the final >> 14
;    x8 = x0 + x1;
  add   x8, x0, x1

;    x0 -= x1;
  sub   x0, x0, x1

;    x1 = W6 * (x3 + x2) +4;
; W6 1108
  add   r11, x3, x2
  add   r12, r11, r11, lsl#4    ; 17 * r11
  add   r12, r12, r11, lsl#8    ; 273 * r11
  add   r12, r12, r11, lsl#2    ; 277 * r11
  mov   x1, r12, lsl#2          ; 1108 * r11
  add   x1, x1, #4

;    x2 = (x1 - (W2 + W6) * x2)>>3;
; W2 2676
; W6 1108
  rsb   r12, x2, x2, lsl#5      ; 31 * x2
  rsb   r12, r12, r12, lsl#4    ; 465 * x2
  add   r12, r12, x2, lsl#3     ; 473 * x2
  sub   x2, x1, r12, lsl#3      ; x1 - 3784 * x2
  mov   x2, x2, asr#3

;    x3 = (x1 + (W2 - W6) * x3)>>3;
; W2 2676
; W6 1108
  add   r12, x3, x3, lsl#4      ; 17 * x3
  add   r12, r12, x3, lsl#5     ; 49 * x3
  add   x3, x1, r12, lsl#5      ; x1 + 1568 * x3
  mov   x3, x3, asr#3

; fourth stage

;    block[7] = (x8 + x3) >> 14;
  add   r12, x8, x3

;    block[6] = (x0 + x2) >> 14;
  add   r11, x0, x2

;    block[5] = (x0 - x2) >> 14;
  sub   r9, x0, x2

;    block[4] = (x8 - x3) >> 14;
  sub   r7, x8, x3

;    block[0] = block[7]
  mov   r3, r12

;    block[1] = block[6]
  mov   r4, r11

;    block[2] = block[5]
  mov   r5, r9

;    block[3] = block[4]
; The source must be r7 (block[4]).  The alias x7 is r3, which was
; overwritten with block[7] three instructions earlier.
  mov   r6, r7

  b     idcts_row_add_avg

 ALIGN 32
idcts_row_add_none
; Only x0 can be non-zero: every output is ((x0 << 8) + 8192) >> 14.
; (x0 + 32) << 8 is the same value before the shift.
  add   r3, x0, #32
  mov   r3, r3, lsl #8
  mov   r4, r3
  mov   r5, r3
  mov   r6, r3
  mov   r7, r3
  mov   r9, r3
  mov   r11, r3
  mov   r12, r3

idcts_row_add_avg
; In: r3, r4, r5, r6, r7, r9, r11, r12 = outputs 0..7 scaled by 2^14.
; Each pixel sum is tested with "movs tmp, sum, asr#8":
;   gt -> sum > 255, use 255;  eq -> sum in 0..255, use it;
;   mi -> sum < 0, use 0 (nothing is ORed in for bytes 1..3).
  ldr   r0, [r13, #0]     ; dest
  ldmia r0, {r2, r8}      ; dest[0] .. dest[7]
; TODO: I suspect the following can be done a little more efficient??
  and   r1, r2, #255
  add   r3, r1, r3, asr#14
  movs  r1, r3, asr #8
  movgt r3, #255
  movmi r3, #0

  and   r1, r2, #255<<8
  mov   r1, r1, lsr#8
  add   r4, r1, r4, asr#14
  movs  r1, r4, asr#8
  orrgt r3, r3, #255<<8
  orreq r3, r3, r4, lsl#8

  and   r1, r2, #255<<16
  mov   r1, r1, lsr#16
  add   r4, r1, r5, asr#14
  movs  r1, r4, asr#8
  orrgt r3, r3, #255<<16
  orreq r3, r3, r4, lsl#16

  mov   r1, r2, lsr#24
  add   r4, r1, r6, asr#14
  movs  r1, r4, asr#8
  orrgt r3, r3, #255<<24
  orreq r3, r3, r4, lsl#24


  and   r1, r8, #255
  add   r5, r1, r7, asr#14
  movs  r1, r5, asr #8
  movgt r5, #255
  movmi r5, #0

  and   r1, r8, #255<<8
  mov   r1, r1, lsr#8
  add   r4, r1, r9, asr#14
  movs  r1, r4, asr #8
  orrgt r5, r5, #255<<8
  orreq r5, r5, r4, lsl#8

  and   r1, r8, #255<<16
  mov   r1, r1, lsr#16
  add   r4, r1, r11, asr#14
  movs  r1, r4, asr#8
  orrgt r5, r5, #255<<16
  orreq r5, r5, r4, lsl#16

  mov   r1, r8, lsr#24
  add   r4, r1, r12, asr#14
  movs  r1, r4, asr#8
  orrgt r5, r5, #255<<24
  orreq r5, r5, r4, lsl#24

  stmia r0, {r3, r5}        ; write pixels 0..3 and 4..7 back
  ldr   r3, [r13, #4]       ; stride
  add   r0, r0, r3
  str   r0, [r13, #0]       ; dest now points at the next line

  mov   pc, r14

;-------------------------------------------------------------------------------
; idcts_row_copy
; Runs the 8-point inverse DCT on one row of 8 words and stores the result,
; saturated to 0..255, as the 8 pixels of the current dest line.
; In:
; r10    row     8 words
; r13+0  dest    8x8-bit line
; r13+4  stride  bytes between start of 2 consecutive lines
; Out:
; r10, r13+0 updated on exit
; r0-r9, r11 and r12 are used as work registers; returns with mov pc, r14.
;
; Three paths, selected by which input words are zero:
;   full  general case
;   half  words 1, 3, 5 and 7 are all zero; outputs are mirrored (0=7, 1=6,
;         2=5, 3=4)
;   none  words 1..7 are all zero; all 8 outputs are equal
; Every path arrives at idcts_row_copy_store with pixels 0..3 packed in r11
; and pixels 4..7 packed in r12 (lowest pixel in the lowest byte).
;-------------------------------------------------------------------------------
 ALIGN 32
idcts_row_copy
  ldmia r10!,{x0,x4,x3,x7,x1,x6,x2,x5}   ; r0-r7 = the 8 input words, r10 += 32
  mov   x1, x1, lsl#8

  orr   r9, x6, x7              ; r9  = 0 when words 5 and 3 are both zero
  orrs  r11, x4, x5             ; r11 = 0 when words 1 and 7 are both zero
  cmpeq r9,#0
  beq   idcts_row_copy_half     ; all four are zero

; W1 2841
; W2 2676
; W3 2408
; W5 1609
; W6 1108
; W7 565

  cmp   r11, #0
  beq   idcts_row_copy_67       ; x4 = x5 = 0: their first stage result is 0

; first stage
;    x8 = W7 * (x4 + x5) + 4;
; W7 565
  add   r11, x4, x5
  rsb   r12, r11, r11, lsl#4    ; 15 * r11
  rsb   r12, r12, r11, lsl#7    ; 113 * r11
  add   x8, r12, r12, lsl#2     ; 565 * r11
  add   x8, x8, #4

;    x4 = (x8 + (W1 - W7) * x4)>>3;
; W1 2841
; W7 565
  add   r12, x4, x4, lsl#5      ; 33 * x4
  add   r12, r12, r12, lsl#4    ; 561 * x4
  add   r12, r12, x4, lsl#3     ; 569 * x4
  add   x4, x8, r12, lsl#2      ; x8 + 2276 * x4
  mov   x4, x4, asr#3

;    x5 = (x8 - (W1 + W7) * x5)>>3;
; W1 2841
; W7 565
  mov   r12, #&4e
  add   r12, r12, #&d00         ; r12 = &d4e = 3406
  mul   r11, r12, x5
  sub   x5, x8, r11
  mov   x5, x5, asr#3

idcts_row_copy_67
  cmp   r9, #0
  beq   idcts_row_copy_second   ; x6 = x7 = 0: their first stage result is 0

;    x8 = W3 * (x6 + x7) + 4;
; W3 2408
  add   r11, x6, x7
  add   r12, r11, r11, lsl#5    ; 33 * r11
  add   r12, r12, r12, lsl#3    ; 297 * r11
  add   r12, r12, r11, lsl#2    ; 301 * r11
  mov   x8, r12, lsl#3          ; 2408 * r11
  add   x8, x8, #4

;    x6 = (x8 - (W3 - W5) * x6)>>3;
; W3 2408
; W5 1609
  rsb   r11, x6, x6, lsl#4      ; 15 * x6
  add   r11, r11, x6, lsl#5     ; 47 * x6
  add   r11, r11, r11, lsl#4    ; 799 * x6
  sub   x6, x8, r11
  mov   x6, x6, asr#3

;    x7 = (x8 - (W3 + W5) * x7)>>3;
; W3 2408
; W5 1609
  rsb   r11, x7, x7, lsl#6      ; 63 * x7
  rsb   r11, r11, x7, lsl#12    ; 4033 * x7
  sub   r11, r11, x7, lsl#4     ; 4017 * x7
  sub   x7, x8, r11
  mov   x7, x7, asr#3

idcts_row_copy_second
; second stage
  mov   x0, x0, lsl#8
  add   x0, x0, #8192           ; rounding term for the final >> 14
;    x8 = x0 + x1;
  add   x8, x0, x1

;    x0 -= x1;
  sub   x0, x0, x1

;    x1 = W6 * (x3 + x2) +4;
; W6 1108
  add   r11, x3, x2
  add   r12, r11, r11, lsl#4    ; 17 * r11
  add   r12, r12, r11, lsl#8    ; 273 * r11
  add   r12, r12, r11, lsl#2    ; 277 * r11
  mov   x1, r12, lsl#2          ; 1108 * r11
  add   x1, x1, #4

;    x2 = (x1 - (W2 + W6) * x2)>>3;
; W2 2676
; W6 1108
  rsb   r12, x2, x2, lsl#5      ; 31 * x2
  rsb   r12, r12, r12, lsl#4    ; 465 * x2
  add   r12, r12, x2, lsl#3     ; 473 * x2
  sub   x2, x1, r12, lsl#3      ; x1 - 3784 * x2
  mov   x2, x2, asr#3

;    x3 = (x1 + (W2 - W6) * x3)>>3;
; W2 2676
; W6 1108
  add   r12, x3, x3, lsl#4      ; 17 * x3
  add   r12, r12, x3, lsl#5     ; 49 * x3
  add   x3, x1, r12, lsl#5      ; x1 + 1568 * x3
  mov   x3, x3, asr#3

;    x1 = x4 + x6;
  add   x1, x4, x6

;    x4 -= x6;
  sub   x4, x4, x6

;    x6 = x5 + x7;
  add   x6, x5, x7

;    x5 -= x7;
  sub   x5, x5, x7

; third stage
;    x7 = x8 + x3;
  add   x7, x8, x3

;    x8 -= x3;
  sub   x8, x8, x3

;    x3 = x0 + x2;
  add   x3, x0, x2

;    x0 -= x2;
  sub   x0, x0, x2

;    x2 = (181 * (x4 + x5) + 128) >> 8;
  add   x2, x4, x5
  rsb   r12, x2, x2, lsl#4      ; 15 * x2
  add   r12, r12, r12, lsl#1    ; 45 * x2
  add   x2, x2, r12, lsl#2      ; 181 * x2
  add   x2, x2, #128
  mov   x2, x2, asr#8

;    x4 = (181 * (x4 - x5) + 128) >> 8;
  sub   x4, x4, x5
  rsb   r12, x4, x4, lsl#4      ; 15 * x4
  add   r12, r12, r12, lsl#1    ; 45 * x4
  add   x4, x4, r12, lsl#2      ; 181 * x4
  add   x4, x4, #128
  mov   x4, x4, asr#8

; fourth stage
; Each output is shifted down by 14, saturated and packed straight into
; r12 (pixels 4..7) or r11 (pixels 0..3).  After "movs tmp, v, asr#8":
;   gt -> v > 255, use 255;  eq -> v in 0..255, use v;
;   mi -> v < 0, use 0 (nothing is ORed in, except for the byte that
;         initialises the word, which is set to 0 explicitly).

;    block[7] = (x7 - x1) >> 14;
  sub   r12, x7, x1
  mov   r12, r12, asr#14
  movs  r9, r12, asr#8
  movmi r12, #0
  movgt r12, #255<<24
  moveq r12, r12, lsl#24

;    block[6] = (x3 - x2) >> 14;
  sub   r11, x3, x2
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255<<16
  orreq r12, r12, r11, lsl#16

;    block[5] = (x0 - x4) >> 14;
  sub   r11, x0, x4
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255<<8
  orreq r12, r12, r11, lsl#8

;    block[4] = (x8 - x6) >> 14;
  sub   r11, x8, x6
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255
  orreq r12, r12, r11

;    block[3] = (x8 + x6) >> 14;
  add   r11, x8, x6        ; done with r5,r8
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  movmi r11, #0
  movgt r11, #255<<24
  moveq r11, r11, lsl#24

;    block[2] = (x0 + x4) >> 14;
  add   r9, x0, x4        ; done with r0,r1
  mov   r9, r9, asr#14
  movs  r0, r9, asr#8     ; r0 (x0) is free from here on and serves as scratch
  orrgt r11, r11, #255<<16
  orreq r11, r11, r9, lsl#16

;    block[1] = (x3 + x2) >> 14;
  add   r9, x3, x2        ; done with r2, r6
  mov   r9, r9, asr#14
  movs  r0, r9, asr#8
  orrgt r11, r11, #255<<8
  orreq r11, r11, r9, lsl#8

;    block[0] = (x7 + x1) >> 14;
  add   r9, x7, x1        ; done with r3,r4
  mov   r9, r9, asr#14
  movs  r0, r9, asr#8
  orrgt r11, r11, #255
  orreq r11, r11, r9

  b     idcts_row_copy_store

 ALIGN 32
idcts_row_copy_half
; Here x4 = x5 = x6 = x7 = 0, so the first and third stage terms built from
; them vanish and only the second stage is needed.
  orrs  r9, x2, x3
  cmpeq x1,#0
  beq   idcts_row_copy_none     ; words 2, 4 and 6 are zero as well

; second stage
  mov   x0, x0, lsl#8
  add   x0, x0, #8192           ; rounding term for the final >> 14
;    x8 = x0 + x1;
  add   x8, x0, x1

;    x0 -= x1;
  sub   x0, x0, x1

;    x1 = W6 * (x3 + x2) +4;
; W6 1108
  add   r11, x3, x2
  add   r12, r11, r11, lsl#4    ; 17 * r11
  add   r12, r12, r11, lsl#8    ; 273 * r11
  add   r12, r12, r11, lsl#2    ; 277 * r11
  mov   x1, r12, lsl#2          ; 1108 * r11
  add   x1, x1, #4

;    x2 = (x1 - (W2 + W6) * x2)>>3;
; W2 2676
; W6 1108
  rsb   r12, x2, x2, lsl#5      ; 31 * x2
  rsb   r12, r12, r12, lsl#4    ; 465 * x2
  add   r12, r12, x2, lsl#3     ; 473 * x2
  sub   x2, x1, r12, lsl#3      ; x1 - 3784 * x2
  mov   x2, x2, asr#3

;    x3 = (x1 + (W2 - W6) * x3)>>3;
; W2 2676
; W6 1108
  add   r12, x3, x3, lsl#4      ; 17 * x3
  add   r12, r12, x3, lsl#5     ; 49 * x3
  add   x3, x1, r12, lsl#5      ; x1 + 1568 * x3
  mov   x3, x3, asr#3

; fourth stage

;    block[7] = (x8 + x3) >> 14;
  add   r12, x8, x3
  mov   r12, r12, asr#14
  movs  r9, r12, asr#8
  movmi r12, #0
  movgt r12, #255<<24
  moveq r12, r12, lsl#24

;    block[6] = (x0 + x2) >> 14;
  add   r11, x0, x2
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255<<16
  orreq r12, r12, r11, lsl#16

;    block[5] = (x0 - x2) >> 14;
  sub   r11, x0, x2
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255<<8
  orreq r12, r12, r11, lsl#8

;    block[4] = (x8 - x3) >> 14;
  sub   r11, x8, x3
  mov   r11, r11, asr#14
  movs  r9, r11, asr#8
  orrgt r12, r12, #255
  orreq r12, r12, r11

;    block[0] = block[7]
;    block[1] = block[6]
;    block[2] = block[5]
;    block[3] = block[4]
; i.e. r11 is r12 with its four bytes in reverse order
  mov   r9, #255
  orr   r9, r9, #255<<16        ; r9  = &00FF00FF
  and   r7, r12, r9             ; r7  = bytes 0 and 2 of r12, in place
  and   r11, r9, r12, ror#24    ; r11 = byte 3 of r12 in byte 0, byte 1 in byte 2
  orr   r11, r11, r7, ror#8     ; add byte 0 of r12 in byte 3, byte 2 in byte 1

  b     idcts_row_copy_store

 ALIGN 32
idcts_row_copy_none
; Only x0 can be non-zero: every pixel is (x0 + 32) >> 6, which equals
; ((x0 << 8) + 8192) >> 14, saturated to 0..255.
  add   r12, x0, #32
  movs  r12, r12, asr #6
  movmi r12, #0
  cmp   r12, #255
  movgt r12, #255
  orr   r12, r12, r12,lsl #8    ; replicate the pixel into all four bytes
  orr   r12, r12, r12,lsl #16
  mov   r11, r12

idcts_row_copy_store
  ldmia r13, {r0, r3} ; dest, stride
  stmia r0, {r11, r12}    ; pixels 0..3 from r11, pixels 4..7 from r12
  add   r0, r0, r3
  str   r0, [r13, #0]     ; dest now points at the next line

  mov   pc, r14

 ALIGN 32 ; to ensure next file starts on correct boundary

  END
