; ARM code version of IDEA stuff, (1) for speed and (2) because
; Cv5 actually compiles ideaExpandKey *wrongly*.
; Unfortunately IDEA assumes MSB first, so we have to reverse everything.

; Please see the note at the start of ideaExpandKey regarding
; alignment.

; Number of passes through the main round loop in ideaCipher.
IdeaRounds EQU 8

; Register names used by ideaCipher.
; r0-r2 carry the three arguments; r3 counts rounds down to zero.
inbuf RN 0
outbuf RN 1
key RN 2
round RN 3
; x1..x4: the four 16-bit working values of the block. Only the low
; 16 bits are meaningful; ideaCipher masks them where it matters.
x1 RN 4
x2 RN 5
x3 RN 6
x4 RN 7
; t1, t2: scratch (t1 also receives each subkey as it is loaded).
t1 RN 8
t2 RN 9
; s2, s3: copies of x2 and x3 kept across the middle of a round.
; s2 is also used as a byte mask outside the round loop.
s2 RN 10
s3 RN 11
sp RN 13
lr RN 14
pc RN 15

; Plain register names used by ideaExpandKey.
; Note that r4-r8 are the same registers as x1-x4 and t1 above.
r0 RN 0
r1 RN 1
r2 RN 2
r3 RN 3
r4 RN 4
r5 RN 5
r6 RN 6
r7 RN 7
r8 RN 8
ip RN 12

	; Both routines live in one read-only code area and are exported.
	AREA |A$$Code|, CODE, READONLY
	EXPORT ideaCipher
	EXPORT ideaExpandKey

; ideaCipher: push one 8-byte block through the IDEA rounds using an
; already expanded subkey table.
;
; On entry:
;   r0 -> inbuf  (8 bytes, fetched with two word loads)
;   r1 -> outbuf (8 bytes, written with two word stores)
;   r2 -> key    (16-bit subkeys, consumed in order: six per round for
;                 IdeaRounds rounds, then four for the output step)
; On exit:
;   r4-r11 are preserved; r0-r3 have been modified.
;
; The block is treated as four 16-bit values stored high byte first, so
; the bytes of each halfword are swapped on the way in and on the way out.
;
; Subkeys are fetched with a word LDR from an address that advances by 2,
; and only the low 16 bits of the loaded word are used.
; NOTE(review): this relies on the CPU delivering the addressed halfword
; in the low 16 bits of an LDR from a 2-mod-4 address - confirm for the
; target core.
;
; The upper 16 bits of x1..x4 may hold junk between steps (the additions
; use the unmasked loaded word); they are masked off before every
; multiply and when the result is packed for output.

	MACRO
$label	IdeaUnpack $first,$second
	; Split the word in t1 into two 16-bit values, swapping the bytes
	; of each halfword. Needs s2 = &FF. Uses t2 as scratch.
$label	AND	$first,s2,t1		; byte 0 (high byte of first value)
	AND	t2,s2,t1,LSR#8		; byte 1 (low byte of first value)
	ADD	$first,t2,$first,LSL#8
	AND	$second,s2,t1,LSR#16	; byte 2 (high byte of second value)
	AND	t2,s2,t1,LSR#24		; byte 3 (low byte of second value)
	ADD	$second,t2,$second,LSL#8
	MEND

	MACRO
$label	IdeaMul	$acc,$exit
	; IDEA multiply of acc by the subkey just loaded into t1
	; (multiplication modulo 65537, with 0 standing for 65536).
	; Every path ends at the label given as the second argument, which
	; must directly follow the invocation. Uses t1 and t2 as scratch.
$label	MOV	t1,t1,LSL#16
	MOVS	t1,t1,LSR#16		; t1 = subkey (16 bits), Z if it is 0
	RSBEQ	$acc,$acc,#1		; subkey == 0: result is 1 - acc
	BEQ	$exit
	MOV	$acc,$acc,LSL#16
	MOVS	$acc,$acc,LSR#16	; acc = low 16 bits, Z if that is 0
	RSBEQ	$acc,t1,#1		; acc == 0: result is 1 - subkey
	BEQ	$exit
	MUL	$acc,t1,$acc		; full 32-bit product (Rd differs from Rm)
	MOV	t1,$acc,LSR#16		; t1 = high half of product
	BIC	t2,$acc,t1,LSL#16	; t2 = low half of product
	SUBS	$acc,t2,t1		; low - high ...
	ADDLO	$acc,$acc,#1		; ... plus 1 if that borrowed
	MEND

	MACRO
$label	IdeaPack $first,$second
	; Build in t1 the output word holding two 16-bit values, each
	; stored high byte first. Needs s2 = &00FF00FF. Uses s3 as scratch
	; and leaves both source registers shifted left by 16.
$label	MOV	$first,$first,LSL#16	; first  = fH fL 00 00
	MOV	$second,$second,LSL#16	; second = sH sL 00 00
	ORR	t1,$second,$first,LSR#16 ; t1 = sH sL fH fL
	AND	s3,s2,t1		; s3 = 00 sL 00 fL
	AND	t1,s2,t1,LSR#8		; t1 = 00 sH 00 fH
	ADD	t1,t1,s3,LSL#8		; t1 = sL sH fL fH
	MEND

ideaCipher
	STMFD	sp!,{x1-s3,lr}		; r4-r11 + lr: keep in step with the RN numbers
	MOV	round,#IdeaRounds
	MOV	s2,#&FF			; byte mask for unpacking

	; Fetch the block as x1..x4.
	LDR	t1,[inbuf],#4
	IdeaUnpack x1,x2
	LDR	t1,[inbuf]
	IdeaUnpack x3,x4

	; ---- one round per pass; six subkeys consumed per pass ----
icRound	LDR	t1,[key],#2
	IdeaMul	x1,icDone1		; x1 = x1 (*) subkey
icDone1	LDR	t1,[key],#2
	ADD	x2,x2,t1		; x2 += subkey
	LDR	t1,[key],#2
	ADD	x3,x3,t1		; x3 += subkey
	LDR	t1,[key],#2
	IdeaMul	x4,icDone2		; x4 = x4 (*) subkey

icDone2	MOV	s3,x3			; remember x3 for the end of the round
	EOR	x3,x3,x1
	LDR	t1,[key],#2
	IdeaMul	x3,icDone3		; x3 = (x3 ^ x1) (*) subkey

icDone3	MOV	s2,x2			; remember x2 for the end of the round
	EOR	x2,x2,x4
	ADD	x2,x2,x3
	LDR	t1,[key],#2
	IdeaMul	x2,icDone4		; x2 = ((x2 ^ x4) + x3) (*) subkey

icDone4	ADD	x3,x3,x2
	EOR	x1,x1,x2
	EOR	x4,x4,x3
	EOR	x2,x2,s3		; x2 and x3 take each other's saved values,
	EOR	x3,x3,s2		; which also swaps the two middle values
	SUBS	round,round,#1
	BNE	icRound

	; ---- output step: four more subkeys; x3 is handled before x2 ----
	LDR	t1,[key],#2
	IdeaMul	x1,icDone5
icDone5	LDR	t1,[key],#2
	ADD	x3,x3,t1
	LDR	t1,[key],#2
	ADD	x2,x2,t1
	LDR	t1,[key]		; last subkey: pointer is not advanced
	IdeaMul	x4,icDone6

	; Write the result in the order x1, x3, x2, x4.
icDone6	MOV	s2,#&FF
	ORR	s2,s2,s2,LSL#16		; s2 = 00 FF 00 FF
	IdeaPack x1,x3
	STR	t1,[outbuf],#4
	IdeaPack x2,x4
	STR	t1,[outbuf]

	LDMFD	sp!,{x1-s3,pc}		; must match the STMFD register list above

; ideaExpandKey: build the 52-halfword subkey table from a 16-byte user key.
;
; On entry:
;   r0 -> userkey (byte const *), 16 bytes
;   r1 -> EK (word16 *), room for 52 halfwords (104 bytes)
; On exit:
;   r4-r8 are preserved; r0-r3 and ip have been modified.
;
; Alignment: EK *must* be 4-aligned, because it is written with STR and
; STM and read back with LDM. (The original author notes that this holds
; wherever ideaExpandKey is actually used.) userkey is read a byte at a
; time, so it needs no alignment at all - not even 2-alignment.
;
; Register use:
;   phase 1:  r0 = userkey cursor, r1 = EK cursor, r2 = byte just read,
;             r3 = word being assembled, r4 = words left to write
;   phase 2:  r0 = EK write cursor, r1-r4 = current 128-bit key state,
;             r5-r8 = state rotated left by 25 bits, ip = passes left
ideaExpandKey
	STMFD	sp!,{r4-r8,lr}

	; Phase 1: the user key becomes EK[0..7]. Each halfword is stored
	; high byte first in the user key, so the bytes are swapped while
	; two halfwords at a time are packed into one word.
	MOV	r4,#4
ekCopy	LDRB	r3,[r0],#1		; b0
	LDRB	r2,[r0],#1		; b1
	ORR	r3,r2,r3,LSL#8		; bits 15..0  = b0:b1 (even entry)
	LDRB	r2,[r0],#1		; b2
	ORR	r3,r3,r2,LSL#24		; bits 31..24 = b2 \ odd entry,
	LDRB	r2,[r0],#1		; b3              } deliberately 24
	ORR	r3,r3,r2,LSL#16		; bits 23..16 = b3 / before 16
	STR	r3,[r1],#4
	SUBS	r4,r4,#1
	BGT	ekCopy

	; r1 has advanced 16 bytes; step back to the start of EK, pick up
	; the eight entries just written, and leave r0 pointing at EK[8].
	SUB	r0,r1,#16
	MOV	ip,#6
	LDMIA	r0!,{r1-r4}		; r1=EK[0,1] r2=EK[2,3] r3=EK[4,5] r4=EK[6,7]

	; Phase 2: every pass rotates the 128-bit state r1:r2:r3:r4 left by
	; 25 bits, giving the next eight entries. Not tuned for speed; it
	; runs rarely.
ekGroup	MOV	r1,r1,ROR#16		; put the lower-numbered entry of
	MOV	r2,r2,ROR#16		; each pair in the top half, so the
	MOV	r3,r3,ROR#16		; four words read as one 128-bit
	MOV	r4,r4,ROR#16		; value, most significant first

	MOV	r5,r1,LSL#25		; top 7 bits come from this word ...
	MOV	r6,r2,LSL#25
	MOV	r7,r3,LSL#25
	MOV	r8,r4,LSL#25
	ORR	r5,r5,r2,LSR#7		; ... low 25 from the next; r5: 1,2 2,3
	ORR	r6,r6,r3,LSR#7		; r6: 3,4 4,5
	ORR	r7,r7,r4,LSR#7		; r7: 5,6 6,7
	ORR	r8,r8,r1,LSR#7		; r8: 7,0 0,1 (wraps round)

	MOV	r1,r5,ROR#16		; back to in-memory halfword order;
	MOV	r2,r6,ROR#16		; this is also the state for the
	MOV	r3,r7,ROR#16		; next pass
	MOV	r4,r8,ROR#16

	SUBS	ip,ip,#1
	STMGTIA	r0!,{r1-r4}		; passes 1-5: store all eight entries
	BGT	ekGroup
	STMIA	r0!,{r1,r2}		; pass 6: only four more are needed
					; (8 + 5*8 + 4 = 52 entries)
	LDMFD	sp!,{r4-r8,pc}

	END
