; Fixed point math ARM code V1.67 17/2/08
; 16e8 code probably doesn't work
; See fixmath.h for usage stuff
; Copyright 2008 Jeffrey Lee
; This file is part of WOUM.
; WOUM is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
; WOUM is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
; You should have received a copy of the GNU General Public License
; along with WOUM.  If not, see <http://www.gnu.org/licenses/>.

	GET bindings.s

F16E8_CODE	EQU	0 ; Set to 1 to enable the unfinished f16e8 code
OLDCODE		EQU	0 ; Set to 1 to enable some old float/double to long long conversion code (Required when there was a bug in GCC's code)
DEBUG		EQU	0 ; Set to 1 to enable the debugging code (Doesn't mean it'll be used, though)

	MACRO
	GEN_DIV $dendacc,$doracc,$outacc,$fname
	; Produce APCS function to divide R0 (which has $dendacc fraction bits) by R1 (which has $doracc fraction bits), producing a result in R0 (with $outacc fraction bits)
	; R2 = count of how many shifts to do
	; R3 = remainder
	; R4 = result
	; R5 = negative flag
	FBEGIN $fname,1
	STMFD R13!,{R4-R5,R14}
	EOR R5,R0,R1 ; R5's sign bit is the sign of the result we want
	CMP R0,#0
	RSBLT R0,R0,#0
	CMP R1,#0
	BEQ %f90 ; Go straight to overflow code for /0
	RSBLT R1,R1,#0
	MOV R2,#32+$outacc+$dendacc-$doracc ; Calculate how many shifts we need
	MOV R3,#0
	MOV R4,#0
91	; Main loop
	MOVS R0,R0,LSL #1
	ADC R3,R3,R3
	MOVS R4,R4,LSL #1
	BMI %f90 ; Overflow
	CMP R3,R1
	SUBHS R3,R3,R1
	ADDHS R4,R4,#1
	SUBS R2,R2,#1
	BLE %f92 ; End
	; Unwind a bit
	MOVS R0,R0,LSL #1
	ADC R3,R3,R3
	MOVS R4,R4,LSL #1
	BMI %f90 ; Overflow
	CMP R3,R1
	SUBHS R3,R3,R1
	ADDHS R4,R4,#1
	SUBS R2,R2,#1
	BGT %b91
92	; End code
	MOV R0,R4
	CMP R5,#0
	RSBLT R0,R0,#0
	LDMFD R13!,{R4-R5,PC}
90	; Overflow code
	CMP R5,#0
	MOVLT R0,#&80000000 ; Maximum negative
	MVNGE R0,#&80000000 ; Maximum positive
	LDMFD R13!,{R4-R5,PC}
	MEND

	AREA	fp_math, CODE, READONLY

	EXPORT	f1616_mul
	EXPORT	f1616_div
	EXPORT	f1616_tridiv
	EXPORT	f1616_sqrt
	EXPORT	f1616_sqr
	EXPORT	f1616_sin
	EXPORT	f1616_cos
	EXPORT	f1616_atan2

	EXPORT	f248_mul
	EXPORT	f248_div
	EXPORT	f248_tridiv
	EXPORT	f248_sqrt
	EXPORT	f248_sqr
	EXPORT	f248_sin
	EXPORT	f248_cos

	EXPORT	f2012_mul
	EXPORT	f2012_div
	EXPORT	f2012_tridiv
	EXPORT	f2012_sqrt
	EXPORT	f2012_sqr
	EXPORT	f2012_sin
	EXPORT	f2012_cos

	EXPORT	f4816_mul
	EXPORT	f4816_div
	EXPORT	f4816_tridiv
	EXPORT	f4816_sqrt
	EXPORT	f4816_sqr
	EXPORT	f4816_sin
	EXPORT	f4816_cos

	[ F16E8_CODE = 1
	EXPORT	f16e8_add
	EXPORT	f16e8_sub
	EXPORT	f16e8_mul
	EXPORT	f16e8_toint
	EXPORT	f16e8_fromint
	]

	EXPORT	int_sqrt

	EXPORT	gen_vlen
	EXPORT	gen_movetox
	EXPORT	gen_movetoy
	EXPORT	gen_movetoz
	EXPORT	gen_pmovetox
	EXPORT	gen_pmovetoy
	EXPORT	gen_inverse

	EXPORT	gen_move2d

	[ OLDCODE = 1
	EXPORT	float_to_long_long
	EXPORT	double_to_long_long
	]

	EXPORT	calc_izgrad

f1616_costab
	DCD 65536, 65535, 65535, 65535, 65535, 65535, 65534, 65534, 65533
	DCD 65532, 65532, 65531, 65530, 65529, 65528, 65527, 65526, 65524
	DCD 65523, 65521, 65520, 65518, 65517, 65515, 65513, 65511, 65509
	DCD 65507, 65505, 65503, 65500, 65498, 65496, 65493, 65490, 65488
	DCD 65485, 65482, 65479, 65476, 65473, 65470, 65467, 65463, 65460
	DCD 65457, 65453, 65449, 65446, 65442, 65438, 65434, 65430, 65426
	DCD 65422, 65418, 65413, 65409, 65404, 65400, 65395, 65390, 65386
	DCD 65381, 65376, 65371, 65366, 65361, 65355, 65350, 65345, 65339
	DCD 65333, 65328, 65322, 65316, 65310, 65304, 65298, 65292, 65286
	DCD 65280, 65273, 65267, 65261, 65254, 65247, 65241, 65234, 65227
	DCD 65220, 65213, 65206, 65199, 65191, 65184, 65176, 65169, 65161
	DCD 65154, 65146, 65138, 65130, 65122, 65114, 65106, 65098, 65090
	DCD 65081, 65073, 65064, 65056, 65047, 65038, 65029, 65021, 65012
	DCD 65002, 64993, 64984, 64975, 64965, 64956, 64946, 64937, 64927
	DCD 64917, 64908, 64898, 64888, 64878, 64868, 64857, 64847, 64837
	DCD 64826, 64816, 64805, 64794, 64784, 64773, 64762, 64751, 64740
	DCD 64729, 64717, 64706, 64695, 64683, 64672, 64660, 64648, 64637
	DCD 64625, 64613, 64601, 64589, 64577, 64565, 64552, 64540, 64527
	DCD 64515, 64502, 64490, 64477, 64464, 64451, 64438, 64425, 64412
	DCD 64399, 64385, 64372, 64359, 64345, 64331, 64318, 64304, 64290
	DCD 64276, 64262, 64248, 64234, 64220, 64206, 64191, 64177, 64162
	DCD 64148, 64133, 64118, 64103, 64088, 64074, 64058, 64043, 64028
	DCD 64013, 63997, 63982, 63967, 63951, 63935, 63920, 63904, 63888
	DCD 63872, 63856, 63840, 63824, 63807, 63791, 63774, 63758, 63741
	DCD 63725, 63708, 63691, 63674, 63657, 63640, 63623, 63606, 63589
	DCD 63571, 63554, 63537, 63519, 63501, 63484, 63466, 63448, 63430
	DCD 63412, 63394, 63376, 63358, 63339, 63321, 63302, 63284, 63265
	DCD 63247, 63228, 63209, 63190, 63171, 63152, 63133, 63114, 63094
	DCD 63075, 63056, 63036, 63016, 62997, 62977, 62957, 62937, 62917
	DCD 62897, 62877, 62857, 62837, 62816, 62796, 62775, 62755, 62734
	DCD 62714, 62693, 62672, 62651, 62630, 62609, 62588, 62566, 62545
	DCD 62524, 62502, 62481, 62459, 62437, 62416, 62394, 62372, 62350
	DCD 62328, 62306, 62284, 62261, 62239, 62217, 62194, 62171, 62149
	DCD 62126, 62103, 62080, 62058, 62034, 62011, 61988, 61965, 61942
	DCD 61918, 61895, 61871, 61848, 61824, 61800, 61776, 61753, 61729
	DCD 61705, 61680, 61656, 61632, 61608, 61583, 61559, 61534, 61510
	DCD 61485, 61460, 61435, 61410, 61385, 61360, 61335, 61310, 61285
	DCD 61259, 61234, 61208, 61183, 61157, 61131, 61105, 61080, 61054
	DCD 61028, 61002, 60975, 60949, 60923, 60896, 60870, 60843, 60817
	DCD 60790, 60763, 60737, 60710, 60683, 60656, 60629, 60601, 60574
	DCD 60547, 60519, 60492, 60464, 60437, 60409, 60381, 60354, 60326
	DCD 60298, 60270, 60242, 60213, 60185, 60157, 60128, 60100, 60071
	DCD 60043, 60014, 59985, 59957, 59928, 59899, 59870, 59841, 59811
	DCD 59782, 59753, 59723, 59694, 59664, 59635, 59605, 59575, 59545
	DCD 59516, 59486, 59456, 59425, 59395, 59365, 59335, 59304, 59274
	DCD 59243, 59213, 59182, 59151, 59121, 59090, 59059, 59028, 58997
	DCD 58965, 58934, 58903, 58871, 58840, 58809, 58777, 58745, 58714
	DCD 58682, 58650, 58618, 58586, 58554, 58522, 58490, 58457, 58425
	DCD 58393, 58360, 58327, 58295, 58262, 58229, 58197, 58164, 58131
	DCD 58098, 58064, 58031, 57998, 57965, 57931, 57898, 57864, 57831
	DCD 57797, 57763, 57730, 57696, 57662, 57628, 57594, 57560, 57525
	DCD 57491, 57457, 57422, 57388, 57353, 57319, 57284, 57249, 57214
	DCD 57179, 57144, 57109, 57074, 57039, 57004, 56969, 56933, 56898
	DCD 56862, 56827, 56791, 56755, 56720, 56684, 56648, 56612, 56576
	DCD 56540, 56503, 56467, 56431, 56395, 56358, 56322, 56285, 56248
	DCD 56212, 56175, 56138, 56101, 56064, 56027, 55990, 55953, 55915
	DCD 55878, 55841, 55803, 55766, 55728, 55691, 55653, 55615, 55577
	DCD 55539, 55501, 55463, 55425, 55387, 55349, 55310, 55272, 55234
	DCD 55195, 55156, 55118, 55079, 55040, 55002, 54963, 54924, 54885
	DCD 54846, 54806, 54767, 54728, 54688, 54649, 54610, 54570, 54530
	DCD 54491, 54451, 54411, 54371, 54331, 54291, 54251, 54211, 54171
	DCD 54131, 54090, 54050, 54009, 53969, 53928, 53888, 53847, 53806
	DCD 53765, 53724, 53683, 53642, 53601, 53560, 53519, 53478, 53436
	DCD 53395, 53353, 53312, 53270, 53229, 53187, 53145, 53103, 53061
	DCD 53019, 52977, 52935, 52893, 52851, 52808, 52766, 52724, 52681
	DCD 52639, 52596, 52553, 52510, 52468, 52425, 52382, 52339, 52296
	DCD 52253, 52210, 52166, 52123, 52080, 52036, 51993, 51949, 51906
	DCD 51862, 51818, 51774, 51730, 51687, 51643, 51599, 51554, 51510
	DCD 51466, 51422, 51377, 51333, 51289, 51244, 51199, 51155, 51110
	DCD 51065, 51020, 50975, 50931, 50886, 50840, 50795, 50750, 50705
	DCD 50660, 50614, 50569, 50523, 50478, 50432, 50386, 50341, 50295
	DCD 50249, 50203, 50157, 50111, 50065, 50019, 49972, 49926, 49880
	DCD 49833, 49787, 49740, 49694, 49647, 49601, 49554, 49507, 49460
	DCD 49413, 49366, 49319, 49272, 49225, 49178, 49130, 49083, 49036
	DCD 48988, 48941, 48893, 48845, 48798, 48750, 48702, 48654, 48606
	DCD 48558, 48510, 48462, 48414, 48366, 48318, 48269, 48221, 48173
	DCD 48124, 48076, 48027, 47978, 47929, 47881, 47832, 47783, 47734
	DCD 47685, 47636, 47587, 47538, 47488, 47439, 47390, 47340, 47291
	DCD 47241, 47192, 47142, 47092, 47043, 46993, 46943, 46893, 46843
	DCD 46793, 46743, 46693, 46643, 46593, 46542, 46492, 46441, 46391
	DCD 46340, 46290, 46239, 46189, 46138, 46087, 46036, 45985, 45934
	DCD 45883, 45832, 45781, 45730, 45679, 45627, 45576, 45525, 45473
	DCD 45422, 45370, 45318, 45267, 45215, 45163, 45112, 45060, 45008
	DCD 44956, 44904, 44852, 44799, 44747, 44695, 44643, 44590, 44538
	DCD 44485, 44433, 44380, 44328, 44275, 44222, 44169, 44117, 44064
	DCD 44011, 43958, 43905, 43852, 43798, 43745, 43692, 43639, 43585
	DCD 43532, 43478, 43425, 43371, 43318, 43264, 43210, 43157, 43103
	DCD 43049, 42995, 42941, 42887, 42833, 42779, 42725, 42670, 42616
	DCD 42562, 42507, 42453, 42398, 42344, 42289, 42235, 42180, 42125
	DCD 42070, 42016, 41961, 41906, 41851, 41796, 41741, 41686, 41630
	DCD 41575, 41520, 41464, 41409, 41354, 41298, 41243, 41187, 41131
	DCD 41076, 41020, 40964, 40908, 40853, 40797, 40741, 40685, 40629
	DCD 40572, 40516, 40460, 40404, 40347, 40291, 40235, 40178, 40122
	DCD 40065, 40009, 39952, 39895, 39839, 39782, 39725, 39668, 39611
	DCD 39554, 39497, 39440, 39383, 39326, 39269, 39211, 39154, 39097
	DCD 39039, 38982, 38924, 38867, 38809, 38752, 38694, 38636, 38578
	DCD 38521, 38463, 38405, 38347, 38289, 38231, 38173, 38115, 38056
	DCD 37998, 37940, 37882, 37823, 37765, 37706, 37648, 37589, 37531
	DCD 37472, 37414, 37355, 37296, 37237, 37178, 37119, 37061, 37002
	DCD 36943, 36883, 36824, 36765, 36706, 36647, 36587, 36528, 36469
	DCD 36409, 36350, 36290, 36231, 36171, 36112, 36052, 35992, 35932
	DCD 35873, 35813, 35753, 35693, 35633, 35573, 35513, 35453, 35393
	DCD 35332, 35272, 35212, 35152, 35091, 35031, 34970, 34910, 34849
	DCD 34789, 34728, 34668, 34607, 34546, 34485, 34425, 34364, 34303
	DCD 34242, 34181, 34120, 34059, 33998, 33937, 33876, 33814, 33753
	DCD 33692, 33630, 33569, 33508, 33446, 33385, 33323, 33262, 33200
	DCD 33138, 33077, 33015, 32953, 32891, 32829, 32768, 32706, 32644
	DCD 32582, 32520, 32457, 32395, 32333, 32271, 32209, 32146, 32084
	DCD 32022, 31959, 31897, 31834, 31772, 31709, 31647, 31584, 31522
	DCD 31459, 31396, 31333, 31271, 31208, 31145, 31082, 31019, 30956
	DCD 30893, 30830, 30767, 30704, 30640, 30577, 30514, 30451, 30387
	DCD 30324, 30261, 30197, 30134, 30070, 30007, 29943, 29880, 29816
	DCD 29752, 29689, 29625, 29561, 29497, 29433, 29369, 29305, 29242
	DCD 29178, 29113, 29049, 28985, 28921, 28857, 28793, 28729, 28664
	DCD 28600, 28536, 28471, 28407, 28342, 28278, 28213, 28149, 28084
	DCD 28020, 27955, 27890, 27826, 27761, 27696, 27631, 27567, 27502
	DCD 27437, 27372, 27307, 27242, 27177, 27112, 27047, 26982, 26916
	DCD 26851, 26786, 26721, 26655, 26590, 26525, 26459, 26394, 26328
	DCD 26263, 26197, 26132, 26066, 26001, 25935, 25869, 25804, 25738
	DCD 25672, 25606, 25541, 25475, 25409, 25343, 25277, 25211, 25145
	DCD 25079, 25013, 24947, 24881, 24815, 24748, 24682, 24616, 24550
	DCD 24483, 24417, 24351, 24284, 24218, 24151, 24085, 24019, 23952
	DCD 23885, 23819, 23752, 23686, 23619, 23552, 23486, 23419, 23352
	DCD 23285, 23218, 23151, 23085, 23018, 22951, 22884, 22817, 22750
	DCD 22683, 22616, 22548, 22481, 22414, 22347, 22280, 22212, 22145
	DCD 22078, 22011, 21943, 21876, 21808, 21741, 21674, 21606, 21539
	DCD 21471, 21404, 21336, 21268, 21201, 21133, 21065, 20998, 20930
	DCD 20862, 20794, 20727, 20659, 20591, 20523, 20455, 20387, 20319
	DCD 20251, 20183, 20115, 20047, 19979, 19911, 19843, 19775, 19707
	DCD 19638, 19570, 19502, 19434, 19365, 19297, 19229, 19160, 19092
	DCD 19024, 18955, 18887, 18818, 18750, 18681, 18613, 18544, 18476
	DCD 18407, 18338, 18270, 18201, 18132, 18064, 17995, 17926, 17857
	DCD 17789, 17720, 17651, 17582, 17513, 17444, 17375, 17306, 17238
	DCD 17169, 17100, 17031, 16961, 16892, 16823, 16754, 16685, 16616
	DCD 16547, 16478, 16408, 16339, 16270, 16201, 16131, 16062, 15993
	DCD 15923, 15854, 15785, 15715, 15646, 15576, 15507, 15438, 15368
	DCD 15299, 15229, 15160, 15090, 15020, 14951, 14881, 14812, 14742
	DCD 14672, 14603, 14533, 14463, 14393, 14324, 14254, 14184, 14114
	DCD 14044, 13975, 13905, 13835, 13765, 13695, 13625, 13555, 13485
	DCD 13415, 13345, 13275, 13205, 13135, 13065, 12995, 12925, 12855
	DCD 12785, 12715, 12645, 12575, 12504, 12434, 12364, 12294, 12224
	DCD 12153, 12083, 12013, 11942, 11872, 11802, 11732, 11661, 11591
	DCD 11520, 11450, 11380, 11309, 11239, 11168, 11098, 11028, 10957
	DCD 10887, 10816, 10746, 10675, 10604, 10534, 10463, 10393, 10322
	DCD 10252, 10181, 10110, 10040, 9969, 9898, 9828, 9757, 9686, 9616
	DCD 9545, 9474, 9403, 9333, 9262, 9191, 9120, 9050, 8979, 8908, 8837
	DCD 8766, 8695, 8625, 8554, 8483, 8412, 8341, 8270, 8199, 8128, 8057
	DCD 7986, 7915, 7844, 7773, 7702, 7631, 7560, 7489, 7418, 7347, 7276
	DCD 7205, 7134, 7063, 6992, 6921, 6850, 6779, 6708, 6637, 6565, 6494
	DCD 6423, 6352, 6281, 6210, 6139, 6067, 5996, 5925, 5854, 5783, 5711
	DCD 5640, 5569, 5498, 5426, 5355, 5284, 5213, 5141, 5070, 4999, 4928
	DCD 4856, 4785, 4714, 4642, 4571, 4500, 4428, 4357, 4286, 4214, 4143
	DCD 4072, 4000, 3929, 3858, 3786, 3715, 3644, 3572, 3501, 3429, 3358
	DCD 3287, 3215, 3144, 3072, 3001, 2930, 2858, 2787, 2715, 2644, 2572
	DCD 2501, 2430, 2358, 2287, 2215, 2144, 2072, 2001, 1929, 1858, 1786
	DCD 1715, 1644, 1572, 1501, 1429, 1358, 1286, 1215, 1143, 1072, 1000
	DCD 929, 857, 786, 714, 643, 571, 500, 428, 357, 285, 214, 142, 71, 0

f1616_costabptr
	DCD f1616_costab

	FBEGIN f1616_mul,1
	; r0,r1=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R2,R3,R0,R1
	MOV R0,R2,LSR #16
	ORR R0,R0,R3,LSL #16
	; Overflow check
	MOVS R3,R3,ASR #16
	ADDLTS R3,R3,#1
	MOVNE R0,#&80000000 ; Maximum negative
	ADDGT R0,R0,#1 ; Maximum positive
	MOV PC,R14
	|
	STMFD R13!,{R4-R8,R14}
	CMP R0,#0
	MOVLT R8,#1
	RSBLT R0,R0,#0
	MOVGE R8,#0
	CMP R1,#0
	EORLT R8,R8,#1 ; Invert the sign bit
	RSBLT R1,R1,#0
	MOV R2,R0,LSR #16 ; R2=High bits (A)
	MOV R3,R1,LSR #16 ; R3=High bits (B)
	BIC R0,R0,R2,LSL #16 ; R0=Low bits (a)
	BIC R1,R1,R3,LSL #16 ; R1=Low bits (b)
	MUL R7,R2,R3 ; AB
	CMP R7,#32768
	BGE f1616_mul_overflow
	MUL R4,R0,R1 ; Multiply - ab
	MUL R5,R0,R3 ; aB
	MUL R6,R2,R1 ; Ab
	; Now recombine...
	MOV R0,R7,LSL #16 ; High bits
	ORR R0,R0,R4,LSR #16 ; Low bits
	ADDS R0,R0,R5 ; Middle bits
	BLT f1616_mul_overflow
	ADDS R0,R0,R6
	BLT f1616_mul_overflow
	CMP R8,#1
	RSBEQ R0,R0,#0
	LDMFD R13!,{R4-R8,PC}
f1616_mul_overflow
	CMP R8,#0
	MOV R0,#&80000000 ; Maximum negative
	SUBEQ R0,R0,#1 ; Maximum positive
	LDMFD R13!,{R4-R8,PC}
	]


	GEN_DIV 16,16,16,f1616_div

	FBEGIN f1616_tridiv,1
	; r0-r2=pointers to numbers
	; r3=number to divide by
	STMFD R13!,{R4-R7,R14}
	; Since muls are so quick, calculate 1/r2 and then mul the others
	; Cheat and use mul/div routines at the moment...
	MOV R4,R0
	MOV R5,R1
	MOV R6,R2
	MOV R0,#65536
	MOV R1,R3
	BL f1616_div
	MOV R7,R0
	MOV R1,R0
	LDR R0,[R4]
	BL f1616_mul
	STR R0,[R4]
	MOV R1,R7
	LDR R0,[R5]
	BL f1616_mul
	STR R0,[R5]
	MOV R1,R7
	LDR R0,[R6]
	BL f1616_mul
	STR R0,[R6]
	LDMFD R13!,{R4-R7,PC}

	FBEGIN f1616_sqrt,1
	; r0 = in
	; r0 = out
	[ TARGET_CPU = "ARM7M"
	; Max number would be 256(*65536)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	MOVLE PC,R14
	MOV R2,#128*65536 ; Shift value
	MOV R1,#0 ; Set up start value
f1616_sqrt_loop
	; Just use UMULL all the way
	UMULL R3,R12,R1,R1
	MOV R3,R3,LSR #16
	ORR R3,R3,R12,LSL #16
	CMP R3,R0
	SUBHI R1,R1,R2 ; Use HI and LO to ignore sign - handles >16384 OK now
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; if answer is found this will be skipped & code will exit
	BNE f1616_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	MOV PC,R14
	|
	STMFD R13!,{R4-R6,R14}
	; Max number would be 256(*65536)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	LDMLEFD R13!,{R4-R6,PC}
	MOV R2,#128*65536 ; Shift value
	MOV R1,#0 ; Set up start value
f1616_sqrt_loop
	; Multiply!
	; Do quick MULs to start off with
	MOVS R3,R1,LSR #16
	BCS f1616_sqrt_part2
	MUL R3,R1,R3 ; X*65536*X=X^2*65536
	CMP R3,R0
	SUBHI R1,R1,R2 ; Use HI and LO to ignore sign - handles >16384 OK now
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; if answer is found this will be skipped & code will exit
	BNE f1616_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
f1616_sqrt_loop2
	MOV R3,R1,LSR #16
f1616_sqrt_part2
	; Need to do a long MUL...
	; R3 is already the high bits
	SUB R4,R1,R3,LSL #16 ; Get low bits
	; Now we need 2xHL+HH+LL...
	MOV R5,R3
	MOV R6,R4
	MUL R5,R3,R5 ; HH
	MUL R6,R4,R6 ; LL
	MUL R3,R4,R3 ; HL
	MOV R6,R6,LSR #16 ; Get to right pos
	ORR R6,R6,R5,LSL #16 ; No need for overflow checks
	ADD R6,R6,R3,LSL #1 ; *2
	CMP R6,R0
	SUBHI R1,R1,R2
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1
	BNE f1616_sqrt_loop2
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
	]

	FBEGIN f1616_sqr,1
	; r0=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R1,R2,R0,R0
	CMP R2,#32768
	MVNGE R0,#&80000000 ; Overflow - set to maximum positive
	MOVLT R0,R1,LSR #16
	ORRLT R0,R0,R2,LSL #16
	MOV PC,R14
	|
	CMP R0,#0
	RSBLT R0,R0,#0
	MOV R1,R0,LSR #16 ; R1=High bits (H)
	BIC R0,R0,R1,LSL #16 ; R0=Low bits (L)
	MOV R2,R0
	MOV R3,R1
	MUL R3,R1,R3 ; HH
	CMP R3,#32768
	BGE f1616_sqr_overflow
	MUL R2,R0,R2 ; LL
	MUL R1,R0,R1 ; HL
	; Now recombine...
	MOV R0,R3,LSL #16 ; High bits
	ORR R0,R0,R2,LSR #16 ; Low bits
	ADDS R0,R0,R1,LSL #1 ; Middle bits
	BCS f1616_sqr_overflow ; Should be OK
	MOV PC,R14
f1616_sqr_overflow
	MVN R0,#&80000000 ; Maximum positive
	MOV PC,R14
	]

f1616_sin ; r0=in,r0=out
	; since sin x=cos (x-90)...
	SUB R0,R0,#90*65536
f1616_cos ; R0=angle, returns value
	CMP R0,#0
	RSBLT R0,R0,#0 ; Make positive
f1616_cos_loop
	CMP R0,#360*65536
	SUBGE R0,R0,#360*65536
	BGT f1616_cos_loop
	LDR R3,f1616_costabptr ; Start this load nice and early
	MOV R1,R0,LSR #12 ; Get location in table
	SUB R0,R0,R1,LSL #12 ; Get interpolation factor
	ADD R2,R1,#1 ; Location of 2nd value 
	; Work out quadrant
	CMP R1,#180*16
	RSBGE R1,R1,#360*16 ; Reflect if >180
	RSBGE R2,R2,#360*16
	CMP R1,#90*16
	CMPGE R2,#90*16
	RSBGE R1,R1,#180*16 ; Reflect again
	RSBGE R2,R2,#180*16
	LDR R1,[R3,R1,LSL #2]
	LDR R2,[R3,R2,LSL #2]
	MUL R2,R0,R2
	RSB R0,R0,#4096
	MLA R1,R0,R1,R2
	RSBGE R1,R1,#0 ; Negate answer if needed
	MOV R0,R1,ASR #12 ; Back to 16.16
	MOV PC,R14

	FBEGIN f1616_atan2,1 ; r0 = x, r1 = y, r0 = out
	; Returns angle such that sin(angle) = x, cos(angle) = y
	; First we need to normalize the values
	STMFD R13!,{R4-R11,R14}
	[ TARGET_CPU = "ARM7M"
	SMULL R5,R3,R0,R0 ; Get length into [R3,R5] like the old version
	SMLAL R5,R3,R1,R1
	CMP R3,#&40000000
	BHS f1616_atan2_vlen_overflow
	MOV R10,#0 ; Result
	MOV R11,#&20000000 ; Twiddle bit
f1616_atan2_vlen_loop
	; Just use UMULL all the way
	UMULL R7,R6,R10,R10
	CMP R6,R3
	CMPEQ R7,R5
	BEQ f1616_atan2_vlen_done
	SUBHI R10,R10,R11 ; Clear twiddle if we've overshot
	ADDLO R10,R10,R11 ; Add twiddle bit if we're too low
	MOVS R11,R11,LSR #1
	BNE f1616_atan2_vlen_loop
	B f1616_atan2_vlen_done
	|
	; Code ripped out of gen_vlen and then reduced to calculate for a 2D vector
	; Use r3 as HH
	; r4 as HL
	; r5 as LL
	CMP R0,#0
	MOVGE R2,R0
	RSBLT R2,R0,#0
	MOV R6,R2,LSR #16 ; High bits
	BIC R7,R2,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R5,R7,R9 ; LL
	MUL R4,R6,R7 ; HL
	MUL R3,R6,R8 ; HH
	MOV R8,R4,LSL #17
	ADDS R5,R5,R8
	ADCS R3,R3,R4,LSR #15
	BLT f1616_atan2_vlen_overflow
	CMP R1,#0
	MOVGE R2,R1
	RSBLT R2,R1,#0
	MOV R6,R2,LSR #16 ; High bits
	BIC R7,R2,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R9,R7,R9 ; LL
	ADDS R5,R5,R9
	ADCS R3,R3,#0 ; Dummy add so the BLT works OK when there was no carry
	BLT f1616_atan2_vlen_overflow
	MUL R4,R6,R7 ; HL
	MUL R6,R8,R6 ; HH
	ADDS R3,R3,R6
	BLT f1616_atan2_vlen_overflow
	MOV R8,R4,LSL #17
	ADDS R5,R5,R8
	ADCS R3,R3,R4,LSR #15
	BLT f1616_atan2_vlen_overflow
	; Now [R3,R5] is value to sqrt
	MOV R10,#0 ; Result
	MOV R11,#&40000000 ; Twiddle bit
f1616_atan2_vlen_loop
	MOV R6,R10,LSR #16 ; High bits
	BIC R7,R10,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R7,R9,R7 ; LL
	MUL R4,R6,R9 ; HL
	MUL R6,R8,R6 ; HH
	MOV R8,R4,LSL #17
	ADDS R7,R7,R8
	ADC R6,R6,R4,LSR #15
	CMP R6,R3
	CMPEQ R7,R5
	BEQ f1616_atan2_vlen_done
	SUBHI R10,R10,R11 ; Clear twiddle if we've overshot
	ADDLO R10,R10,R11 ; Add twiddle bit if we're too low
	MOVS R11,R11,LSR #1
	BNE f1616_atan2_vlen_loop
	B f1616_atan2_vlen_done
	]
f1616_atan2_vlen_overflow
	MVN R10,#&80000000
f1616_atan2_vlen_done
	LDR R6,f1616_costabptr ; Start this load nice and early
	; Now divide by the length :(
	; I'll just cheat and use f1616_div for this
	MOV R5,R1
	MOV R1,R10
	BL f1616_div
	MOV R4,R0
	LDR R7,[R6,#45*16*4] ; Start this load nice and early
	MOV R0,R5
	MOV R1,R10
	BL f1616_div
	MOV R5,R0
	; Now we can do the actual work!
	; R4 = X
	; R5 = Y
	CMP R5,R7
	BGE f1616_atan2_315_to_45
	CMP R4,R7
	BGE f1616_atan2_45_to_135
	CMN R5,R7
	BLE f1616_atan2_135_to_225
	CMN R4,R7
	MOVGT R0,#0 ; Bad args? - just return 0
	LDMGTFD R13!,{R4-R11,PC}
	; Else 225 to 315
	MVN R0,R4 ; Search input
	MOV R1,R5 ; If Y<0 then invert result
	MOV R2,#256*65536 ; Offset
	ADD R2,R2,#14*65536 ; Add up to 270
	B f1616_atan2_search
f1616_atan2_315_to_45
	MOV R0,R5
	MOV R1,R4 ; If X<0 then invert result
	MOV R2,#0 ; Offset
	B f1616_atan2_search
f1616_atan2_45_to_135
	MOV R0,R4
	MVN R1,R5 ; If Y>0 then invert
	MOV R2,#90*65536 ; Offset
	B f1616_atan2_search
f1616_atan2_135_to_225
	MVN R0,R5
	MVN R1,R4 ; If X>0 then invert
	MOV R2,#180*65536 ; Offset
f1616_atan2_search
	MOV R3,#0 ; Current index
	MOV R4,#45*16 ; Twiddle bit (Maybe we only need a twiddle of half this?)
	CMP R0,#65536
	BGE f1616_atan2_search_done ; Safety check to ensure we don't overrun the buffer
f1616_atan2_search_loop
	; Partially unrolled binary search for result
	LDR R5,[R6,R3,LSL #2]
	CMP R5,R0
	ADDGT R3,R3,R4
	SUBLT R3,R3,R4
	MOVNES R4,R4,LSR #1 ; Cunning use of NE to early exit if we've found the exact result
	BEQ f1616_atan2_search_done
	LDR R5,[R6,R3,LSL #2]
	CMP R5,R0
	ADDGT R3,R3,R4
	SUBLT R3,R3,R4
	MOVNES R4,R4,LSR #1
	BNE f1616_atan2_search_loop
f1616_atan2_search_done
	CMP R1,#0
	SUBLT R0,R2,R3,LSL #12
	ADDGE R0,R2,R3,LSL #12
	; Correct output is -180 to 180, but we currently calculate 0 to 360
	; Fix that here
	CMP R0,#180*65536
	SUBGT R0,R0,#360*65536 
	LDMFD R13!,{R4-R11,PC}

	FBEGIN f248_mul,1
	; r0,r1=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R2,R3,R0,R1
	MOV R0,R2,LSR #8
	ORR R0,R0,R3,LSL #24
	; Overflow check
	MOVS R3,R3,ASR #8
	ADDLTS R3,R3,#1
	MOVNE R0,#&80000000 ; Maximum negative
	ADDGT R0,R0,#1 ; Maximum positive
	MOV PC,R14
	|
	STMFD R13!,{R4-R8,R14}
	CMP R0,#0
	MOVLT R8,#1
	RSBLT R0,R0,#0
	MOVGE R8,#0
	CMP R1,#0
	EORLT R8,R8,#1 ; Invert the sign bit
	RSBLT R1,R1,#0
	MOV R2,R0,LSR #16 ; R2=High bits (A)
	MOV R3,R1,LSR #16 ; R3=High bits (B)
	BIC R0,R0,R2,LSL #16 ; R0=Low bits (a)
	BIC R1,R1,R3,LSL #16 ; R1=Low bits (b)
	MUL R7,R2,R3 ; AB
	CMP R7,#256 ; Hmmm... seems a bit low... ah well.
	BGE f248_mul_overflow
	MUL R4,R0,R1 ; Multiply - ab
	MUL R5,R0,R3 ; aB
	TST R5,#&FF000000
	BNE f248_mul_overflow
	MUL R6,R2,R1 ; Ab
	TST R6,#&FF000000
	BNE f248_mul_overflow
	; Now recombine...
	MOV R0,R7,LSL #24 ; High bits
	ORR R0,R0,R4,LSR #8 ; Low bits
	ADDS R0,R0,R5,LSL #8 ; Middle bits
	BLT f248_mul_overflow
	ADDS R0,R0,R6,LSL #8
	BLT f248_mul_overflow
	CMP R8,#1
	RSBEQ R0,R0,#0
	LDMFD R13!,{R4-R8,PC}
f248_mul_overflow
	CMP R8,#0
	MOV R0,#&80000000 ; Maximum negative
	SUBEQ R0,R0,#1 ; Maximum positive
	LDMFD R13!,{R4-R8,PC}
	]


	GEN_DIV 8,8,8,f248_div

	FBEGIN f248_tridiv,1
	; r0-r2=pointers to numbers
	; r3=number to divide by
	STMFD R13!,{R4-R7,R14}
	; Since muls are so quick, calculate 1/r2 and then mul the others
	; Cheat and use mul/div routines at the moment...
	MOV R4,R0
	MOV R5,R1
	MOV R6,R2
	MOV R0,#256
	MOV R1,R3
	BL f248_div
	MOV R7,R0
	MOV R1,R0
	LDR R0,[R4]
	BL f248_mul
	STR R0,[R4]
	MOV R1,R7
	LDR R0,[R5]
	BL f248_mul
	STR R0,[R5]
	MOV R1,R7
	LDR R0,[R6]
	BL f248_mul
	STR R0,[R6]
	LDMFD R13!,{R4-R7,PC}

	FBEGIN f248_sqrt,1
	; r0 = in
	; r0 = out
	[ TARGET_CPU = "ARM7M"
	; Max number would be 4096(*256)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	MOVLE PC,R14
	MOV R2,#2048*256 ; Shift value
	MOV R1,#0 ; Set up start value
f248_sqrt_loop
	; Just use UMULL all the way
	UMULL R3,R12,R1,R1
	MOV R3,R3,LSR #8
	ORR R3,R3,R12,LSL #24
	CMP R3,R0
	SUBHI R1,R1,R2 ; Use HI and LO to ignore sign - handles >16384 OK now
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; if answer is found this will be skipped & code will exit
	BNE f248_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	MOV PC,R14
	|
	STMFD R13!,{R4-R6,R14}
	; Max number would be 4096(*256)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	LDMLEFD R13!,{R4-R6,PC}
	MOV R2,#2048*256 ; Shift value
	MOV R1,#0 ; Set up start value
f248_sqrt_loop
	; Multiply!
	; Do quick MULs to start off with
	MOVS R3,R1,LSR #8
	BCS f248_sqrt_loop2
	MUL R3,R1,R3 ; X*256*X=X^2*256 ;)
	CMP R3,R0
	SUBHI R1,R1,R2
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; Hey! Speed improvement - if answer is found this will be skipped & code will exit
	BNE f248_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
f248_sqrt_loop2
	MOV R3,R1,LSR #16
	; Need to do a long MUL...
	; R3 is already the high bits
	SUB R4,R1,R3,LSL #16 ; Get low bits
	; Now we need 2xHL+HH+LL...
	MOV R5,R3
	MOV R6,R4
	MUL R5,R3,R5 ; HH
	MUL R6,R4,R6 ; LL
	MUL R3,R4,R3 ; HL
	MOV R6,R6,LSR #8 ; Get to right pos
	ORR R6,R6,R5,LSL #24 ; No need for overflow checks
	ADD R6,R6,R3,LSL #9
	CMP R6,R0
	SUBHI R1,R1,R2
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1
	BNE f248_sqrt_loop2
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
	]

	FBEGIN f248_sqr,1
	; r0=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R1,R2,R0,R0
	CMP R2,#128
	MVNGE R0,#&80000000 ; Overflow - set to maximum positive
	MOVLT R0,R1,LSR #8
	ORRLT R0,R0,R2,LSL #24
	MOV PC,R14
	|
	CMP R0,#0
	RSBLT R0,R0,#0
	MOV R1,R0,LSR #16 ; R1=High bits (H)
	BIC R0,R0,R1,LSL #16 ; R0=Low bits (L)
	MOV R2,R0
	MOV R3,R1
	MUL R3,R1,R3 ; HH
	CMP R3,#256
	BGE f248_sqr_overflow
	MUL R2,R0,R2 ; LL
	MUL R1,R0,R1 ; HL
	CMP R1,#&8000000
	BGE f248_sqr_overflow
	; Now recombine...
	MOV R0,R3,LSL #24 ; High bits
	ORR R0,R0,R2,LSR #8 ; Low bits
	ADDS R0,R0,R1,LSL #9 ; Middle bits
	BCS f248_sqr_overflow ; Should be OK
	MOV PC,R14
f248_sqr_overflow
	MVN R0,#&80000000 ; Maximum positive
	MOV PC,R14
	]

f248_sin ; r0=in,r0=out
	; since sin x=cos (x-90)...
	SUB R0,R0,#90*256
f248_cos ; R0=angle, returns value
	CMP R0,#0
	RSBLT R0,R0,#0 ; Make positive
f248_cos_loop
	CMP R0,#360*256
	SUBGE R0,R0,#360*256
	BGT f248_cos_loop
	MOV R1,R0,LSR #4 ; Get location in table
	SUB R0,R0,R1,LSL #4 ; Get interpolation factor
	ADD R2,R1,#1 ; Location of 2nd value 
	; Work out quadrant
	CMP R1,#180*16
	RSBGE R1,R1,#360*16 ; Reflect if >180
	RSBGE R2,R2,#360*16
	CMP R1,#90*16
	CMPGE R2,#90*16
	RSBGE R1,R1,#180*16 ; Reflect again
	RSBGE R2,R2,#180*16
	LDR R3,f1616_costabptr
	LDR R1,[R3,R1,LSL #2]
	LDR R2,[R3,R2,LSL #2]
	MUL R2,R0,R2
	RSB R0,R0,#16
	MLA R1,R0,R1,R2
	RSBGE R1,R1,#0 ; Negate answer if needed
	MOV R0,R1,ASR #12 ; Back to 24.8
	MOV PC,R14

	FBEGIN f2012_mul,1
	; r0,r1=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R2,R3,R0,R1
	MOV R0,R2,LSR #12
	ORR R0,R0,R3,LSL #20
	; Overflow check
	MOVS R3,R3,ASR #12
	ADDLTS R3,R3,#1
	MOVNE R0,#&80000000 ; Maximum negative
	ADDGT R0,R0,#1 ; Maximum positive
	MOV PC,R14
	|
	STMFD R13!,{R4-R8,R14}
	CMP R0,#0
	MOVLT R8,#1
	RSBLT R0,R0,#0
	MOVGE R8,#0
	CMP R1,#0
	EORLT R8,R8,#1 ; Invert the sign bit
	RSBLT R1,R1,#0
	MOV R2,R0,LSR #16 ; R2=High bits (A)
	MOV R3,R1,LSR #16 ; R3=High bits (B)
	BIC R0,R0,R2,LSL #16 ; R0=Low bits (a)
	BIC R1,R1,R3,LSL #16 ; R1=Low bits (b)
	MUL R7,R2,R3 ; AB
	CMP R7,#4096
	BGE f2012_mul_overflow
	MUL R4,R0,R1 ; Multiply - ab
	MUL R5,R0,R3 ; aB
	TST R5,#&F0000000 ; 2^20-(4-12)=2^28
	BNE f248_mul_overflow
	MUL R6,R2,R1 ; Ab
	TST R6,#&F0000000
	BNE f2012_mul_overflow
	; Now recombine...
	MOV R0,R7,LSL #20 ; High bits
	ORR R0,R0,R4,LSR #12 ; Low bits
	ADDS R0,R0,R5,LSL #4 ; Middle bits
	BLT f2012_mul_overflow
	ADDS R0,R0,R6,LSL #4
	BLT f2012_mul_overflow
	CMP R8,#1
	RSBEQ R0,R0,#0
	LDMFD R13!,{R4-R8,PC}
f2012_mul_overflow
	CMP R8,#0
	MOV R0,#&80000000 ; Maximum negative
	SUBEQ R0,R0,#1 ; Maximum positive
	LDMFD R13!,{R4-R8,PC}
	]


	GEN_DIV 12,12,12,f2012_div

	FBEGIN f2012_tridiv,1
	; r0-r2=pointers to numbers
	; r3=number to divide by
	STMFD R13!,{R4-R7,R14}
	; Since muls are so quick, calculate 1/r2 and then mul the others
	; Cheat and use mul/div routines at the moment...
	MOV R4,R0
	MOV R5,R1
	MOV R6,R2
	MOV R0,#4096
	MOV R1,R3
	BL f2012_div
	MOV R7,R0
	MOV R1,R0
	LDR R0,[R4]
	BL f2012_mul
	STR R0,[R4]
	MOV R1,R7
	LDR R0,[R5]
	BL f2012_mul
	STR R0,[R5]
	MOV R1,R7
	LDR R0,[R6]
	BL f2012_mul
	STR R0,[R6]
	LDMFD R13!,{R4-R7,PC}

	FBEGIN f2012_sqrt,1
	; r0 = in
	; r0 = out
	[ TARGET_CPU = "ARM7M"
	; Max number would be 1024(*4096)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	MOVLE PC,R14
	MOV R2,#512*4096 ; Shift value
	MOV R1,#0 ; Set up start value
f2012_sqrt_loop
	; Just use UMULL all the way
	UMULL R3,R12,R1,R1
	MOV R3,R3,LSR #12
	ORR R3,R3,R12,LSL #20
	CMP R3,R0
	SUBHI R1,R1,R2 ; Use HI and LO to ignore sign - handles >16384 OK now
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; if answer is found this will be skipped & code will exit
	BNE f2012_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	MOV PC,R14
	|
	STMFD R13!,{R4-R6,R14}
	; Max number would be 1024(*4096)...
	; First do negative checks
	CMP R0,#0
	MOVLT R0,#0 ; Assume it's meant to be 0
	LDMLEFD R13!,{R4-R6,PC}
	MOV R2,#512*4096 ; Shift value
	MOV R1,#0 ; Set up start value
f2012_sqrt_loop
	; Multiply!
	; Do quick MULs to start off with
	MOVS R3,R1,LSR #12
	BCS f2012_sqrt_loop2
	MUL R3,R1,R3 ; X*4096*X=X^2*4096 ;)
	CMP R3,R0
	SUBHI R1,R1,R2
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1 ; Hey! Speed improvement - if answer is found this will be skipped & code will exit
	BNE f2012_sqrt_loop ; Otherwise the instruction will run and may come to end of cycle
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
f2012_sqrt_loop2
	MOV R3,R1,LSR #16 ; HHHH, +4
	; Need to do a long MUL...
	; R3 is already the high bits
	SUB R4,R1,R3,LSL #16 ; Get low bits - HLLL, -12
	; Now we need 2xHL+HH+LL...
	MOV R5,R3
	MOV R6,R4
	MUL R5,R3,R5 ; HH
	MUL R6,R4,R6 ; LL
	MUL R3,R4,R3 ; HL, -8
	MOV R6,R6,LSR #12 ; Get to right pos
	ORR R6,R6,R5,LSL #20 ; No need for overflow checks
	ADD R6,R6,R3,LSL #5
	CMP R6,R0
	SUBHI R1,R1,R2
	ADDLO R1,R1,R2
	MOVNES R2,R2,LSR #1
	BNE f2012_sqrt_loop2
	MOV R0,R1
	LDMFD R13!,{R4-R6,PC}
	]

	FBEGIN f2012_sqr,1
	; r0=in
	; r0=out
	[ TARGET_CPU = "ARM7M"
	SMULL R1,R2,R0,R0
	CMP R2,#&800
	MVNGE R0,#&80000000 ; Overflow - set to maximum positive
	MOVLT R0,R1,LSR #12
	ORRLT R0,R0,R2,LSL #20
	MOV PC,R14
	|
	CMP R0,#0
	RSBLT R0,R0,#0
	MOV R1,R0,LSR #16 ; R1=High bits (H)
	BIC R0,R0,R1,LSL #16 ; R0=Low bits (L)
	MOV R2,R0
	MOV R3,R1
	MUL R3,R1,R3 ; HH
	CMP R3,#4096
	BGE f2012_sqr_overflow
	MUL R2,R0,R2 ; LL
	MUL R1,R0,R1 ; HL
	CMP R1,#&8000000
	BGE f2012_sqr_overflow
	; Now recombine...
	MOV R0,R3,LSL #20 ; High bits
	ORR R0,R0,R2,LSR #12 ; Low bits
	ADDS R0,R0,R1,LSL #5 ; Middle bits
	BCS f2012_sqr_overflow ; Should be OK
	MOV PC,R14
f2012_sqr_overflow
	MOV R0,#&80000000 ; Maximum negative
	SUB R0,R0,#1 ; Maximum positive
	MOV PC,R14
	]

f2012_sin ; r0=in,r0=out
	; since sin x=cos (x-90)...
	SUB R0,R0,#90*4096
f2012_cos ; R0=angle, returns value
	CMP R0,#0
	RSBLT R0,R0,#0 ; Make positive
f2012_cos_loop
	CMP R0,#360*4096
	SUBGE R0,R0,#360*4096
	BGT f2012_cos_loop
	MOV R1,R0,LSR #8 ; Get location in table
	SUB R0,R0,R1,LSL #8 ; Get interpolation factor
	ADD R2,R1,#1 ; Location of 2nd value 
	; Work out quadrant
	CMP R1,#180*16
	RSBGE R1,R1,#360*16 ; Reflect if >180
	RSBGE R2,R2,#360*16
	CMP R1,#90*16
	CMPGE R2,#90*16
	RSBGE R1,R1,#180*16 ; Reflect again
	RSBGE R2,R2,#180*16
	LDR R3,f1616_costabptr
	LDR R1,[R3,R1,LSL #2]
	LDR R2,[R3,R2,LSL #2]
	MUL R2,R0,R2
	RSB R0,R0,#256
	MLA R1,R0,R1,R2
	RSBGE R1,R1,#0 ; Negate answer if needed
	MOV R0,R1,ASR #12 ; Back to 20.12
	MOV PC,R14


	FBEGIN f4816_mul,1
	; r0,r1=a
	; r2,r3=b
	; return a*b
	; format: hmzl
	;     hmzl
	;     HMZL
	; ---1100-
	; ...   lL  A0=lL>>16+lZ+lM<<16+zL+
	; ...  lZ.     zZ<<16+mL<<16
	; ... lM .  A1=C0+lM>>16+lH+zZ>>16+
	; ...lH  .     zM+zH<<16+mL>>16+mZ+
	; ...  zL.     mM<<16+hL+hZ<<16
	; ... zZ .
	; ...zM  .
	; ..zH   .
	; ... mL .
	; ...mZ  .
	; ..mM   .
	; .mH    .  -- Overflow only, i.e. if (m != 0) && (H != 0) then overflow
	; ...hL  .
	; ..hZ   .
	; .hM    .  -- Overflow only
	; hH.    .  -- Overflow only
	STMFD R13!,{R4-R10,R14}
	CMP R1,#0
	MOV R6,#0
	BGE f4816_mul_apos
	RSBS R0,R0,#0
	RSC R1,R1,#0
	MOV R6,#1
f4816_mul_apos
	CMP R3,#0
	BGE f4816_mul_bpos
	RSBS R2,R2,#0
	RSC R3,R3,#0
	EOR R6,R6,#1
f4816_mul_bpos
	; Now split up vars...
	; Only really need to split one of them, since the other can be split on the fly
	MOV R5,R3,LSR #16 ; H
	SUB R4,R3,R5,LSL #16 ; M
	MOV R3,R2,LSR #16 ; Z
	SUB R2,R2,R3,LSL #16 ; L
	; Calc l values
	MOV R7,R0,LSL #16
	MOV R7,R7,LSR #16 ; l
	MUL R8,R7,R4 ; lM
	MOV R9,R8,LSR #16 ; A1=lM>>16
	MOV R8,R8,LSL #16 ; A0=lM<<16
	MUL R10,R7,R3 ; lZ
	ADDS R8,R8,R10 ; A0+=lZ
	MUL R10,R7,R5 ; lH
	ADCS R9,R9,R10 ; A1+=C0+lH
	BLT f4816_mul_overflow
	MUL R10,R7,R2 ; lL
	ADDS R8,R8,R10,LSR #16 ; A0+=lL>>16
	ADCS R9,R9,#0 ; A1+=C0
	BLT f4816_mul_overflow
	; Now do z
	MOV R7,R0,LSR #16 ; z
	MUL R10,R7,R2 ; zL
	ADDS R8,R8,R10 ; A0+=zL
	MUL R10,R7,R3 ; zZ
	ADCS R9,R9,R10,LSR #16 ; A1+=zZ>>16+C0
	BLT f4816_mul_overflow
	ADDS R8,R8,R10,LSL #16 ; A0+=zZ<<16
	MUL R10,R7,R5 ; zH
	ADCS R9,R9,R10,LSL #16 ; A1+=C0+zH<<16
	MULGE R10,R7,R4 ; zM
	ADDGES R9,R9,R10 ; A1+=zM, defer overflow branch
	BLT f4816_mul_overflow
	; Do m
	MOV R7,R1,LSL #16
	MOV R7,R7,LSR #16 ; m
	CMP R7,#0
	CMPNE R5,#0
	BNE f4816_mul_overflow ; Both m and H !=0, so overflow
	MUL R10,R7,R2 ; mL
	ADDS R0,R8,R10,LSL #16 ; A0+=mL<<16
	ADCS R9,R9,R10,LSR #16 ; A1+=C0+mL>>16
	MULGE R10,R7,R3 ; mZ
	ADDGES R9,R9,R10 ; A1+=mZ
	MULGE R10,R7,R4 ; mM
	ADDGES R9,R9,R10,LSL #16 ; A1+=mM<<16
	BLT f4816_mul_overflow ; 3 overflow checks in 1
	; Do h
	MOV R7,R1,LSR #16 ; h
	ORRS R10,R5,R4 ; If M or H...
	CMPNE R7,#0 ; ... and h...
	BNE f4816_mul_overflow ; ... then overflow
	MUL R10,R7,R2 ; hL
	ADDS R1,R9,R10 ; A1+=hL
	MULGE R10,R7,R3 ; hZ
	ADDGES R1,R1,R10,LSL #16 ; A1+=hZ<<16
	BLT f4816_mul_overflow
	CMP R6,#0
	LDMEQFD R13!,{R4-R10,PC}
	RSBS R0,R0,#0
	RSC R1,R1,#0
	LDMFD R13!,{R4-R10,PC}
f4816_mul_overflow
	CMP R6,#0
	MOV R0,#0
	MOV R1,#&80000000 ; Negative
	SUBEQ R0,R0,#1 ; Or positive if positive
	SUBEQ R1,R1,#1
	LDMFD R13!,{R4-R10,PC}

	FBEGIN f4816_div,1
	; r1.r0=r1.r0/r3.r2
	; yay.
	STMFD R13!,{R4-R10,R14}
	; Hmm...
	; Use standard fourier div, but need to have 16 more bits to the result, and shift extra data in from other regs
	CMP R1,#0
	MOVGE R5,#0 ; Negative answer flag
	BGE f4816_div_pos
	RSBS R0,R0,#0
	RSC R1,R1,#0
	MOV R5,#1
f4816_div_pos
	CMP R2,#0
	CMPEQ R3,#0
	BEQ f4816_div_overflow ; Go straight to overflow code for /0
	CMP R3,#0
	BGE f4816_div_pos2
	RSBS R2,R2,#0
	RSC R3,R3,#0
	EOR R5,R5,#1 ; Invert the sign bit
f4816_div_pos2
	MOV R6,#80 ; count (64+16 extra)
	MOV R7,#0 ; Remainder
	MOV R8,#0
	MOV R9,#0 ; Result
	MOV R10,#0
f4816_div_loop
	MOVS R0,R0,LSL #1 ; Shift dividend left
	ADCS R1,R1,R1
	ADCS R7,R7,R7 ; Shift remainder left & add 1 if dividend overflows
	ADC R8,R8,R8
	MOVS R9,R9,LSL #1 ; Shift result left
	ADCS R10,R10,R10
	BLT f4816_div_overflow
	CMP R8,R3 ; Remainder >= divisor?
	CMPEQ R7,R2
	BLO f4816_div_skip ; Nope (Use unsigned comparison since lower word won't have a sign bit)
	SUBS R7,R7,R2 ; Yes, so decrease
	SBC R8,R8,R3
	ADDS R9,R9,#1 ; And increase result
	ADDCS R10,R10,#1
f4816_div_skip
	SUBS R6,R6,#1
	BGT f4816_div_loop
	MOV R0,R9 ; Transfer result
	MOV R1,R10
	CMP R5,#0
	LDMEQFD R13!,{R4-R10,PC}
	RSBS R0,R0,#0
	RSC R1,R1,#0
	LDMFD R13!,{R4-R10,PC}
f4816_div_overflow
	CMP R5,#0
	MOV R0,#0 ; Maximum negative
	MOV R1,#&80000000
	SUBEQ R0,R0,#1 ; Maximum positive
	SUBEQ R1,R1,#1
	LDMFD R13!,{R4-R10,PC}

	FBEGIN f4816_tridiv,1
	; r0-r2=pointers to numbers
	; r3,stack1=number to divide by
	STMFD R13!,{R4-R8,R14}
	MOV R4,R0
	MOV R5,R1
	MOV R6,R2
	MOV R7,R3
	LDR R8,[R13,#24] ; Get 1st APCS stack reg
	LDMIA R4,{R0-R1}
	MOV R2,R7
	MOV R3,R8
	BL f4816_div
	STMIA R4,{R0-R1}
	LDMIA R5,{R0-R1}
	MOV R2,R7
	MOV R3,R8
	BL f4816_div
	STMIA R5,{R0-R1}
	LDMIA R6,{R0-R1}
	MOV R2,R7
	MOV R3,R8
	BL f4816_div
	STMIA R6,{R0-R1}
	LDMFD R13!,{R4-R8,PC}

	FBEGIN f4816_sqrt,1
	; r0,r1 = in
	; r0,r1 = out
	STMFD R13!,{R4-R12,R14}
	; Max number would be 2^24(*65536)...
	; First do negative checks
	CMP R1,#0
	MOVLT R1,#0 ; Assume it's meant to be 0
	MOVLT R0,#0
	LDMLTFD R13!,{R4-R12,PC}
	MOV R8,#0 ; Start shift value (low)
	MOV R9,#&80 ; Start shift value (high)
	MOV R10,#0 ; Start result value
	MOV R11,#0
f4816_sqrt_loop
	; Do long mul since I cba doing lots of short ones
	MOV R3,R10,LSR #16 ; Z
	SUB R2,R10,R3,LSL #16 ; L
	MOV R6,R11,LSR #16 ; H
	SUB R5,R11,R6,LSL #16 ; M
	MOV R12,R2
	MUL R12,R2,R12 ; LL
	MOV R12,R12,LSR #16
	MUL R14,R2,R3 ; ZL
	ADDS R12,R12,R14,LSL #1
	MOV R14,R14,LSR #31
	ADDCS R14,R14,#1
	MUL R7,R2,R5 ; LM
	ADDS R12,R12,R7,LSL #17
	ADC R14,R14,R7,LSR #15
	MOV R7,R3
	MUL R7,R3,R7 ; ZZ
	ADDS R12,R12,R7,LSL #16
	ADC R14,R14,R7,LSR #16
	MUL R7,R2,R6 ; LH
	MLA R7,R3,R5,R7 ; ZM
	ADD R14,R14,R7,LSL #1
	MUL R7,R3,R6 ; ZH
	ADD R14,R14,R7,LSL #17
	MOV R7,R5
	MUL R7,R5,R7 ; MM
	ADD R14,R14,R7,LSL #16
	; Now R12,R14 is sqr of R10,R11
	CMP R14,R1
	CMPEQ R12,R0
	BEQ f4816_sqrt_found
	BHI f4816_sqrt_hi
	ADDS R10,R10,R8 ; Increase our guess
	ADC R11,R11,R9
	MOVS R9,R9,LSR #1
	MOV R8,R8,RRX
	CMP R8,#0
	CMPEQ R9,#0
	BNE f4816_sqrt_loop
f4816_sqrt_found
	MOV R0,R10
	MOV R1,R11
	LDMFD R13!,{R4-R12,PC}
f4816_sqrt_hi
	SUBS R10,R10,R8 ; Decrease our guess
	SBC R11,R11,R9
	MOVS R9,R9,LSR #1
	MOV R8,R8,RRX
	CMP R8,#0
	CMPEQ R9,#0
	BNE f4816_sqrt_loop
	MOV R0,R10
	MOV R1,R11
	LDMFD R13!,{R4-R12,PC}


	FBEGIN f4816_sqr,1
	; r0,r1=in
	; r0,r1=out
	CMP R1,#0
	BGE f4816_sqr_pos
	RSBS R0,R0,#0
	RSC R1,R1,#0
f4816_sqr_pos
	STMFD R13!,{R4-R7,R14}
	MOV R3,R0,LSR #16 ; Z
	SUB R2,R0,R3,LSL #16 ; L
	MOVS R6,R1,LSR #16 ; H
	BNE f4816_sqr_overflow ; Overflow check
	SUB R5,R1,R6,LSL #16 ; M
	;     HMZL                 
	;     HMZL
	; ---1100-
	; ...   LL  A0=LL>>16+ZL<<1+LM<<17+ZZ<<16
	; ...  LZ.  A1=LM>>15+LH<<1+ZZ>>16+ZM<<1+ZH<<17+MM<<16+ZL>>31
	; ... LM .
	; ...LH  .
	; ...  ZL.
	; ... ZZ .
	; ...ZM  .
	; ..ZH   .
	; ... ML .
	; ...MZ  .
	; ..MM   .
	; .MH    .  -- Overflow check only
	; ...HL  .
	; ..HZ   .
	; .HM    .  -- Overflow check only
	; HH.    .  -- Overflow check only, encompasses above checks
	MOV R0,R2
	MUL R0,R2,R0 ; LL
	MOV R0,R0,LSR #16
	MUL R1,R2,R3 ; ZL
	ADDS R0,R0,R1,LSL #1
	MOV R1,R1,LSR #31
	ADDCS R1,R1,#1
	MUL R7,R2,R5 ; LM
	ADDS R0,R0,R7,LSL #17
	ADC R1,R1,R7,LSR #15
	MOV R7,R3
	MUL R7,R3,R7 ; ZZ
	ADDS R0,R0,R7,LSL #16
	ADC R1,R1,R7,LSR #16
	MUL R7,R2,R6 ; LH
	MLA R7,R3,R5,R7 ; ZM
	ADD R1,R1,R7,LSL #1
	MUL R7,R3,R6 ; ZH
	ADD R1,R1,R7,LSL #17
	MOV R7,R5
	MUL R7,R5,R7 ; MM
	ADD R1,R1,R7,LSL #16
	CMP R1,#0 ; Extra overflow check. Adding S to the ADD above doesn't seem to work though.
	LDMGEFD R13!,{R4-R7,PC}
f4816_sqr_overflow
	MOV R0,#0
	MOV R1,#&80000000
	SUB R0,R0,#1
	SUB R1,R1,#1
	LDMFD R13!,{R4-R7,PC}

f4816_sin
	; since sin x=cos (x-90)
	SUB R0,R0,#90*65536
f4816_cos
	STMFD R13!,{R14}
	BL f1616_cos ; Ignore top word (could cause problems if angle overflows into it though)
	CMP R0,#0
	MOV R1,#0
	SUBLT R1,R1,#1
	LDMFD R13!,{PC}

	[ F16E8_CODE = 1

	FBEGIN f16e8_add,1
	; r0,r1=in
	; r0=out
	STMFD R13!,{R4-R7,R14}
	; Need to get shift difference...
	MOV R2,R0,LSL #24
	MOV R3,R1,LSL #24
	SUBS R4,R2,R3
	MOV R5,R0,ASR #16
	MOV R6,R1,ASR #16
	MOVGT R6,R6,ASR R4
	RSBLT R4,R4,#0
	MOVLT R5,R5,ASR R4
	MOVLT R2,R3
	ADD R0,R5,R6
	CMP R0,#0
	MOVLT R7,#1
	MOVGE R7,#0
	RSBLT R0,R0,#0
	CMP R0,#&00008000
	MOVLT R0,R0,LSL #16
	ADDGE R2,R2,#16*&1000000
	CMP R0,#&00800000
	MOVLT R0,R0,LSL #8
	SUBLT R2,R2,#8*&1000000
	CMP R0,#&08000000
	MOVLT R0,R0,LSL #4
	SUBLT R2,R2,#4*&1000000
	CMP R0,#&20000000
	MOVLT R0,R0,LSL #2
	SUBLT R2,R2,#2*&1000000
	CMP R0,#&40000000
	MOVLT R0,R0,LSL #1
	SUBLT R2,R2,#1*&1000000
	CMP R7,#1
	RSBNE R0,R0,#0
	MOV R0,R0,LSR #16
	MOV R0,R0,LSL #16
	ORR R0,R0,R2,LSR #24
	LDMFD R13!,{R4-R7,PC}

	FBEGIN f16e8_sub,1
	; r0,r1=in
	; r0=out
	STMFD R13!,{R4-R7,R14}
	; Need to get shift difference...
	MOV R2,R0,LSL #24
	MOV R3,R1,LSL #24
	SUBS R4,R2,R3
	MOV R5,R0,ASR #16
	MOV R6,R1,ASR #16
	MOVGT R6,R6,ASR R4
	RSBLT R4,R4,#0
	MOVLT R5,R5,ASR R4
	MOVLT R2,R3
	SUB R0,R5,R6
	CMP R0,#0
	MOVLT R7,#1
	MOVGE R7,#0
	RSBLT R0,R0,#0
	CMP R0,#&00008000
	MOVLT R0,R0,LSL #16
	ADDGE R2,R2,#16*&1000000
	CMP R0,#&00800000
	MOVLT R0,R0,LSL #8
	SUBLT R2,R2,#8*&1000000
	CMP R0,#&08000000
	MOVLT R0,R0,LSL #4
	SUBLT R2,R2,#4*&1000000
	CMP R0,#&20000000
	MOVLT R0,R0,LSL #2
	SUBLT R2,R2,#2*&1000000
	CMP R0,#&40000000
	MOVLT R0,R0,LSL #1
	SUBLT R2,R2,#1*&1000000
	CMP R7,#1
	RSBNE R0,R0,#0
	MOV R0,R0,LSR #16
	MOV R0,R0,LSL #16
	ORR R0,R0,R2,LSR #24
	LDMFD R13!,{R4-R7,PC}

	FBEGIN f16e8_mul,1
	STMFD R13!,{R4,R14}
	MOV R2,R0,LSL #24
	MOV R3,R1,LSL #24
	MOV R0,R0,ASR #16
	MOV R1,R1,ASR #16
	MUL R0,R1,R0
	ADD R2,R2,R3
	CMP R0,#0
	MOVLT R4,#1
	MOVGE R4,#0
	RSBLT R0,R0,#0
	CMP R0,#&00008000
	MOVLT R0,R0,LSL #16
	ADDGE R2,R2,#16*&1000000
	CMP R0,#&00800000
	MOVLT R0,R0,LSL #8
	SUBLT R2,R2,#8*&1000000
	CMP R0,#&08000000
	MOVLT R0,R0,LSL #4
	SUBLT R2,R2,#4*&1000000
	CMP R0,#&20000000
	MOVLT R0,R0,LSL #2
	SUBLT R2,R2,#2*&1000000
	CMP R0,#&40000000
	MOVLT R0,R0,LSL #1
	SUBLT R2,R2,#1*&1000000
	CMP R4,#1
	RSBNE R0,R0,#0
	MOV R0,R0,LSR #16
	MOV R0,R0,LSL #16
	ORR R0,R0,R2,LSR #24
	LDMFD R13!,{R4,PC}

	FBEGIN f16e8_toint,1
	MOV R1,R0,ASR #16
	MOVS R2,R0,LSL #24
	RSBLT R2,R2,#0
	MOVGE R0,R1,LSL R2
	MOVLT R0,R1,ASR R2
	MOV PC,R14

	FBEGIN f16e8_fromint,1
	MOV R2,#0
	CMP R0,#0
	MOVLT R1,#1
	MOVGE R1,#0
	RSBLT R0,R0,#0
	CMP R0,#&00008000
	MOVLT R0,R0,LSL #16
	ADDLT R2,R2,#16*&1000000
	CMP R0,#&00800000
	MOVLT R0,R0,LSL #8
	SUBLT R2,R2,#8*&1000000
	CMP R0,#&08000000
	MOVLT R0,R0,LSL #4
	SUBLT R2,R2,#4*&1000000
	CMP R0,#&20000000
	MOVLT R0,R0,LSL #2
	SUBLT R2,R2,#2*&1000000
	CMP R0,#&40000000
	MOVLT R0,R0,LSL #1
	SUBLT R2,R2,#1*&1000000
	CMP R1,#1
	RSBNE R0,R0,#0
	MOV R0,R0,LSR #16
	MOV R0,R0,LSL #16
	ORR R0,R0,R2,LSR #24
	MOV PC,R14

	]

	FBEGIN gen_vlen,1 ; r0,r1,r2=in,r0=out
	[ TARGET_CPU = "ARM7M"
	STMFD R13!,{R4-R5,R14}
	SMULL R5,R3,R0,R0
	SMLAL R5,R3,R1,R1
	SMLAL R5,R3,R2,R2
	CMP R3,#&40000000
	BHS vlen_overflow
	MOV R0,#0 ; Result
	MOV R1,#&20000000 ; Twiddle bit
vlen_loop
	; Just use UMULL all the way
	UMULL R4,R2,R0,R0
	CMP R2,R3
	CMPEQ R4,R5
	SUBHI R0,R0,R1 ; Clear twiddle if we've overshot
	ADDLO R0,R0,R1 ; Add twiddle bit if we're too low
	MOVNES R1,R1,LSR #1
	BNE vlen_loop
	LDMFD R13!,{R4-R5,PC}
vlen_overflow
	MVN R0,#&80000000
	LDMFD R13!,{R4-R5,PC}
	|
	STMFD R13!,{R4-R9,R14}
	; Use r3 as HH
	; r4 as HL
	; r5 as LL
	CMP R0,#0
	RSBLT R0,R0,#0
	CMP R1,#0
	RSBLT R1,R1,#0
	CMP R2,#0
	RSBLT R2,R2,#0
	MOV R6,R0,LSR #16 ; High bits
	BIC R7,R0,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R5,R7,R9 ; LL
	MUL R4,R6,R7 ; HL
	MUL R3,R6,R8 ; HH
	MOV R8,R4,LSL #17
	ADDS R5,R5,R8
	ADCS R3,R3,R4,LSR #15
	BLT vlen_overflow
	MOV R6,R1,LSR #16 ; High bits
	BIC R7,R1,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R9,R7,R9 ; LL
	ADDS R5,R5,R9
	ADCS R3,R3,#0 ; Dummy add so the BLT works OK when there was no carry
	BLT vlen_overflow
	MUL R4,R6,R7 ; HL
	MUL R6,R8,R6 ; HH
	ADDS R3,R3,R6
	BLT vlen_overflow
	MOV R8,R4,LSL #17
	ADDS R5,R5,R8
	ADCS R3,R3,R4,LSR #15
	BLT vlen_overflow
	CMP R2,#0 ; 2D optimisation - set R0,R1 ready for looping and branch straight to loop if R2 == 0
	MOV R0,#0 ; Result
	MOV R1,#&40000000 ; Twiddle bit
	BEQ vlen_loop
	MOV R6,R2,LSR #16 ; High bits
	BIC R7,R2,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R9,R7,R9 ; LL
	ADDS R5,R5,R9
	ADCS R3,R3,#0
	BLT vlen_overflow
	MUL R4,R6,R7 ; HL
	MUL R6,R8,R6 ; HH
	ADDS R3,R3,R6 ; Why the fsck won't a MLAS do instead?
	BLT vlen_overflow
	MOV R8,R4,LSL #17
	ADDS R5,R5,R8
	ADCS R3,R3,R4,LSR #15
	BLT vlen_overflow
	; Now [R3,R5] is value to sqrt
vlen_loop
;	ADD R0,R0,R1 ; Set twiddle
	MOV R6,R0,LSR #16 ; High bits
	BIC R7,R0,R6,LSL #16 ; Low bits
	MOV R8,R6 ; High 2
	MOV R9,R7 ; Low 2
	MUL R7,R9,R7 ; LL
	MUL R4,R6,R9 ; HL
	MUL R6,R8,R6 ; HH
	MOV R8,R4,LSL #17
	ADDS R7,R7,R8
	ADC R6,R6,R4,LSR #15
	CMP R6,R3
	CMPEQ R7,R5
	LDMEQFD R13!,{R4-R9,PC}
	SUBHI R0,R0,R1 ; Clear twiddle if we've overshot
	ADDLO R0,R0,R1 ; Add twiddle bit if we're too low
	MOVS R1,R1,LSR #1
	BNE vlen_loop
	LDMFD R13!,{R4-R9,PC}
vlen_overflow
	MVN R0,#&80000000
	LDMFD R13!,{R4-R9,PC}
	]

; fast moveto* routine
; know that target point is between two coords
; so can use binary search to find correct x,y,z values
; (for movetox) need dx to be positive to enable search termination checks
; operation:
; 1. calc D
; 2. add half D to compute mid point (and divide D by 2)
; 3. if dx<0 then invert D
; 4. divide D by 2
; 5. if dx=0 finish (account for error?) - will this ever occur?
; 6. if x>targx, subtract D, else if x<targx add
; 7. if x!=targx go to 4
; 8. finish
	FBEGIN gen_movetox,1 ; r0=vec A, r1=vec B, r2=targ x, r3=targ vec (can be a, b, or something else
	STMFD R13!,{R4-R7,R14}
	LDMIA R1,{R5-R7}
	LDMIA R0,{R0-R1,R4}
	; Calc D
	SUB R5,R5,R0
	SUB R6,R6,R1
	SUB R7,R7,R4
	MOVS R5,R5,ASR #1 ; Divide by 2
	MOV R6,R6,ASR #1
	MOV R7,R7,ASR #1
	ADD R0,R0,R5 ; Add back on to compute mid point
	ADD R1,R1,R6
	ADD R4,R4,R7
	RSBMI R5,R5,#0 ; Invert D if <0
	RSBMI R6,R6,#0
	RSBMI R7,R7,#0
gen_movetox_loop
	MOVS R5,R5,ASR #1 ; Divide D by 2
	MOV R6,R6,ASR #1 ; Use ASR since dy/dz may be negative
	MOV R7,R7,ASR #1
	BEQ gen_movetox_finish ; Finish if out of range
	CMP R0,R2
	ADDLT R0,R0,R5 ; Add D if we are below the mark
	ADDLT R1,R1,R6
	ADDLT R4,R4,R7
	SUBGT R0,R0,R5 ; Else subtract
	SUBGT R1,R1,R6
	SUBGT R4,R4,R7
	CMP R0,R2
	BNE gen_movetox_loop
gen_movetox_finish
	MOV R0,R2 ; Just to make sure it gets set to the right value
	STMIA R3,{R0-R1,R4}
	LDMFD R13!,{R4-R7,PC}

	FBEGIN gen_movetoy,1 ; r0=vec A, r1=vec B, r2=targ y, r3=targ vec
	STMFD R13!,{R4-R7,R14}
	LDMIA R1,{R5-R7}
	LDMIA R0,{R0-R1,R4}
	; Calc D
	SUB R5,R5,R0
	SUB R6,R6,R1
	SUB R7,R7,R4
	MOV R5,R5,ASR #1
	MOVS R6,R6,ASR #1
	MOV R7,R7,ASR #1
	ADD R0,R0,R5
	ADD R1,R1,R6
	ADD R4,R4,R7
	RSBMI R5,R5,#0
	RSBMI R6,R6,#0
	RSBMI R7,R7,#0
gen_movetoy_loop
	MOV R5,R5,ASR #1
	MOVS R6,R6,ASR #1
	MOV R7,R7,ASR #1
	BEQ gen_movetoy_finish
	CMP R1,R2
	ADDLT R0,R0,R5
	ADDLT R1,R1,R6
	ADDLT R4,R4,R7
	SUBGT R0,R0,R5
	SUBGT R1,R1,R6
	SUBGT R4,R4,R7
	CMP R1,R2
	BNE gen_movetoy_loop
gen_movetoy_finish
	; MOV R1,R2 <-- Not needed since we can store R2 directly
	STMIA R3,{R0,R2,R4}
	LDMFD R13!,{R4-R7,PC}

	FBEGIN gen_movetoz,1 ; r0=vec A, r1=vec B, r2=targ z, r3=targ vec
	STMFD R13!,{R4-R7,R14}
	LDMIA R1,{R5-R7}
	LDMIA R0,{R0-R1,R4}
	; Calc D
	SUB R5,R5,R0
	SUB R6,R6,R1
	SUB R7,R7,R4
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOVS R7,R7,ASR #1
	ADD R0,R0,R5
	ADD R1,R1,R6
	ADD R4,R4,R7
	RSBMI R5,R5,#0
	RSBMI R6,R6,#0
	RSBMI R7,R7,#0
gen_movetoz_loop
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOVS R7,R7,ASR #1
	BEQ gen_movetoz_finish
	CMP R4,R2
	ADDLT R0,R0,R5
	ADDLT R1,R1,R6
	ADDLT R4,R4,R7
	SUBGT R0,R0,R5
	SUBGT R1,R1,R6
	SUBGT R4,R4,R7
	CMP R4,R2
	BNE gen_movetoz_loop
gen_movetoz_finish
	; MOV R4,R2 <-- Not needed since we can store R2 directly
	STMIA R3,{R0-R2}
	LDMFD R13!,{R4-R7,PC}

; General perspective moveto* routines (For x and y only)
; Works by comparing x (or y) against z
	FBEGIN gen_pmovetox,1 ; r0=vec A, r1=vec B, r2=negation flag, r3=vec out
	STMFD R13!,{R4-R8,R14}
	LDMIA R1,{R5-R7}
	LDMIA R0,{R0-R1,R4}
	MOV R8,#32 ; Number of steps to make (since we can't rely on values becoming 0)
	; Flip A and B x if negation flag says so
	CMP R2,#0
	RSBNE R0,R0,#0
	RSBNE R5,R5,#0
	; Calc D
	SUB R5,R5,R0
	SUB R6,R6,R1
	SUB R7,R7,R4
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOV R7,R7,ASR #1
	ADD R0,R0,R5
	ADD R1,R1,R6
	ADD R4,R4,R7
	; Now flip D if dz-dx<0
	CMP R7,R5
	RSBLT R5,R5,#0
	RSBLT R6,R6,#0
	RSBLT R7,R7,#0
	; Loop!
gen_pmovetox_loop
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOV R7,R7,ASR #1
	CMP R0,R4
	SUBLT R0,R0,R5
	SUBLT R1,R1,R6
	SUBLT R4,R4,R7
	ADDGT R0,R0,R5
	ADDGT R1,R1,R6
	ADDGT R4,R4,R7
	CMP R0,R4 ; If not reached targ
	SUBNES R8,R8,#1 ; And not lost accuracy
	BNE gen_pmovetox_loop ; Then loop again
	CMP R2,#0 ; Negate?
	MOVEQ R0,R4 ; Clamp value if not equal
	RSBNE R0,R4,#0 ; Clamp to negative value
	STMIA R3,{R0-R1,R4}
	LDMFD R13!,{R4-R8,PC}

	FBEGIN gen_pmovetoy,1 ; r0=vec A, r1=vec B, r2=negation flag, r3=vec out
	STMFD R13!,{R4-R8,R14}
	LDMIA R1,{R5-R7}
	LDMIA R0,{R0-R1,R4}
	MOV R8,#32 ; Number of steps to make (since we can't rely on values becoming 0)
	; Flip A and B y if negation flag says so
	CMP R2,#0
	RSBNE R1,R1,#0
	RSBNE R6,R6,#0
	; Calc D
	SUB R5,R5,R0
	SUB R6,R6,R1
	SUB R7,R7,R4
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOV R7,R7,ASR #1
	ADD R0,R0,R5
	ADD R1,R1,R6
	ADD R4,R4,R7
	; Now flip D if dz-dy<0
	CMP R7,R6
	RSBLT R5,R5,#0
	RSBLT R6,R6,#0
	RSBLT R7,R7,#0
	; Loop!
gen_pmovetoy_loop
	MOV R5,R5,ASR #1
	MOV R6,R6,ASR #1
	MOV R7,R7,ASR #1
	CMP R1,R4
	SUBLT R0,R0,R5
	SUBLT R1,R1,R6
	SUBLT R4,R4,R7
	ADDGT R0,R0,R5
	ADDGT R1,R1,R6
	ADDGT R4,R4,R7
	CMP R1,R4 ; If not reached targ
	SUBNES R8,R8,#1 ; And not lost accuracy
	BNE gen_pmovetoy_loop ; Then loop again
	CMP R2,#0 ; Negate?
	MOVEQ R1,R4 ; Clamp value if not equal
	RSBNE R1,R4,#0 ; Clamp to negative value
	STMIA R3,{R0-R1,R4}
	LDMFD R13!,{R4-R8,PC}

	FBEGIN gen_move2d,1 ; r0=x1, r1=y1, r2=x2, r3=y2, stack1=targ x
	; Returns the corresponding y value in r0
	LDR R12,[R13,#0] ; Get targ x
	; Calc D
	SUB R2,R2,R0
	SUB R3,R3,R1
	MOVS R2,R2,ASR #1 ; Divide by 2
	MOV R3,R3,ASR #1
	ADD R0,R0,R2 ; Add back on to compute mid point
	ADD R1,R1,R3
	RSBMI R2,R2,#0 ; Invert D if <0
	RSBMI R3,R3,#0
gen_move2d_loop
	MOVS R2,R2,ASR #1 ; Divide D by 2
	MOV R3,R3,ASR #1
	MOVEQ R0,R1 ; Exit if nothing left to shift by
	MOVEQ PC,R14
	CMP R0,R12
	ADDLT R0,R0,R2 ; Add D if we are below the mark
	ADDLT R1,R1,R3
	SUBGT R0,R0,R2 ; Else subtract
	SUBGT R1,R1,R3
	CMP R0,R12
	BNE gen_move2d_loop
	MOV R0,R1
	MOV PC,R14


	FBEGIN int_sqrt,1 ; R0=in, R0=out
	MOVS R1,R0
	MOV R0,#0
	MOVLE PC,R14 ; Return with 0 if <= 0
	MOV R2,#&8000 ; twiddle bit
int_sqrt_loop
	MOV R3,R0
	MUL R3,R0,R3 ; Calc value
	CMP R3,R1
	SUBHI R0,R0,R2 ; Decrease guess if we're too high
	ADDLO R0,R0,R2 ; Increase guess if we're too low
	MOVNES R2,R2,LSR #1
	MOVEQ PC,R14 ; Return if we've got the answer, or ran out of bits to twiddle
	; .. unwind the loop a bit ..
	MOV R3,R0
	MUL R3,R0,R3 ; Calc value
	CMP R3,R1
	SUBHI R0,R0,R2 ; Decrease guess if we're too high
	ADDLO R0,R0,R2 ; Increase guess if we're too low
	MOVNES R2,R2,LSR #1
	BNE int_sqrt_loop
	MOV PC,R14

	[ OLDCODE = 1

	FBEGIN float_to_long_long,1
	MOV R1,R0,LSR #31 ; R1=Sign bit
	MOV R2,R0,LSR #23 ; R2=Exponent
	BIC R0,R0,R2,LSL #23 ; R0=Fraction
	AND R2,R2,#255
	CMP R2,#0 ; Exponent=0?
	MOVEQ R0,#0 ; Yes, so return 0
	MOVEQ R1,#0 ; (I.e. fraction is also 0 hence number is 0, or fraction != 0 in which case 0.fraction*2^-126 is considered too small & rounded to 0)
	MOVEQ PC,R14
	CMP R2,#189 ; Max exponent we can allow is 62, since 63 will land the 1 digit in the sign slot. Tracing that back results in an R2 of 62+127 = 189
	BGT float_to_long_long_big
	; Else exponent >0 and <190, so treat as 1.fraction*2^(exponent-127)
	ORR R3,R0,#&800000 ; Add the 1
	SUBS R2,R2,#127+23 ; Adjust exponent
	MOVGE R0,R3,LSL R2 ; Calculate low word
	RSBLT R2,R2,#0
	MOVLT R0,R3,LSR R2
	RSBLT R2,R2,#0
	SUBS R2,R2,#32
	MOVGE R3,R3,LSL R2 ; Calculate high word
	RSBLT R2,R2,#0
	MOVLT R3,R3,LSR R2
	CMP R1,#0
	MOV R1,R3 ; Get high word to correct reg
	MOVEQ PC,R14 ; Return
	RSBS R0,R0,#0 ; Or negate
	RSC R1,R1,#0
	MOV PC,R14
float_to_long_long_big ; For big numbers, including infinity and NaN
	CMP R1,#0
	MOV R0,#0
	MOV R1,#&80000000 ; -infinity
	SUBEQ R0,R0,#1
	SUBEQ R1,R1,#1 ; +infinity
	MOV PC,R14

	FBEGIN double_to_long_long,1
	MOV R2,R1 ; Swap input around
	MOV R1,R0
	MOV R0,R2
	MOV R2,R1,LSR #31 ; R2=Sign bit
	MOV R3,R1,LSR #20 ; R3=Exponent
	BIC R1,R1,R3,LSL #20 ; R1=fraction
	BIC R3,R3,#&800 ; Clear sign bit from exponent
	CMP R3,#0 ; Exponent=0?
	MOVEQ R0,#0 ; Yes, so return 0
	MOVEQ R1,#0 ; (I.e. fraction is also 0 hence number is 0, or fraction != 0 in which case 0.fraction*2^-1022 is considered too small & rounded to 0)
	MOVEQ PC,R14
	SUB R3,R3,#1024 ; Adjust exponent
	CMP R3,#61
	BGT double_to_long_long_big ; Too big to fit in result, or NaN
	SUBS R3,R3,#19+32 ; Adjust again so the 1 bit will lie in bit 0 of R0
	ORR R1,R1,#&100000 ; Add the 1 bit
	BGE double_to_long_long_left ; Shift left needed
	; Else shift right
	RSB R3,R3,#0
	MOV R0,R0,LSR R3 ; Shift lower word right
	RSBS R3,R3,#32
	ORRGE R0,R0,R1,LSL R3 ; Add corresponding bits from upper word
	RSBLT R3,R3,#0
	ORRLT R0,R0,R1,LSR R3
	RSBLT R3,R3,#0
	RSB R3,R3,#32
	MOV R1,R1,LSR R3 ; And shift upper word right
	CMP R2,#0
	MOVEQ PC,R14
	RSBS R0,R0,#0
	RSC R1,R1,#0
	MOV PC,R14
double_to_long_long_left
	MOV R1,R1,LSL R3 ; Shift upper word left
	RSB R3,R3,#32
	ORR R1,R1,R0,LSR R3 ; Add corresponding bits from lower word
	RSB R3,R3,#32
	MOV R0,R0,LSL R3 ; Shift lower word left
	CMP R2,#0
	MOVEQ PC,R14
	RSBS R0,R0,#0
	RSC R1,R1,#0
	MOV PC,R14
double_to_long_long_big ; For big numbers, including infinity and NaN
	CMP R2,#0
	MOV R0,#0
	MOV R1,#&80000000 ; -infinity
	SUBEQ R0,R0,#1
	SUBEQ R1,R1,#1 ; +infinity
	MOV PC,R14

	]

	FBEGIN gen_inverse,1
	; r0=(1/r1)*2^r0
	; yay.
	STMFD R13!,{R4-R5,R14}
	; Hmm...
	; Use standard fourier div, but need to have r0 more bits to the result...
	MOV R5,#0
	CMP R1,#0
	BEQ gen_inverse_overflow ; Go straight to overflow code for /0
	MOVLT R5,#1 ; Invert the sign bit if needed
	RSBLT R1,R1,#0
	ADD R2,R0,#32 ; Number of bits we need to produce
	MOV R0,#1 ; dividend
	MOV R3,#0 ; Remainder
	MOV R4,#0 ; Result
	CMP R1,#65536 ; Skip early non-dividing shifts if possible
	SUBGE R2,R2,#16
	MOVGE R0,#65536
	CMP R1,R0,LSL #8 ; And again
	SUBGE R2,R2,#8
	MOVGE R0,R0,LSL #8
gen_inverse_loop
	MOVS R0,R0,LSL #1
	ADC R3,R3,R3
	MOVS R4,R4,LSL #1
	BMI gen_inverse_overflow
	CMP R3,R1
	SUBHS R3,R3,R1
	ADDHS R4,R4,#1 ; ADC?
	SUBS R2,R2,#1
	BGT gen_inverse_loop
	MOV R0,R4
	CMP R5,#1 ; If 1, make negative
	RSBEQ R0,R0,#0
	LDMFD R13!,{R4-R5,PC}
gen_inverse_overflow
	CMP R5,#0
	MOV R0,#&80000000 ; Maximum negative
	SUBEQ R0,R0,#1 ; Maximum positive
	LDMFD R13!,{R4-R5,PC}

	[ DEBUG = 1

debugblock
	DCD 0,0,0,0,0,0,0,0,0,0,0,0,0

debugstr
	DCD 0,0,0

	FBEGIN dodebug,1
	STMFD R13!,{R0-R3,R14}
	ADR R14,debugblock
	STMIA R14,{R0-R12}
	MOV R3,#0
debugloop
	LDR R0,[R14,R3]
	ADR R1,debugstr
	MOV R2,#12
	SWI &D4 ; ConvertHex8
	SWI &2 ; Write0
	ADD R3,R3,#4
	SWI &100+32
	TST R3,#&F
	SWIEQ &3 ; NewLine
	CMP R3,#13*4
	BLT debugloop
	SWI &3
	SWI &3
	LDMFD R13!,{R0-R3,PC}

	]

	FBEGIN calc_izgrad,1
	; R0 = INVZ_ACCURACY/0x2000
	; R1 = span_?
	; R2 -> d (Can be negative)
	; R3 -> nrm.? (Can be negative)
	; Returns answer
	STMFD R13!,{R4-R11,R14}
	LDMIA R3,{R4,R5} ; numerator
	LDMIA R2,{R2,R3} ; denominator
;	BL dodebug
	MOV R12,#0 ; Negative answer by default
	CMP R5,#0
	BGE calc_izgrad_numpos
	RSBS R4,R4,#0
	RSC R5,R5,#0
	MOV R12,#1
calc_izgrad_numpos
	CMP R3,#0
	BGE calc_izgrad_denpos
	RSBS R2,R2,#0
	RSC R3,R3,#0
	EOR R12,R12,#1
calc_izgrad_denpos
;	BL dodebug
	; Now multiply
	; T=A*N
	; B=S*D
	; R=-T/B
	;A      10
	;N    3210
	;T  543210
	;AN 13
	;    03
	;    12
	;     02
	;     11
	;      01
	;      10
	;       00
	MOV R6,R0,LSR #16 ; A1
	SUB R0,R0,R6,LSL #16 ; A0
	MOV R7,R5,LSR #16 ; N3
	SUB R5,R5,R7,LSL #16 ; N2
	MUL R8,R6,R7 ; T45 = A1*N3
	MUL R9,R0,R5 ; T32 = A0*N2
	MUL R10,R0,R7 ; T43 = A0*N3
	ADDS R9,R9,R10,LSL #16
	ADC R8,R8,R10,LSR #16
	MUL R10,R6,R5 ; T43 = A1*N2
	ADDS R9,R9,R10,LSL #16
	ADC R8,R8,R10,LSR #16
	MOV R5,R4,LSR #16 ; N1
	SUB R4,R4,R5,LSL #16 ; N0
	MUL R10,R6,R5 ; A1*N1
	ADDS R9,R9,R10
	ADDCS R8,R8,#1
	MUL R10,R0,R4 ; T10 = A0*N0
	MUL R11,R0,R5 ; T21 = A0*N1
	ADDS R10,R10,R11,LSL #16
	ADCS R9,R9,R11,LSR #16
	ADDCS R8,R8,#1
	MUL R11,R6,R4 ; T21 = A1*N0
	ADDS R10,R10,R11,LSL #16
	ADCS R9,R9,R11,LSR #16
	ADDCS R8,R8,#1
;	BL dodebug
	; Now calculate denominator
	; S      10
	; D    3210
	; B  543210
	; SD 13
	;     03
	;     12
	;      02
	;      11
	;       01
	;       10
	;        00
	MOV R0,R1,LSR #16 ; S1
	SUB R1,R1,R0,LSL #16 ; S0
	MOV R4,R3,LSR #16 ; D3
	SUB R3,R3,R4,LSL #16 ; D2
	MUL R5,R0,R4 ; B54 = S1*D3
	MUL R6,R1,R3 ; B32 = S0*D2
	MUL R7,R1,R4 ; B43 = S0*D3
	ADDS R6,R6,R7,LSL #16
	ADC R5,R5,R7,LSR #16
	MUL R7,R0,R3 ; B43 = S1*D2
	ADDS R6,R6,R7,LSL #16
	ADC R5,R5,R7,LSR #16
	MOV R3,R2,LSR #16 ; D1
	SUB R2,R2,R3,LSL #16 ; D0
	MUL R7,R0,R3 ; S1*D1
	ADDS R6,R6,R7
	ADDCS R5,R5,#1
	MUL R7,R1,R2 ; B10 = S0*D0
	MUL R11,R1,R3 ; B21 = S0*D1
	ADDS R7,R7,R11,LSL #16
	ADCS R6,R6,R11,LSR #16
	ADDCS R5,R5,#1
	MUL R11,R0,R2 ; B21 = S1*D0
	ADDS R7,R7,R11,LSL #16
	ADCS R6,R6,R11,LSR #16
	ADDCS R5,R5,#1
;	BL dodebug
	; Now divide!
	; R0={R8,R9,R10}/{R5,R6,R7}
	MOV R0,#0 ; Result
	MOV R1,#0 ; Remainder
	MOV R2,#0
	MOV R3,#0
	MOV R4,#96 ; count
calc_izgrad_loop
;	BL dodebug
	MOVS R10,R10,LSL #1 ; Shift dividend (aka numerator) left
	ADCS R9,R9,R9
	ADCS R8,R8,R8
	ADCS R3,R3,R3 ; Shift remainder left & add 1 if dividend overflows (Using same big-endian format to make life easier)
	ADCS R2,R2,R2
	ADC R1,R1,R1
	MOVS R0,R0,LSL #1 ; Shift result left
	BMI calc_izgrad_overflow
	CMP R1,R5 ; Remainder >= divisor?
	CMPEQ R2,R6
	CMPEQ R3,R7
	BLO calc_izgrad_skip ; Nope
	SUBS R3,R3,R7 ; Yes, so decrease
	SBCS R2,R2,R6
	SBC R1,R1,R5
	ADD R0,R0,#1 ; And increase result
calc_izgrad_skip
	SUBS R4,R4,#1
	BGT calc_izgrad_loop
	CMP R12,#0
	RSBEQ R0,R0,#1 ; Negate result if needed
	LDMFD R13!,{R4-R11,PC}
calc_izgrad_overflow
	CMP R12,#0
	MVNNE R0,#&80000000 ; +max
	MOVEQ R0,#&80000000 ; -max
	LDMFD R13!,{R4-R11,PC}

	END
