; Function profiling ARM code V1.00 10/1/04
; See jprof.h for usage stuff
; Copyright 2008 Jeffrey Lee
; This file is part of WOUM.
; WOUM is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
; WOUM is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
; You should have received a copy of the GNU General Public License
; along with WOUM.  If not, see <http://www.gnu.org/licenses/>.

; Register name aliases: bind the names R0-R14 and PC to register
; numbers 0-15 so the code below can use them symbolically.
R0	RN	0
R1	RN	1
R2	RN	2
R3	RN	3
R4	RN	4
R5	RN	5
R6	RN	6
R7	RN	7
R8	RN	8
R9	RN	9
R10	RN	10
R11	RN	11
R12	RN	12
R13	RN	13
R14	RN	14
PC	RN	15

; SWI number constants. X_Bit is added to the base SWI number to form
; XOS_ReadMonotonicTime. In this file that SWI is referenced only by the
; commented-out "SWI XOS_ReadMonotonicTime" lines in the enter/exit
; handlers; the live code reads the time through timeptr instead.
X_Bit	EQU	&20000
XOS_ReadMonotonicTime		EQU	&000042 + X_Bit

	AREA	jprof, CODE, READONLY

	EXPORT	__cyg_profile_func_enter
	EXPORT	__cyg_profile_func_exit
	EXPORT	jprof_save

	IMPORT	malloc
	IMPORT	memset
	IMPORT	atexit
	IMPORT	fopen
	IMPORT	fprintf
	IMPORT	fclose
	IMPORT	|Image$$RO$$Base|
	IMPORT	|Image$$RO$$Limit|

; ---- Profiler state words ----
; NOTE: stackbase, stack, block and ltime must remain contiguous and in
; exactly this order. Both __cyg_profile_func_enter and
; __cyg_profile_func_exit fetch all four with a single LDMIA starting at
; the address of stackbase.

state
	DCD 0 ; 0=not initialised, 1=initialised (set by jprof_init)

stackbase
	DCD 0 ; Pointer to base of 4K stack of (function data obj ptr, entry time) pairs

stack
	DCD 0 ; Top of function data obj ptr stack (i.e. first blank entry); equals stackbase when empty

block
	DCD 0 ; Pointer to memory block used as index to function data (indexed by code address - RO base)

ltime
	DCD 0 ; Time value read at the most recent profiling call (enter or exit)

timeptr
	DCD 0 ; Address of time value (result of the SWI issued in jprof_init)

; ---- Strings passed to fopen / fprintf by jprof_save ----

savestr
	DCB "jprof_save",0 ; Default output file name used by the atexit handler
	ALIGN

accessstr
	DCB "w",0 ; fopen mode
	ALIGN

format1
	DCB "%08x\t",0 ; Function address in hex
	ALIGN

format2
	DCB "%s",0 ; Function name
	ALIGN

format3
	DCB "\t%u",0 ; One counter (call count / exclusive time / inclusive time)
	ALIGN

format4
	DCB "\n",0 ; End of record
	ALIGN

nullstr
	DCD 0 ; Empty string, printed when no name is found before the function

; Local copies of the linker-provided image bounds, kept here so the
; code can fetch them with a PC-relative LDR.

local_ro_base
	DCD |Image$$RO$$Base|

local_ro_limit
	DCD |Image$$RO$$Limit|

; Each per-function data block (12 bytes, allocated by jprof_makeentry)
; has the following layout (byte offset - contents):
; 0 - function call count
; 4 - exclusive time spent in function
; 8 - inclusive time spent in function

__cyg_profile_func_enter ; R0=function ptr, R1=call site
	; Function-entry hook.
	; In:  R0 = address of the function being entered
	;      R1 = call site (unused)
	; Out: R4-R8 preserved; R0-R3 corrupted
	;
	; Steps:
	; 1. Initialise the profiler on first use
	; 2. Get current time
	; 3. If func stack is not empty, add the time elapsed since the last
	;    profiling call to the exclusive time of the function on top
	; 4. If the address is inside [RO base, RO limit): find (or create)
	;    its data block, increase its call count and push
	;    (data ptr, entry time) onto the func stack
	; 5. Record the current time in ltime
	;
	; Register use after set-up:
	;   R0 = current time      R4 = func ptr, later index slot address
	;   R5 = stack base        R6 = stack top
	;   R7 = index block       R8 = ltime
	;
	; NOTE: the func stack is a fixed 4096-byte allocation (see
	; jprof_init) and no overflow check is made here.
	STMFD R13!,{R4-R8,R14}
	LDR R2,state
	CMP R2,#0
	BLEQ jprof_init ; Not initialised yet: set up (preserves R0)
	MOV R4,R0 ; Preserve func ptr
;	SWI XOS_ReadMonotonicTime ; Get time
	LDR R0,timeptr ; Get time
	LDR R0,[R0]
	ADR R5,stackbase
	LDMIA R5,{R5-R8} ; stack base, stack, block, ltime
	CMP R5,R6
	BEQ _enter_stackempty
	; Increase exclusive time of caller.
	; The elapsed time must be ADDED to the stored total; storing the
	; delta on its own would discard everything accumulated so far.
	LDR R1,[R6,#-8] ; Ptr to caller's data block (top stack entry)
	SUB R2,R0,R8 ; Elapsed time since last profiling call
	LDR R3,[R1,#4] ; Exclusive time so far
	ADD R3,R3,R2 ; Accumulate
	STR R3,[R1,#4] ; Store increased exclusive time
_enter_stackempty
	; Need to translate to data ptr.
	; Accept base <= addr < limit only. Addresses are compared unsigned,
	; and addr == limit is rejected because its index slot would lie
	; just past the end of the index block.
	LDR R1,local_ro_limit
	CMP R4,R1
	BHS _enter_oob ; At or above limit: out of range!
	LDR R1,local_ro_base
	CMP R4,R1
	BLO _enter_oob ; Below base: out of range!
	SUB R4,R4,R1 ; Get offset from base
	ADD R4,R7,R4 ; Add our base -> address of index slot
	LDR R1,[R4] ; Get data ptr
	CMP R1,#0
	BLEQ jprof_makeentry ; No block yet: create one (returns it in R1)
	; Now increase call count of func
	LDR R2,[R1] ; Get call count
	ADD R2,R2,#1 ; Increase
	STR R2,[R1] ; Store
	; Now store onto stack
	STR R1,[R6],#4 ; func data ptr
	STR R0,[R6],#4 ; Entry time
	STR R6,stack ; Update stack ptr
_enter_oob
	STR R0,ltime ; Update last profiling call time
	LDMFD R13!,{R4-R8,PC}

jprof_makeentry ; Make a blank block for function in R4, return data ptr in R1
	; In:  R4 = address of the index slot to fill in
	; Out: R1 = pointer to a new 12-byte block with all three words
	;           (call count, exclusive time, inclusive time) zeroed;
	;           the same pointer is also written to [R4]
	;      R0, R2, R3 preserved
	STMFD R13!,{R0,R2-R3,R14}
	MOV R0,#12 ; Three words per block
	BL malloc
	MOV R1,R0 ; Result register for our caller
	MOV R2,#0
	STR R2,[R1,#0] ; call count = 0
	STR R2,[R1,#4] ; exclusive time = 0
	STR R2,[R1,#8] ; inclusive time = 0
	STR R1,[R4] ; Hook the block into the index
	LDMFD R13!,{R0,R2-R3,PC}

jprof_init ; Initialise data, preserving R0
	; One-time set-up. Returns immediately if state is already non-zero.
	; Otherwise:
	;   * stores the value returned in R0 by SWI &4D400 in timeptr
	;   * marks the profiler as initialised
	;   * allocates and zeroes the index block (RO limit - RO base bytes)
	;   * allocates the 4096-byte func stack and marks it empty
	;   * registers jprof_default_save with atexit
	; Out: R0 preserved; R1-R3 are not preserved.
	LDR R1,state
	CMP R1,#0
	MOVNE PC,R14 ; Nothing to do - already initialised
	STMFD R13!,{R0,R14}
	SWI &4D400 ; DTime_GetPtr
	STR R0,timeptr
	MOV R1,#1
	STR R1,state ; Flag as initialised before doing the allocations
	; Index block: one byte of index per byte of read-only image
	LDR R1,local_ro_base
	LDR R0,local_ro_limit
	SUB R0,R0,R1 ; size = limit - base
	BL malloc
	STR R0,block
	; memset(block, 0, limit - base) - R0 still holds the block ptr
	LDR R2,local_ro_limit
	LDR R1,local_ro_base
	SUB R2,R2,R1
	MOV R1,#0
	BL memset
	; Func stack: base and top both point at the new allocation (empty)
	MOV R0,#4096
	BL malloc
	STR R0,stack
	STR R0,stackbase
	; Arrange for results to be written out when the program exits
	ADR R0,jprof_default_save
	BL atexit
	LDMFD R13!,{R0,PC}

__cyg_profile_func_exit ; R0=function ptr, R1=call site
	; Function-exit hook.
	; In:  R0 = address of the function being left
	;      R1 = call site (unused)
	; Out: R4-R8 preserved; R0-R3 corrupted
	;
	; Returns at once if the profiler is not initialised or the func
	; stack is empty. Otherwise:
	; * Get time
	; * Pop value off func stack (if address in range)
	; * Increase its exclusive time (if address in range)
	; * Increase its inclusive time (if address in range)
	; * Update ltime
	LDR R2,state ; Initialised
	CMP R2,#0
	LDRNE R2,stackbase ; With stack data
	LDRNE R3,stack
	CMPNE R2,R3
	MOVEQ PC,R14 ; Not initialised, or nothing on the stack
	STMFD R13!,{R4-R8,R14}
	MOV R4,R0 ; Preserve ptr
;	SWI XOS_ReadMonotonicTime ; Get time
	LDR R0,timeptr
	LDR R0,[R0]
	; Range check: base <= addr < limit, compared unsigned. This must
	; accept exactly the addresses that the entry handler pushes; an
	; address pushed on entry but rejected here would never be popped
	; and every later pop would hit the wrong entry.
	LDR R5,local_ro_base
	CMP R4,R5
	BLO _exit_oob ; Below base: out of range
	LDR R6,local_ro_limit
	CMP R4,R6
	BHS _exit_oob ; At or above limit: out of range
	ADR R5,stackbase
	LDMIA R5,{R5-R8} ; stack base, stack, block, ltime
	LDMDB R6!,{R1,R2} ; Pop: R1 = func data ptr, R2 = entry time
	STR R6,stack ; Update ptr to free a reg
	LDMIB R1,{R3,R5} ; Get exclusive & inclusive times
	SUB R6,R0,R8 ; Elapsed exclusive time (since last profiling call)
	ADD R3,R3,R6 ; Increase exclusive time
	SUB R6,R0,R2 ; Elapsed inclusive time (since function entry)
	ADD R5,R5,R6 ; Increase inclusive time
	STMIB R1,{R3,R5} ; Update times
_exit_oob
	STR R0,ltime ; Update ltime
	LDMFD R13!,{R4-R8,PC}

jprof_default_save ; atexit handler
	; Saves the profile under the default file name (savestr).
	ADR R0,savestr

	; ... fall through ...

jprof_save ; Save data under name in R0
	   ; doesn't take into account running functions though
	; In:  R0 = pointer to zero-terminated file name
	; Out: R4-R8 preserved
	;
	; Does nothing if the profiler is not initialised or the file
	; cannot be opened. Otherwise writes one line per function that has
	; a data block:
	;   <address as %08x> TAB <name> TAB <calls> TAB <exclusive> TAB <inclusive> NEWLINE
	;
	; Register use in the loop:
	;   R4 = FILE pointer          R5 = cursor into index block
	;   R6 = current code address  R7 = RO limit
	;   R8 = function data block ptr
	LDR R1,state
	CMP R1,#0
	MOVEQ PC,R14 ; Not initialised!
	STMFD R13!,{R4-R8,R14} ; Save work registers (must be a push, to pair with the pop at exit)
	ADR R1,accessstr
	BL fopen
	MOV R4,R0
	CMP R4,#0
	BEQ jprof_save_exit ; fopen failed - nothing we can write to
	LDR R5,block
	LDR R6,local_ro_base
	LDR R7,local_ro_limit
	; Loop!
jprof_save_loop
	LDR R8,[R5],#4 ; Index entry for address R6
	CMP R8,#0
	BEQ jprof_save_noentry
	; Else a function here
	MOV R0,R4
	ADR R1,format1 ; Hex
	MOV R2,R6 ; Adr
	BL fprintf
	; Search for fn name: if the top byte of the word just before the
	; function is &FF, the low 24 bits are taken as the distance back
	; from that word to the start of the name string. Otherwise an
	; empty string is printed.
	SUB R2,R6,#4
	LDR R0,[R2]
	MOV R1,R0,LSR #24
	CMP R1,#255
	ADRNE R2,nullstr
	BICEQ R0,R0,#&FF000000
	SUBEQ R2,R2,R0
	MOV R0,R4
	ADR R1,format2 ; Str
	BL fprintf
	; Now print out data
	MOV R0,R4
	ADR R1,format3 ; num
	LDR R2,[R8],#4 ; call count
	BL fprintf
	MOV R0,R4
	ADR R1,format3
	LDR R2,[R8],#4 ; exclusive count
	BL fprintf
	MOV R0,R4
	ADR R1,format3
	LDR R2,[R8] ; inclusive count
	BL fprintf
	MOV R0,R4
	ADR R1,format4 ; newline
	BL fprintf
jprof_save_noentry
	ADD R6,R6,#4
	CMP R6,R7 ; At end?
	BLO jprof_save_loop ; Addresses are unsigned
	MOV R0,R4
	BL fclose
jprof_save_exit
	LDMFD R13!,{R4-R8,PC}

	END
