# import time
# from rp2 import PIO, asm_pio
# from machine import Pin
import array
import time
# test assembler
# Parameter block handed to the inline-assembler routines.  The routines
# receive its address in r0 and read/write it at 4-byte offsets:
#   a[0] (offset 0)  - not referenced by the routines visible here
#   a[1] (offset 4)  - multiplier read by asm_mult_loop
#   a[2] (offset 8)  - product written by asm_mult_loop
#   a[3] (offset 12) - number of iterations for the assembler loops
asm_mem=array.array('i',[ 1, 35, 3, 1000000])

###################################
### first, find out how fast a function call is
@micropython.asm_thumb
# Empty inline-assembler routine: it executes no instructions of its own,
# so timing a call to it measures the cost of the call itself.
# Passing an array name to the assembler actually passes in the
# address (it arrives in r0); nothing is done with it here.
def null_fn(r0):
      pass
    
# now use the assembler routine
#@micropython.native
def timing_null():
    """Time a single call of the empty assembler routine null_fn.

    Returns the elapsed time in microseconds, with the cost of the two
    back-to-back time.ticks_us() reads subtracted.
    """
    # Tick values wrap around, so differences must be taken with
    # time.ticks_diff() rather than plain subtraction.
    t0 = time.ticks_us()
    t1 = time.ticks_us()
    ovhd = time.ticks_diff(t1, t0)  # timing overhead
    t0 = time.ticks_us()
    null_fn(asm_mem)
    t1 = time.ticks_us()
    return time.ticks_diff(t1, t0) - ovhd
#
# call three times and report time
# Take three independent measurements (evaluated left to right) and
# print them side by side.
t_single_call1, t_single_call2, t_single_call3 = (
    timing_null(),
    timing_null(),
    timing_null(),
)
print('single fn =', t_single_call1, t_single_call2, t_single_call3)

#########################################
### now time multiple calls in a loop
# now put the function call in a loop and time it
# Plain bytecode version of the call loop: calls null_fn 10000 times,
# passing the given array through unchanged on every call.
# The loop is itself the thing being timed, so its shape is kept as is.
def loop_fun(asm_mem):
   c = 0  # number of calls made so far
   while c<10000:
      null_fn(asm_mem)
      c += 1
      
# Same 10000-call loop as loop_fun, but compiled with the native emitter
# so the loop overhead of the two can be compared.
@micropython.native      
def loop_fun_native(asm_mem):
   c = 0  # number of calls made so far
   while c<10000:
      null_fn(asm_mem)
      c += 1
      
# Same 10000-call loop as loop_fun, but compiled with the viper emitter
# so the loop overhead of the three versions can be compared.
@micropython.viper      
def loop_fun_viper(asm_mem):
   c = 0  # number of calls made so far
   while c<10000:
      null_fn(asm_mem)
      c += 1   

print('========')
# Get the timing overhead: the cost of two back-to-back tick reads.
# Tick counters wrap around, so every difference below is taken with
# time.ticks_diff() instead of plain subtraction.
t0 = time.ticks_us()
t1 = time.ticks_us()
ovhd = time.ticks_diff(t1, t0)
print('timing overhead=',ovhd, 'uSec')
# Time each looped version of the null_fn call.  Every loop makes 10000
# calls, so dividing by 10000 gives the time per iteration.  The constant
# subtracted afterwards is the per-iteration loop overhead
# (5.2 uSec for the bytecode loop; 1.75 and 0.12 are presumably the
# corresponding figures for the native and viper loops).
t0 = time.ticks_us()
loop_fun(asm_mem)
t1 = time.ticks_us()
print ('multiple fun_loop_time=',(time.ticks_diff(t1, t0)-ovhd)/10000-5.2)

t0 = time.ticks_us()
loop_fun_native(asm_mem)
t1 = time.ticks_us()
print ('multiple fun_native_time=',(time.ticks_diff(t1, t0)-ovhd)/10000-1.75)

t0 = time.ticks_us()
loop_fun_viper(asm_mem)
t1 = time.ticks_us()
print ('multiple fun_viper_time=',(time.ticks_diff(t1, t0)-ovhd)/10000-0.12)
#========================================
# now recode to run a loop of
# retrieve-multiply-store
# assembler is 1 cycle/instruction
# except for load, store and branch which are two
@micropython.asm_thumb
# Retrieve-multiply-store loop, run a[3] times.
# r0 holds the address of the parameter array:
#   a[1] (offset 4)  multiplier
#   a[2] (offset 8)  receives counter*a[1] on every pass
#   a[3] (offset 12) iteration count
# The last product is returned in r0.
# The loop test is at the bottom, so the body always runs at least once.
def asm_mult_loop(r0):
    ldr(r7,[r0,12]) # get a[3], the iteration count
    mov(r6, 0) # initialize counter
    ldr(r2,[r0,4]) # get a[1] to mult by
    ### compute loop
    label(loop_pt)
    mov(r1, r6) # mult by counter
    mul(r1, r2) # counter*a[1] into r1
    str(r1, [r0,8]) # product -> a[2]
    add(r6, 1) # increment the counter
    cmp(r6, r7) # reached the limit?
    # blt rather than ble: the counter starts at 0, so looping while
    # counter <= a[3] would make a[3]+1 passes instead of a[3]
    blt(loop_pt) # back to loop_pt while counter < a[3]
    mov(r0, r1) # r0 is the return value
    
 #  use the assembler loop routine
def timing_mult_loop(ovhd):
    """Time the assembler multiply loop and print the time per iteration.

    ovhd -- timing overhead in microseconds (the cost of the tick reads),
            subtracted from the measured time before dividing by the
            iteration count held in asm_mem[3].
    """
    t0 = time.ticks_us()
    asm_mult_loop(asm_mem)
    t1 = time.ticks_us()
    # tick values wrap around, so use ticks_diff rather than t1-t0
    elapsed = time.ticks_diff(t1, t0)
    count = asm_mem[3]
    # subtract the timing overhead; the cost of the single function call
    # is not removed, it is spread over all the iterations
    print('========')
    print('asm_multiply_loop_time=',(elapsed-ovhd)/count,
          'count=', count)
    # at 130 MHz clock rate, and
    # a loop count of 8 cycles, one
    # pass through the loop should take
    # 61.5 nSec. Actual time is 61.3 to 64.
# 
# run the multiply-loop timing, passing in the module-level timing
# overhead (ovhd) so it can be subtracted from the measurement
timing_mult_loop(ovhd)
#
#########################################
# timing simple loop

# Simple loop benchmark, plain bytecode version: adds 2 to a local
# 100000 times.  The result is discarded; only the run time matters.
def speed_bytecode():
    x = 0
    for i in range(100000):
       x += 2

# Simple loop benchmark, native-emitter version: same body and the same
# 100000 iterations as speed_bytecode.
@micropython.native
def speed_native():
    x = 0
    for i in range(100000):
       x += 2

# Simple loop benchmark, viper-emitter version: same body as
# speed_bytecode, but note that it runs 1000000 iterations, ten times
# as many as the bytecode and native versions.
@micropython.viper
def speed_viper():
    x = 0
    for i in range(1000000):
       x += 2      

@micropython.asm_thumb
# Simple loop benchmark, inline-assembler version: adds 2 to x on each
# of a[3] passes and returns x in r0.
# r0 holds the address of the parameter array; only a[3] (offset 12,
# the iteration count) is read.
# The loop test is at the bottom, so the body always runs at least once.
def speed_asm(r0):
    ldr(r7,[r0,12]) # get a[3], the iteration count
    mov(r6, 0) # initialize counter
    mov(r5, 0) # initialize variable x
    ### compute loop
    label(loop_pt)
    add(r5, 2) # x = x + 2
    add(r6, 1) # increment the counter
    cmp(r6, r7) # reached the limit?
    # blt rather than ble: the counter starts at 0, so looping while
    # counter <= a[3] would make a[3]+1 passes instead of a[3]
    blt(loop_pt) # back to loop_pt while counter < a[3]
    mov(r0, r5) # r0 is the return value
    
# Get the timing overhead: the cost of two back-to-back tick reads.
# Tick counters wrap around, so every difference below is taken with
# time.ticks_diff() instead of plain subtraction.
t0 = time.ticks_us()
t1 = time.ticks_us()
ovhd = time.ticks_diff(t1, t0)
print('========')
print('timing overhead=',ovhd, 'uSec')
# time each version of the simple "x += 2" loop and report the time
# per iteration; each divisor is that version's own iteration count
t0 = time.ticks_us()
speed_bytecode()
t1 = time.ticks_us()
print ('bytecode_loop_time=',(time.ticks_diff(t1, t0)-ovhd)/100000, 'usec')

t0 = time.ticks_us()
speed_native()
t1 = time.ticks_us()
print ('native_loop_time=',(time.ticks_diff(t1, t0)-ovhd)/100000)

t0 = time.ticks_us()
speed_viper()
t1 = time.ticks_us()
print ('viper_loop_time=',(time.ticks_diff(t1, t0)-ovhd)/1000000)

t0 = time.ticks_us()
speed_asm(asm_mem)
t1 = time.ticks_us()
# the assembler loop takes its iteration count from asm_mem[3], so
# divide by that value rather than a separately hard-coded number
print ('asm_loop_time=',(time.ticks_diff(t1, t0)-ovhd)/asm_mem[3])