#!/usr/bin/env python

import sys

def baseline():
    """Print the reference loop body: four back-to-back adds on r1.

    The sequence is intended to cost exactly 4 cycles (one per
    instruction), giving a known-good body to compare other
    sequences against.
    """
    add_insn = '\tadd r1, r1, #1'
    for _ in range(4):
        print(add_insn)

def ldr_stall():
    """Print a load immediately followed by an instruction that uses it.

    Only two instructions are emitted, but the sequence is intended to
    cost exactly 4 cycles: the author's annotation places two stall
    cycles between the load and the dependent add.
    """
    sequence = (
        '\tldr r1, [sp]',
        # (stall cycle expected here)
        # (stall cycle expected here)
        '\tadd r1, r1, r1',
    )
    for insn in sequence:
        print(insn)

# Number of copies of the 4-cycle test body placed inside the timed loop.
# 250 copies x 4 cycles = 1000 cycles per loop iteration; with the loop
# counter set to the clock frequency in kHz, the whole run is about
# FREQ_MHZ * 10^6 cycles, i.e. roughly one second at the stated clock
# (ignoring the subs/bne overhead) if the body really takes 4 cycles.
UNROLL_COUNT = 250

if len(sys.argv) != 2:
    # Diagnostics go to stderr: stdout carries the generated assembly and
    # is typically redirected into a .s file.
    sys.stderr.write('usage: arm_insn_cycles.py FREQ_MHZ\n')
    sys.exit(1)

try:
    cpu_freq_mhz = int(sys.argv[1])
except ValueError:
    sys.stderr.write(
        'error: FREQ_MHZ must be an integer, got {!r}\n'.format(sys.argv[1]))
    sys.exit(1)

if cpu_freq_mhz <= 0:
    # A zero or negative count would make the subs/bne countdown below
    # wrap around instead of terminating promptly.
    sys.stderr.write('error: FREQ_MHZ must be positive\n')
    sys.exit(1)

# Prologue: place a global function symbol `main` in the text section.
print('''
\t.text
\t.align 2
\t.global main
\t.type main, %function
main:''')

# Register usage in the generated code:
#   r1 is modified by the test body inside the loop.
#   r2 is the loop counter.
#   r3 is scratch.

# r2 must end up holding the CPU frequency in kilohertz: load the
# frequency in megahertz, then multiply by 1000.
# NOTE(review): `mov` with an immediate may not assemble for every value
# (classic ARM encodings only accept an 8-bit value rotated by an even
# amount) -- confirm the frequencies of interest are encodable.
print('\tmov r2, #{}'.format(cpu_freq_mhz))

print('''
\tmov r3, #1000
\tmul r2, r3, r2''')

# r1 is deliberately left uninitialized: its value does not matter.

print('.Lloop_start:')
for _ in range(UNROLL_COUNT):
    # Each emitted body should take exactly 4 cycles.
    # Call baseline() here instead to emit the reference sequence.
    ldr_stall()

# Count r2 down to zero, then return 0 from main.
print('''
\tsubs r2, r2, #1
\tbne .Lloop_start

\tmov r0, #0
\tmov pc, lr''')
