Leaf Functions: Realization in Assembly

What is shown here is a simple and efficient way for realizing a function call convention. However, there is a serious catch to it. It only works for so called leaf functions, i.e. functions that do not call another functions. So in particular something like recursive function calls will not work with that.

Provided Material

This is the assembly program that was developed in the video:

The line with the halt instruction

1
halt  %RET_VAL

was changed to

1
halt  0

So that it has an exit code of 0.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
    .equ    CALLER0,    1
    .equ    CALLER1,    CALLER0 + 1
    .equ    CALLER2,    CALLER1 + 1
    .equ    CALLER3,    CALLER2 + 1
    .equ    CALLER_LAST,CALLER3

    .equ    CALLEE0,    CALLER_LAST + 1
    .equ    CALLEE1,    CALLEE0 + 1
    .equ    CALLEE2,    CALLEE1 + 1
    .equ    CALLEE3,    CALLEE2 + 1
    .equ    CALLEE_LAST,CALLEE3

    .equ    FUNC_ADDR,  CALLER0
    .equ    RET_ADDR,   CALLER1
    .equ    RET_VAL,    CALLER2

    .equ    PARAM0,     CALLEE0

//------------------------------------------------------------------------------
//  program:  main
    .data
msg0:
    .string "n = "
msg1:
    .string "n + 1 = "

    .equ    N,  CALLER3

    .text
#   puts("n = ");
    ldzwq   msg0,           %PARAM0
    ldzwq   puts,           %FUNC_ADDR
    call    %FUNC_ADDR,     %RET_ADDR

#   %RET_VAL = get_uint64();
    ldzwq   get_uint64,     %FUNC_ADDR
    call    %FUNC_ADDR,     %RET_ADDR

#   %N  = %RET_VAL + 1
    addq    1,  %RET_VAL,   %N

#   puts("n + 1 = ");
    ldzwq   msg1,           %PARAM0
    ldzwq   puts,           %FUNC_ADDR
    call    %FUNC_ADDR,     %RET_ADDR

#   print_uint64(%N)
    movq    %N,             %PARAM0
    ldzwq   print_uint64,   %FUNC_ADDR
    call    %FUNC_ADDR,     %RET_ADDR

    putc    '\n'

    halt    %0

//------------------------------------------------------------------------------
//  function: void puts(%PARAM0)
puts:

    .equ    addr,   PARAM0
    .equ    ch,     CALLEE1

    .text
.puts.fetch:
    movzbq  (%addr),%ch
    subq    0,      %ch,    %0
    je      .puts.ret
    putc    %ch
    addq    1,      %addr,  %addr
    jmp     .puts.fetch
.puts.ret:
    ret     %RET_ADDR

//------------------------------------------------------------------------------
//  function:   %RET_VAL get_uint64()
get_uint64:
    .equ    dest,   RET_VAL
    .equ    ch,     CALLEE0

    .text
    ldzwq   0,  %dest

.get_uint64.read:
    getc    %ch
#   if %ch < '0' then we are done
    subq    '0',    %ch,    %0
    jb      .get_uint64.ret
#   if %ch > '9' then we are done
    subq    '9',    %ch,    %0
    ja      .get_uint64.ret

    subq    '0',    %ch,    %ch
    imulq   10,     %dest,  %dest
    addq    %ch,    %dest,  %dest
    jmp     .get_uint64.read

.get_uint64.ret:
    ret     %RET_ADDR

//------------------------------------------------------------------------------
//  function:   void    print_uint64(%PARAM0)
print_uint64
    .data
    .equ    val,        PARAM0
    .equ    digit,      CALLEE1
    .equ    p,          CALLEE2

    .bss
.print_uint64.buf:
    .space  20

    .text
#   init p with .print_uint64.buf
    ldzwq   .print_uint64.buf,    %p

.print_uint64.get_digit:
    divq    10,     %val,   %val
    addq    '0',    %digit, %digit
    movb    %digit, (%p)
    addq    1,      %p,     %p
    subq    0,      %val,   %0
    jnz     .print_uint64.get_digit

.print_uint64.print_digit:
    subq    1,      %p,     %p
    movzbq  (%p),   %digit
    putc    %digit
    subq    .print_uint64.buf,    %p,       %0
    jnz     .print_uint64.print_digit

    ret     %RET_ADDR

And here the ULM Instruction Set and its isa.txt source code:

RRR     (OP u 8) (X u 8) (Y u 8) (Z u 8)
J26     (OP u 8) (XYZ j 24)
U16R    (OP u 8) (XY u 16) (Z u 8)


0x01    RRR
:   halt    %X
    ulm_halt(ulm_regVal(X));

0x02    RRR
:   getc    %X
    ulm_setReg(ulm_readChar() & 0xFF, X);

0x03    RRR
:   putc    %X
    ulm_printChar(ulm_regVal(X) & 0xFF);

0x04    J26
#   Unconditional jump.
:   jmp XYZ
    ulm_unconditionalRelJump(XYZ);

0x05    RRR
:   subq    X, %Y, %Z
    ulm_sub64(X, ulm_regVal(Y), Z);

0x06    J26
:   jnz XYZ
:   jne XYZ
    ulm_conditionalRelJump(ulm_statusReg[ULM_ZF] == 0, XYZ);

0x07    J26
:   jz  XYZ
:   je  XYZ
    ulm_conditionalRelJump(ulm_statusReg[ULM_ZF] == 1, XYZ);

0x08    U16R
:   ldzwq   XY, %Z
    ulm_setReg(XY, Z);

0x09    RRR
:   movzbq (%X), %Z
    ulm_fetch64(0, X, 0, 0, ULM_ZERO_EXT, 1, Z);

0x0A    RRR
:   addq    X, %Y, %Z
    ulm_add64(X, ulm_regVal(Y), Z);

0x0B    RRR
:   imulq   %X, %Y, %Z
    ulm_mul64(ulm_regVal(X), ulm_regVal(Y), Z);

0x0C    J26
:   ja      XYZ
    ulm_conditionalRelJump(ulm_statusReg[ULM_ZF] == 0
                        && ulm_statusReg[ULM_CF] == 0, XYZ);

0x0D    J26
:   jb      XYZ
    ulm_conditionalRelJump(ulm_statusReg[ULM_CF] == 1, XYZ);

0x0E    RRR
:   addq    %X, %Y, %Z
:   movq    %X, %Z
    ulm_add64(ulm_regVal(X), ulm_regVal(Y), Z);

0x0F    RRR
:   imulq   X, %Y, %Z
    ulm_mul64(X, ulm_regVal(Y), Z);

0x10    RRR
#   Uses the 128-bit unsigned integer divider. The 64-bits stored in %Y are
#   therefore zero extended with %0 to 128 bits and then used as numerator.
#   The immediate value X is used as unsigned denominator.
#
#   The result of the operation is the quotient and the remainder. These results
#   will be stored in a register pair: The quotient is stored in %Z and the
#   remainder in %{Z + 1}.
:   divq    X, %Y, %Z
    ulm_div128(X, ulm_regVal(Y), ulm_regVal(0), Z, 0, Z + 1);

0x11    RRR
#   The least significant byte in %X gets stored at address %Z.
:   movb    %X, (%Z)
    ulm_store64(0, Z, 0, 0, 1, X);

0x12    RRR
#   Fetches a quad word from address %X into register %Z
:   movq    (%X), %Z
    ulm_fetch64(0, X, 0, 0, ULM_ZERO_EXT, 8, Z);

0x13    RRR
:   putc    X
    ulm_printChar(X & 0xFF);

0x14    RRR
#   Function call. Return address gets stored in %Y, then jumps to address %X.
:   jmp     %X,     %Y
:   call    %X,     %Y
:   ret     %X
    ulm_absJump(ulm_regVal(X), Y);


# General meaning of the mnemonics

@addq
#   Integer addition.
@getc
#   Get character
@divq
#   64-bit unsigned integer division
@halt
#   Halt program
@imulq
#   64-bit unsigned and signed integer multiplication.
@ja
#   Jump if above (conditional jump).
@jb
#   Jump if below (conditional jump).
@je
#   Jump if equal (conditional jump).
@jmp
#   Jump.
@jne
#   Jump if not equal (conditional jump).
@jnz
#   Jump if not zero (conditional jump).
@jz
#   Jump if zero (conditional jump).
@ldzwq
#   Load zero extended word to quad word register.
@putc
#   Put character
@subq
#   Integer subtraction.
@movb
#   Move a byte from a register to memory (store instruction).
@movzbq
#   Move zero extended byte to quad word register (fetch instruction).
@movq
#   Move quad word

Function-Call Jump Instruction

Key for supporting functions calls on the ULM is the jmp instruction with Opcode 0x14. It has two register operands to which we refer as %FUNC_ADDR and %RET_ADDR respectively. When executed it does two things:

  • It stores the return address %RET_ADDR and then

  • it jumps to the address in %FUNC_ADDR.

So what does return address mean? It is the address of the instruction that follows the jump instruction in memory. For example, when the jump instruction is at address 0x20 then the return address is 0x24.

In general the control flow can not be visualized with a flow chart. But in simple cases we can use for examples colors to enhance the information in such a flow chart. For example if in a program a leaf-function is called twice the different colors used in the paths make clear to which return address the function returns:

Here another example where two different leaf-functions are called:

How the Function-Call Jump Instruction is Realized

Recall that the ULM implements this simple von Neumann cycle for executing a program. So when an instruction gets executed it was loaded from address %IP (instruction pointer) into the %IR (instruction register). So what the instruction does when it gets executed is

\[\left( u(\%\text{ip}) + 4 \right) \bmod 2^{64} \to u(\%\text{RET})\quad\text{and}\quad u(\%\text{CALL}) \to u(\%\text{ip})\]

Returning from a function is then possible with a jmp %RET_ADDR, %0 instruction. Of course this requires that %RET_ADDR by then still (or again) contains the original return address. So when you program you should know that you can not simply use register %RET+ADDR without care. So you see that we need some rules, a protocol or call it convention for function calls.

Descriptions of such conventions usually the terms caller and callee. As long as we only deal with leaf functions these terms have a simple meaning. The program text can be divided into a main program and functions. In the main program we only have code that eventually calls a function, i.e. caller code. In the functions we only have code that gets executed when a function was called, i.e. callee code. Now think of two persons (caller and callee) writing a program together. The caller is writing the main program, the callee all the functions. The function call conventions are a contract between the caller and callee and allows to collaborate almost independently from each other. Actually the contract specifies what information one has to provide to another.

Caller

The caller has to know the address of the function, how to pass arguments and eventually how to get a result back from the function. The caller also has to know what registers the function might modify.

In the simplest form the function does not expect any arguments and does not return a result. In this case the caller just loads the address of the function into %CALL and does the (function) jump:

Also shown in this picture is the entry point of the program. The entry point is the address of the instruction that gets executed first when execution of the program starts. Of course a correct program should also have at least one exit point, i.e. some halt instruction. As a programmer you have to make sure that at least on of these exit points gets reached.

Callee

Accordingly the callee needs to know how to receive arguments, how to return and eventually how to give back a result. And of course, what registers can be used and modified for computations.

Again we first consider that the function does not receive any arguments and does not return any result. Then we basically have this (we talk about what registers the function implementation is allowed to modify below):

How the ARM Architecture Supports Function Calls

The ARM architecture provides the “branch and link” instruction BL for calling a function:

1
    BL  function_label

This instruction stores the return address in a register called lr (for link register) and then jumps to the address referred to by the label function_label (Sound familiar?). Returning from the function is done on ARM with a move-instruction. You simple overwrite the instruction pointer (so what we call IP for instruction pointer on ARM it's called PC for program counter) with the link register.

1
2
3
4
    MOV     pc, lr            /* Return from subroutine.
                                 Note: The MOV copies the right-hand-side to the
                                 left-hand-side
                              */

Note that on the ULM you can note use %IP explicitly in an instruction, but jmp %RET, %0 is implicitly is equivalent to that.

Conventions for Caller and Callee

Registers either belong to the caller or the callee. However, we keep it simple here only a subset of registers actually have a owner. If the subset turns out as too small we still can use more registers.

Registers %CALLER0, ..., %CALLER3 belong to the caller, and registers %CALLEE0, ..., %CALLEE3 to the callee. Using .equ directives we make sure that these identifiers are mapped to distinct registers (and we also make it easy to extend each set):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
    .equ    CALLER0,    1
    .equ    CALLER1,    CALLER0 + 1
    .equ    CALLER2,    CALLER1 + 1
    .equ    CALLER3,    CALLER2 + 1
    .equ    CALLER_LAST,CALLER3

    .equ    CALLEE0,    CALLER_LAST + 1
    .equ    CALLEE1,    CALLEE0 + 1
    .equ    CALLEE2,    CALLEE1 + 1
    .equ    CALLEE3,    CALLEE2 + 1
    .equ    CALLEE_LAST,CALLEE3

Return Address

The callee needs to know where the caller stored the return address. For that we specify with

1
    .equ    RET_ADDR,       CALLER1

that it will be in %RET_ADDR.

Passing arguments

If the callee expects some parameters then the caller needs to know where the callee expects these parameters. We can easily extend this list

1
2
    .equ    PARAM0,         CALLEE0
    .equ    PARAM1,         PARAM0 + 1

to specify that a first parameter is always expected in %PARAM0, a second always in %PARAM1, etc.

Returning results

If a callee returns a value it needs to know where the caller wants to have the result. With

1
    .equ    RET_VAL,        CALLER2

we specify that the callee should store its result in %RET_VAL.