// cost: 6 loads + 6 offsets + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro load_fp6(offset)
    // stack:
    PUSH $offset
    %add_const(5)
    %mload_kernel_general
    // stack:                     x5
    PUSH $offset
    %add_const(4)
    %mload_kernel_general
    // stack:                 x4, x5
    PUSH $offset
    %add_const(3)
    %mload_kernel_general
    // stack:             x3, x4, x5
    PUSH $offset
    %add_const(2)
    %mload_kernel_general
    // stack:         x2, x3, x4, x5
    PUSH $offset
    %add_const(1)
    %mload_kernel_general
    // stack:     x1, x2, x3, x4, x5
    PUSH $offset
    %mload_kernel_general
    // stack: x0, x1, x2, x3, x4, x5
%endmacro

// cost: 40
%macro store_fp6(offset)
    // stack: x0, x1, x2, x3, x4, x5
    PUSH $offset
    %mstore_kernel_general
    // stack:     x1, x2, x3, x4, x5
    PUSH $offset
    %add_const(1)
    %mstore_kernel_general
    // stack:         x2, x3, x4, x5
    PUSH $offset
    %add_const(2)
    %mstore_kernel_general
    // stack:             x3, x4, x5
    PUSH $offset
    %add_const(3)
    %mstore_kernel_general
    // stack:                 x4, x5
    PUSH $offset
    %add_const(4)
    %mstore_kernel_general
    // stack:                     x5
    PUSH $offset
    %add_const(5)
    %mstore_kernel_general
    // stack:
%endmacro

// cost: 49
%macro store_fp6_sh(offset)
    // stack: x0, x1, x2, x3, x4, x5
    PUSH $offset
    %add_const(2)
    %mstore_kernel_general
    // stack:     x1, x2, x3, x4, x5
    PUSH $offset
    %add_const(3)
    %mstore_kernel_general
    // stack:         x2, x3, x4, x5
    PUSH $offset
    %add_const(4)
    %mstore_kernel_general
    // stack:             x3, x4, x5
    PUSH $offset
    %add_const(5)
    %mstore_kernel_general
    // stack:                 x4, x5
    %i9
    // stack:                 y5, y4
    PUSH $offset
    %add_const(1)
    %mstore_kernel_general
    // stack:                     y4
    PUSH $offset
    %mstore_kernel_general
    // stack:
%endmacro

// cost: 6
%macro dup1_fp6
    // stack:       F: 6
    DUP6
    DUP6
    DUP6
    DUP6
    DUP6
    DUP6
    // stack: F: 6, F: 6
%endmacro 

// cost: 6
%macro dup2_fp6
    // stack:       F: 6, G: 6
    DUP12
    DUP12
    DUP12
    DUP12
    DUP12
    DUP12
    // stack: G: 6, F: 6, G: 6
%endmacro

// cost: 16
%macro swap_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
    SWAP6
    // stack: g0, f1, f2, f3, f4, f5, f0, g1, g2, g3, g4, g5
    SWAP1
    SWAP7
    SWAP1
    // stack: g0, g1, f2, f3, f4, f5, f0, f1, g2, g3, g4, g5
    SWAP2
    SWAP8
    SWAP2
    // stack: g0, g1, g2, f3, f4, f5, f0, f1, f2, g3, g4, g5
    SWAP3
    SWAP9
    SWAP3    
    // stack: g0, g1, g2, g3, f4, f5, f0, f1, f2, f3, g4, g5
    SWAP4
    SWAP10
    SWAP4
    // stack: g0, g1, g2, g3, g4, f5, f0, f1, f2, f3, f4, g5
    SWAP5
    SWAP11
    SWAP5
    // stack: g0, g1, g2, g3, g4, g5, f0, f1, f2, f3, f4, f5
%endmacro

// cost: 16
%macro add_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
    SWAP7
    ADD
    SWAP6
    // stack: f0,     f2, f3, f4, f5, g0, h1, g2, g3, g4, g5 
    SWAP7
    ADD
    SWAP6
    // stack: f0,         f3, f4, f5, g0, h1, h2, g3, g4, g5 
    SWAP7
    ADD
    SWAP6
    // stack: f0,             f4, f5, g0, h1, h2, h3, g4, g5
    SWAP7
    ADD
    SWAP6
    // stack: f0,                 f5, g0, h1, h2, h3, h4, g5
    SWAP7
    ADD
    SWAP6
    // stack: f0,                     g0, h1, h2, h3, h4, h5
    ADD
    // stack:                         h0, h1, h2, h3, h4, h5
%endmacro

// *backwards order subtraction* cost: 16
%macro bus_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
    SWAP7
    SUB
    SWAP6
    // stack: f0,     f2, f3, f4, f5, g0, h1, g2, g3, g4, g5 
    SWAP7
    SUB
    SWAP6
    // stack: f0,         f3, f4, f5, g0, h1, h2, g3, g4, g5 
    SWAP7
    SUB
    SWAP6
    // stack: f0,             f4, f5, g0, h1, h2, h3, g4, g5
    SWAP7
    SUB
    SWAP6
    // stack: f0,                 f5, g0, h1, h2, h3, h4, g5
    SWAP7
    SUB
    SWAP6
    // stack: f0,                     g0, h1, h2, h3, h4, h5
    SUB
    // stack:                         h0, h1, h2, h3, h4, h5
%endmacro

// cost: 9; note this returns y, x for x + yi
%macro i9
    // stack:          a , b
    DUP2
    DUP2
    // stack:  a , b,  a , b
    %mul_const(9)
    SUB
    // stack: 9a - b,  a , b
    SWAP2 
    // stack:  b , a, 9a - b
    %mul_const(9)
    ADD
    // stack: 9b + a, 9a - b 
%endmacro