plonky2/evm/src/cpu/kernel/asm/fields/field_macros.asm
2022-11-14 17:39:33 -08:00

680 lines
18 KiB
NASM

%macro offset_fp6
%add_const(6)
%endmacro
// cost: 2 loads + 6 dup/swaps + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro load_fp2
// stack: ptr
DUP1 %add_const(1)
// stack: ind1, ptr
%mload_kernel_general
// stack: x1, ptr
SWAP1
// stack: ind0, x1
%mload_kernel_general
// stack: x0, x1
%endmacro
// cost: 6 loads + 6 dup/swaps + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro load_fp6
// stack: ptr
DUP1 %add_const(4)
// stack: ind4, ptr
%mload_kernel_general
// stack: x4, ptr
DUP2 %add_const(3)
// stack: ind3, x4, ptr
%mload_kernel_general
// stack: x3, x4, ptr
DUP3 %add_const(2)
// stack: ind2, x3, x4, ptr
%mload_kernel_general
// stack: x2, x3, x4, ptr
DUP4 %add_const(1)
// stack: ind1, x2, x3, x4, ptr
%mload_kernel_general
// stack: x1, x2, x3, x4, ptr
DUP5 %add_const(5)
// stack: ind5, x1, x2, x3, x4, ptr
%mload_kernel_general
// stack: x5, x1, x2, x3, x4, ptr
SWAP5
// stack: ind0, x1, x2, x3, x4, x5
%mload_kernel_general
// stack: x0, x1, x2, x3, x4, x5
%endmacro
// cost: 6 loads + 6 pushes + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro load_fp6(ptr)
// stack:
PUSH $ptr %add_const(5)
// stack: ind5
%mload_kernel_general
// stack: x5
PUSH $ptr %add_const(4)
// stack: ind4, x5
%mload_kernel_general
// stack: x4, x5
PUSH $ptr %add_const(3)
// stack: ind3, x4, x5
%mload_kernel_general
// stack: x3, x4, x5
PUSH $ptr %add_const(2)
// stack: ind2, x3, x4, x5
%mload_kernel_general
// stack: x2, x3, x4, x5
PUSH $ptr %add_const(1)
// stack: ind1, x2, x3, x4, x5
%mload_kernel_general
// stack: x1, x2, x3, x4, x5
PUSH $ptr
// stack: ind0, x1, x2, x3, x4, x5
%mload_kernel_general
// stack: x0, x1, x2, x3, x4, x5
%endmacro
// cost: 6 stores + 6 swaps/dups + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro store_fp6
// stack: ptr, x0, x1, x2, x3, x4 , x5
SWAP5
// stack: x4, x0, x1, x2, x3, ptr, x5
DUP6 %add_const(4)
// stack: ind4, x4, x0, x1, x2, x3, ptr, x5
%mstore_kernel_general
// stack: x0, x1, x2, x3, ptr, x5
DUP5
// stack: ind0, x0, x1, x2, x3, ptr, x5
%mstore_kernel_general
// stack: x1, x2, x3, ptr, x5
DUP4 %add_const(1)
// stack: ind1, x1, x2, x3, ptr, x5
%mstore_kernel_general
// stack: x2, x3, ptr, x5
DUP3 %add_const(2)
// stack: ind2, x2, x3, ptr, x5
%mstore_kernel_general
// stack: x3, ptr, x5
DUP2 %add_const(3)
// stack: ind3, x3, ptr, x5
%mstore_kernel_general
// stack: ptr, x5
%add_const(5)
// stack: ind5, x5
%mstore_kernel_general
// stack:
%endmacro
// cost: 6 stores + 7 swaps/dups + 5 adds + 6 doubles = 6*4 + 7*1 + 5*2 + 6*2 = 53
%macro store_fp6_double
// stack: ptr, x0, x1, x2, x3, x4, x5
SWAP6
// stack: x5, x0, x1, x2, x3, x4, ptr
PUSH 2 MULFP254
// stack: 2*x5, x0, x1, x2, x3, x4, ptr
DUP7 %add_const(5)
// stack: ind5, 2*x5, x0, x1, x2, x3, x4, ptr
%mstore_kernel_general
// stack: x0, x1, x2, x3, x4, ptr
PUSH 2 MULFP254
// stack: 2*x0, x1, x2, x3, x4, ptr
DUP6
// stack: ind0, 2*x0, x1, x2, x3, x4, ptr
%mstore_kernel_general
// stack: x1, x2, x3, x4, ptr
PUSH 2 MULFP254
// stack: 2*x1, x2, x3, x4, ptr
DUP5 %add_const(1)
// stack: ind1, 2*x1, x2, x3, x4, ptr
%mstore_kernel_general
// stack: x2, x3, x4, ptr
PUSH 2 MULFP254
// stack: 2*x2, x3, x4, ptr
DUP4 %add_const(2)
// stack: ind2, 2*x2, x3, x4, ptr
%mstore_kernel_general
// stack: x3, x4, ptr
PUSH 2 MULFP254
// stack: 2*x3, x4, ptr
DUP3 %add_const(3)
// stack: ind3, 2*x3, x4, ptr
%mstore_kernel_general
// stack: x4, ptr
PUSH 2 MULFP254
// stack: 2*x4, ptr
SWAP1
// stack: ptr, 2*x4
%add_const(4)
// stack: ind4, 2*x4
%mstore_kernel_general
// stack:
%endmacro
// cost: 6 stores + 6 pushes + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro store_fp6(ptr)
// stack: x0, x1, x2, x3, x4, x5
PUSH $ptr
// stack: ind0, x0, x1, x2, x3, x4, x5
%mstore_kernel_general
// stack: x1, x2, x3, x4, x5
PUSH $ptr %add_const(1)
// stack: ind1, x1, x2, x3, x4, x5
%mstore_kernel_general
// stack: x2, x3, x4, x5
PUSH $ptr %add_const(2)
// stack: ind2, x2, x3, x4, x5
%mstore_kernel_general
// stack: x3, x4, x5
PUSH $ptr %add_const(3)
// stack: ind3, x3, x4, x5
%mstore_kernel_general
// stack: x4, x5
PUSH $ptr %add_const(4)
// stack: ind4, x4, x5
%mstore_kernel_general
// stack: x5
PUSH $ptr %add_const(5)
// stack: ind5, x5
%mstore_kernel_general
// stack:
%endmacro
// cost: store (40) + i9 (9) = 49
%macro store_fp6_sh(ptr)
// stack: x0, x1, x2, x3, x4, x5
PUSH $ptr %add_const(2)
// stack: ind2, x0, x1, x2, x3, x4, x5
%mstore_kernel_general
// stack: x1, x2, x3, x4, x5
PUSH $ptr %add_const(3)
// stack: ind3, x1, x2, x3, x4, x5
%mstore_kernel_general
// stack: x2, x3, x4, x5
PUSH $ptr %add_const(4)
// stack: ind4, x2, x3, x4, x5
%mstore_kernel_general
// stack: x3, x4, x5
PUSH $ptr %add_const(5)
// stack: ind5, x3, x4, x5
%mstore_kernel_general
// stack: x4, x5
%i9
// stack: y5, y4
PUSH $ptr %add_const(1)
// stack: ind1, y5, y4
%mstore_kernel_general
// stack: y4
PUSH $ptr
// stack: ind0, y4
%mstore_kernel_general
// stack:
%endmacro
// cost: 6
%macro dup_fp6_0
// stack: f: 6
DUP6
DUP6
DUP6
DUP6
DUP6
DUP6
// stack: f: 6, f: 6
%endmacro
// cost: 6
%macro dup_fp6_2
// stack: X: 2, f: 6
DUP8
DUP8
DUP8
DUP8
DUP8
DUP8
// stack: f: 6, X: 2, f: 6
%endmacro
// cost: 6
%macro dup_fp6_6
// stack: X: 6, f: 6
DUP12
DUP12
DUP12
DUP12
DUP12
DUP12
// stack: f: 6, X: 6, f: 6
%endmacro
// cost: 6
%macro dup_fp6_7
// stack: X: 7, f: 6
DUP13
DUP13
DUP13
DUP13
DUP13
DUP13
// stack: f: 6, X: 7, f: 6
%endmacro
// cost: 6
%macro dup_fp6_8
// stack: X: 8, f: 6
DUP14
DUP14
DUP14
DUP14
DUP14
DUP14
// stack: f: 6, X: 8, f: 6
%endmacro
// cost: 16
%macro swap_fp6
// stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
SWAP6
// stack: g0, f1, f2, f3, f4, f5, f0, g1, g2, g3, g4, g5
SWAP1
SWAP7
SWAP1
// stack: g0, g1, f2, f3, f4, f5, f0, f1, g2, g3, g4, g5
SWAP2
SWAP8
SWAP2
// stack: g0, g1, g2, f3, f4, f5, f0, f1, f2, g3, g4, g5
SWAP3
SWAP9
SWAP3
// stack: g0, g1, g2, g3, f4, f5, f0, f1, f2, f3, g4, g5
SWAP4
SWAP10
SWAP4
// stack: g0, g1, g2, g3, g4, f5, f0, f1, f2, f3, f4, g5
SWAP5
SWAP11
SWAP5
// stack: g0, g1, g2, g3, g4, g5, f0, f1, f2, f3, f4, f5
%endmacro
// cost: 16
// swap two fp6 elements with a stack term separating them
// (f: 6, X, g: 6) -> (g: 6, X, f: 6)
%macro swap_fp6_hole
// stack: f0, f1, f2, f3, f4, f5, X, g0, g1, g2, g3, g4, g5
SWAP7
// stack: g0, f1, f2, f3, f4, f5, X, f0, g1, g2, g3, g4, g5
SWAP1
SWAP8
SWAP1
// stack: g0, g1, f2, f3, f4, f5, X, f0, f1, g2, g3, g4, g5
SWAP2
SWAP9
SWAP2
// stack: g0, g1, g2, f3, f4, f5, X, f0, f1, f2, g3, g4, g5
SWAP3
SWAP10
SWAP3
// stack: g0, g1, g2, g3, f4, f5, X, f0, f1, f2, f3, g4, g5
SWAP4
SWAP11
SWAP4
// stack: g0, g1, g2, g3, g4, f5, X, f0, f1, f2, f3, f4, g5
SWAP5
SWAP12
SWAP5
// stack: g0, g1, g2, g3, g4, g5, X, f0, f1, f2, f3, f4, f5
%endmacro
// cost: 16
// swap two fp6 elements with two stack terms separating them
// (f: 6, X: 2, g: 6) -> (g: 6, X: 2, f: 6)
%macro swap_fp6_hole_2
// stack: f0, f1, f2, f3, f4, f5, X, g0, g1, g2, g3, g4, g5
SWAP8
// stack: g0, f1, f2, f3, f4, f5, X, f0, g1, g2, g3, g4, g5
SWAP1
SWAP9
SWAP1
// stack: g0, g1, f2, f3, f4, f5, X, f0, f1, g2, g3, g4, g5
SWAP2
SWAP10
SWAP2
// stack: g0, g1, g2, f3, f4, f5, X, f0, f1, f2, g3, g4, g5
SWAP3
SWAP11
SWAP3
// stack: g0, g1, g2, g3, f4, f5, X, f0, f1, f2, f3, g4, g5
SWAP4
SWAP12
SWAP4
// stack: g0, g1, g2, g3, g4, f5, X, f0, f1, f2, f3, f4, g5
SWAP5
SWAP13
SWAP5
// stack: g0, g1, g2, g3, g4, g5, X, f0, f1, f2, f3, f4, f5
%endmacro
// cost: 16
%macro add_fp6
// stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
SWAP7
ADDFP254
SWAP6
// stack: f0, f2, f3, f4, f5, g0, h1, g2, g3, g4, g5
SWAP7
ADDFP254
SWAP6
// stack: f0, f3, f4, f5, g0, h1, h2, g3, g4, g5
SWAP7
ADDFP254
SWAP6
// stack: f0, f4, f5, g0, h1, h2, h3, g4, g5
SWAP7
ADDFP254
SWAP6
// stack: f0, f5, g0, h1, h2, h3, h4, g5
SWAP7
ADDFP254
SWAP6
// stack: f0, g0, h1, h2, h3, h4, h5
ADDFP254
// stack: h0, h1, h2, h3, h4, h5
%endmacro
// cost: 18
// add two fp6 elements with a to-be-popped stack term separating them
// (f: 6, X, g: 6) -> (f + g: 6)
%macro add_fp6_hole
// stack: f0, f1, f2, f3, f4, f5, X, g0, g1, g2, g3, g4, g5
SWAP8
ADDFP254
SWAP7
// stack: f0, f2, f3, f4, f5, X, g0, h1, g2, g3, g4, g5
SWAP8
ADDFP254
SWAP7
// stack: f0, f3, f4, f5, X, g0, h1, h2, g3, g4, g5
SWAP8
ADDFP254
SWAP7
// stack: f0, f4, f5, X, g0, h1, h2, h3, g4, g5
SWAP8
ADDFP254
SWAP7
// stack: f0, f5, X, g0, h1, h2, h3, h4, g5
SWAP8
ADDFP254
SWAP7
// stack: f0, X, g0, h1, h2, h3, h4, h5
SWAP1
POP
ADDFP254
// stack: h0, h1, h2, h3, h4, h5
%endmacro
// *reversed argument subtraction* cost: 17
%macro subr_fp6
// stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5
SWAP7
SUBFP254
SWAP6
// stack: f0, f2, f3, f4, f5, g0, h1, g2, g3, g4, g5
SWAP7
SUBFP254
SWAP6
// stack: f0, f3, f4, f5, g0, h1, h2, g3, g4, g5
SWAP7
SUBFP254
SWAP6
// stack: f0, f4, f5, g0, h1, h2, h3, g4, g5
SWAP7
SUBFP254
SWAP6
// stack: f0, f5, g0, h1, h2, h3, h4, g5
SWAP7
SUBFP254
SWAP6
// stack: f0, g0, h1, h2, h3, h4, h5
SWAP1
SUBFP254
// stack: h0, h1, h2, h3, h4, h5
%endmacro
// cost: 21
%macro mul_fp_fp6
// stack: c , f0, f1, f2, f3, f4, f5
SWAP6
DUP7
MULFP254
SWAP6
// stack: c , f0, f1, f2, f3, f4, c * f5
SWAP5
DUP6
MULFP254
SWAP5
// stack: c , f0, f1, f2, f3, c * f4, c * f5
SWAP4
DUP5
MULFP254
SWAP4
// stack: c , f0, f1, f2, c * f3, c * f4, c * f5
SWAP3
DUP4
MULFP254
SWAP3
// stack: c , f0, f1, c * f2, c * f3, c *f 4, c * f5
SWAP2
DUP3
MULFP254
SWAP2
// stack: c , f0, c * f1, c * f2, c * f3, c * f4, c * f5
MULFP254
// stack: c * f0, c * f1, c * f2, c * f3, c * f4, c * f5
%endmacro
/// cost: 1 i9 (9) + 16 dups + 15 swaps + 12 muls + 6 adds/subs = 58
///
/// G0 + G1t + G2t^2 = (a+bi)t * (F0 + F1t + F2t^2)
/// = (c+di)F2 + (a+bi)F0t + (a+bi)F1t^2
/// where c+di = (a+bi)(9+i) = (9a-b) + (a+9b)i
///
/// G0 = (c+di)(f0+f0_i) = (cf2 - df2_) + (df2 + cf2_)i
/// G1 = (a+bi)(f0+f0_i) = (af0 - bf0_) + (bf0 + af0_)i
/// G2 = (a+bi)(f1+f1_i) = (af1 - bf1_) + (bf1 + af1_)i
%macro mul_fp2_fp6_sh
// stack: a, b, f0, f0_, f1, f1_, f2, f2_
DUP6
DUP3
MULFP254
// stack: bf1_, a, b, f0, f0_, f1, f1_, f2, f2_
DUP6
DUP3
MULFP254
// stack: af1 , bf1_, a, b, f0, f0_, f1, f1_, f2, f2_
SUBFP254
// stack: g2, a, b, f0, f0_, f1, f1_, f2, f2_
SWAP7
// stack: f2, a, b, f0, f0_, f1, f1_, g2, f2_
SWAP5
// stack: f1, a, b, f0, f0_, f2, f1_, g2, f2_
DUP3
MULFP254
// stack: bf1, a, b, f0, f0_, f2, f1_, g2, f2_
SWAP1
SWAP6
// stack: f1_, bf1, b, f0, f0_, f2, a, g2, f2_
DUP7
MULFP254
// stack: af1_, bf1, b, f0, f0_, f2, a, g2, f2_
ADDFP254
// stack: g2_, b, f0, f0_, f2, a, g2, f2_
SWAP7
// stack: f2_, b, f0, f0_, f2, a, g2, g2_
DUP4
DUP3
MULFP254
// stack: bf0_, f2_, b, f0, f0_, f2, a, g2, g2_
DUP4
DUP8
MULFP254
// stack: af0, bf0_, f2_, b, f0, f0_, f2, a, g2, g2_
SUBFP254
// stack: g1, f2_, b, f0, f0_, f2, a, g2, g2_
SWAP5
// stack: f2, f2_, b, f0, f0_, g1, a, g2, g2_
SWAP3
// stack: f0, f2_, b, f2, f0_, g1, a, g2, g2_
DUP3
MULFP254
// stack: bf0, f2_, b, f2, f0_, g1, a, g2, g2_
SWAP1
SWAP4
// stack: f0_, bf0, b, f2, f2_, g1, a, g2, g2_
DUP7
MULFP254
// stack: af0_, bf0, b, f2, f2_, g1, a, g2, g2_
ADDFP254
// stack: g1_, b, f2, f2_, g1, a, g2, g2_
SWAP5
// stack: a, b, f2, f2_, g1, g1_, g2, g2_
%i9
// stack: d, c, f2, f2_, g1, g1_, g2, g2_
DUP4
DUP2
MULFP254
// stack: df2_, d, c, f2, f2_, g1, g1_, g2, g2_
DUP4
DUP4
MULFP254
// stack: cf2, df2_, d, c, f2, f2_, g1, g1_, g2, g2_
SUBFP254
// stack: g0, d, c, f2, f2_, g1, g1_, g2, g2_
SWAP3
// stack: f2, d, c, g0, f2_, g1, g1_, g2, g2_
MULFP254
// stack: df2, c, g0, f2_, g1, g1_, g2, g2_
SWAP3
MULFP254
// stack: cf2_, g0, df2, g1, g1_, g2, g2_
SWAP1
SWAP2
// stack: df2, cf2_, g0, g1, g1_, g2, g2_
ADDFP254
// stack: g0_, g0, g1, g1_, g2, g2_
SWAP1
// stack: g0, g0_, g1, g1_, g2, g2_
%endmacro
/// cost: 1 i9 (9) + 16 dups + 17 swaps + 12 muls + 6 adds/subs = 60
///
/// G0 + G1t + G2t^2 = (a+bi)t^2 * (F0 + F1t + F2t^2)
/// = (c+di)F1 + (c+di)F2t + (a+bi)F0t^2
/// where c+di = (a+bi)(9+i) = (9a-b) + (a+9b)i
///
/// G0 = (c+di)(f0+f0_i) = (cf1 - df1_) + (df1 + cf1_)i
/// G1 = (a+bi)(f0+f0_i) = (cf2 - df2_) + (df2 + cf2_)i
/// G2 = (a+bi)(f1+f1_i) = (af0 - bf0_) + (bf0 + af0_)i
%macro mul_fp2_fp6_sh2
// stack: a, b, f0, f0_, f1, f1_, f2, f2_
DUP4
DUP3
MULFP254
// stack: bf0_, a, b, f0, f0_, f1, f1_, f2, f2_
DUP4
DUP3
MULFP254
// stack: af0, bf0_, a, b, f0, f0_, f1, f1_, f2, f2_
SUBFP254
// stack: g2, a, b, f0, f0_, f1, f1_, f2, f2_
SWAP7
SWAP3
// stack: f0, a, b, f2, f0_, f1, f1_, g2, f2_
DUP3
MULFP254
// stack: bf0, a, b, f2, f0_, f1, f1_, g2, f2_
SWAP1
SWAP4
// stack: f0_, bf0, b, f2, a, f1, f1_, g2, f2_
DUP5
MULFP254
// stack: af0_, bf0, b, f2, a, f1, f1_, g2, f2_
ADDFP254
// stack: g2_, b, f2, a, f1, f1_, g2, f2_
SWAP7
SWAP3
// stack: a, b, f2, f2_, f1, f1_, g2, g2_
%i9
// stack: d, c, f2, f2_, f1, f1_, g2, g2_
DUP4
DUP2
MULFP254
// stack: df2_, d, c, f2, f2_, f1, f1_, g2, g2_
DUP4
DUP4
MULFP254
// stack: cf2, df2_, d, c, f2, f2_, f1, f1_, g2, g2_
SUBFP254
// stack: g1, d, c, f2, f2_, f1, f1_, g2, g2_
SWAP5
SWAP3
// stack: f2, d, c, f1, f2_, g1, f1_, g2, g2_
DUP2
MULFP254
// stack: df2, d, c, f1, f2_, g1, f1_, g2, g2_
SWAP1
SWAP4
// stack: f2_, df2, c, f1, d, g1, f1_, g2, g2_
DUP3
MULFP254
// stack: cf2_, df2, c, f1, d, g1, f1_, g2, g2_
ADDFP254
// stack: g1_, c, f1, d, g1, f1_, g2, g2_
SWAP5
// stack: f1_, c, f1, d, g1, g1_, g2, g2_
DUP1
DUP5
MULFP254
// stack: df1_, f1_, c, f1, d, g1, g1_, g2, g2_
DUP4
DUP4
MULFP254
// stack: cf1, df1_, f1_, c, f1, d, g1, g1_, g2, g2_
SUBFP254
// stack: g0, f1_, c, f1, d, g1, g1_, g2, g2_
SWAP3
// stack: f1, f1_, c, g0, d, g1, g1_, g2, g2_
SWAP2
MULFP254
// stack: cf1_, f1, g0, d, g1, g1_, g2, g2_
SWAP3
MULFP254
// stack: df1, g0, cf1_, g1, g1_, g2, g2_
SWAP1
SWAP2
// stack: cf1_, df1, g0, g1, g1_, g2, g2_
ADDFP254
// stack: g0_, g0, g1, g1_, g2, g2_
SWAP1
// stack: g0, g0_, g1, g1_, g2, g2_
%endmacro
// cost: 9; note this returns y, x for the output x + yi
%macro i9
// stack: a , b
DUP2
// stack: b, a , b
DUP2
// stack: a , b, a , b
PUSH 9 MULFP254
// stack: 9a , b, a , b
SUBFP254
// stack: 9a - b, a , b
SWAP2
// stack: b , a, 9a - b
PUSH 9 MULFP254
// stack 9b , a, 9a - b
ADDFP254
// stack: 9b + a, 9a - b
%endmacro