From 784a4c0869c2400c8a15f1eecaef9d9f15704950 Mon Sep 17 00:00:00 2001
From: Dmitry Vagner
Date: Wed, 9 Nov 2022 19:39:49 -0800
Subject: [PATCH] scaling macros

---
 .../cpu/kernel/asm/fields/field_macros.asm | 273 ++++++++++++++++--
 evm/src/cpu/kernel/asm/fields/fp12_mul.asm |  12 +-
 2 files changed, 259 insertions(+), 26 deletions(-)

diff --git a/evm/src/cpu/kernel/asm/fields/field_macros.asm b/evm/src/cpu/kernel/asm/fields/field_macros.asm
index 02651a1b..2d12aead 100644
--- a/evm/src/cpu/kernel/asm/fields/field_macros.asm
+++ b/evm/src/cpu/kernel/asm/fields/field_macros.asm
@@ -2,9 +2,22 @@
     %add_const(6)
 %endmacro
 
+// cost: 2 loads + 2 dup/swaps + 1 add = 2*4 + 2*1 + 1*2 = 12
+%macro load_fp2
+    // stack:       ptr
+    DUP1 %add_const(1)
+    // stack: ind1, ptr
+    %mload_kernel_general
+    // stack:   x1, ptr
+    SWAP1
+    // stack: ind0, x1
+    %mload_kernel_general
+    // stack:   x0, x1
+%endmacro
+
 // cost: 6 loads + 6 dup/swaps + 5 adds = 6*4 + 6*1 + 5*2 = 40
 %macro load_fp6
-    // stack: ptr
+    // stack:       ptr
     DUP1 %add_const(4)
     // stack: ind4, ptr
     %mload_kernel_general
@@ -196,25 +209,6 @@
     // stack:
 %endmacro
 
-// cost: 9; note this returns y, x for the output x + yi
-%macro i9
-    // stack: a , b
-    DUP2
-    // stack: b, a, b
-    DUP2
-    // stack: a , b, a , b
-    PUSH 9 MULFP254
-    // stack: 9a , b, a , b
-    SUBFP254
-    // stack: 9a - b, a , b
-    SWAP2
-    // stack: b , a, 9a - b
-    PUSH 9 MULFP254
-    // stack 9b , a, 9a - b
-    ADDFP254
-    // stack: 9b + a, 9a - b
-%endmacro
-
 // cost: 6
 %macro dup_fp6_0
     // stack: f: 6
@@ -445,3 +439,242 @@
     SUBFP254
     // stack: h0, h1, h2, h3, h4, h5
 %endmacro
+
+// cost: 21
+%macro mul_fp_fp6
+    // stack: c, f0, f1, f2, f3, f4, f5
+    SWAP6
+    DUP7
+    MULFP254
+    SWAP6
+    // stack: c, f0, f1, f2, f3, f4, c * f5
+    SWAP5
+    DUP6
+    MULFP254
+    SWAP5
+    // stack: c, f0, f1, f2, f3, c * f4, c * f5
+    SWAP4
+    DUP5
+    MULFP254
+    SWAP4
+    // stack: c, f0, f1, f2, c * f3, c * f4, c * f5
+    SWAP3
+    DUP4
+    MULFP254
+    SWAP3
+    // stack: c, f0, f1, c * f2, c * f3, c * f4, c * f5
+    SWAP2
+    DUP3
+    MULFP254
+    SWAP2
+    // stack: c, f0, c * f1, c * f2, c * f3, c * f4, c * f5
+    MULFP254
+    // stack: c * f0, c * f1, c * f2, c * f3, c * f4, c * f5
+%endmacro
+
+/// cost: 1 i9 (9) + 16 dups + 15 swaps + 12 muls + 6 adds/subs = 58
+///
+/// G0 + G1t + G2t^2 = (a+bi)t * (F0 + F1t + F2t^2)
+///                  = (c+di)F2 + (a+bi)F0t + (a+bi)F1t^2
+/// where c+di = (a+bi)(9+i) = (9a-b) + (a+9b)i   (the identity relies on t^3 = 9+i)
+///
+/// G0 = (c+di)(f2+f2_i) = (cf2 - df2_) + (df2 + cf2_)i
+/// G1 = (a+bi)(f0+f0_i) = (af0 - bf0_) + (bf0 + af0_)i
+/// G2 = (a+bi)(f1+f1_i) = (af1 - bf1_) + (bf1 + af1_)i
+
+%macro mul_fp2_fp6_sh
+    // stack: a, b, f0, f0_, f1, f1_, f2, f2_
+    DUP6
+    DUP3
+    MULFP254
+    // stack: bf1_, a, b, f0, f0_, f1, f1_, f2, f2_
+    DUP6
+    DUP3
+    MULFP254
+    // stack: af1, bf1_, a, b, f0, f0_, f1, f1_, f2, f2_
+    SUBFP254
+    // stack: g2, a, b, f0, f0_, f1, f1_, f2, f2_
+    SWAP7
+    // stack: f2, a, b, f0, f0_, f1, f1_, g2, f2_
+    SWAP5
+    // stack: f1, a, b, f0, f0_, f2, f1_, g2, f2_
+    DUP3
+    MULFP254
+    // stack: bf1, a, b, f0, f0_, f2, f1_, g2, f2_
+    SWAP1
+    SWAP6
+    // stack: f1_, bf1, b, f0, f0_, f2, a, g2, f2_
+    DUP7
+    MULFP254
+    // stack: af1_, bf1, b, f0, f0_, f2, a, g2, f2_
+    ADDFP254
+    // stack: g2_, b, f0, f0_, f2, a, g2, f2_
+    SWAP7
+    // stack: f2_, b, f0, f0_, f2, a, g2, g2_
+    DUP4
+    DUP3
+    MULFP254
+    // stack: bf0_, f2_, b, f0, f0_, f2, a, g2, g2_
+    DUP4
+    DUP8
+    MULFP254
+    // stack: af0, bf0_, f2_, b, f0, f0_, f2, a, g2, g2_
+    SUBFP254
+    // stack: g1, f2_, b, f0, f0_, f2, a, g2, g2_
+    SWAP5
+    // stack: f2, f2_, b, f0, f0_, g1, a, g2, g2_
+    SWAP3
+    // stack: f0, f2_, b, f2, f0_, g1, a, g2, g2_
+    DUP3
+    MULFP254
+    // stack: bf0, f2_, b, f2, f0_, g1, a, g2, g2_
+    SWAP1
+    SWAP4
+    // stack: f0_, bf0, b, f2, f2_, g1, a, g2, g2_
+    DUP7
+    MULFP254
+    // stack: af0_, bf0, b, f2, f2_, g1, a, g2, g2_
+    ADDFP254
+    // stack: g1_, b, f2, f2_, g1, a, g2, g2_
+    SWAP5
+    // stack: a, b, f2, f2_, g1, g1_, g2, g2_
+    %i9
+    // stack: d, c, f2, f2_, g1, g1_, g2, g2_
+    DUP4
+    DUP2
+    MULFP254
+    // stack: df2_, d, c, f2, f2_, g1, g1_, g2, g2_
+    DUP4
+    DUP4
+    MULFP254
+    // stack: cf2, df2_, d, c, f2, f2_, g1, g1_, g2, g2_
+    SUBFP254
+    // stack: g0, d, c, f2, f2_, g1, g1_, g2, g2_
+    SWAP3
+    // stack: f2, d, c, g0, f2_, g1, g1_, g2, g2_
+    MULFP254
+    // stack: df2, c, g0, f2_, g1, g1_, g2, g2_
+    SWAP3
+    MULFP254
+    // stack: cf2_, g0, df2, g1, g1_, g2, g2_
+    SWAP1
+    SWAP2
+    // stack: df2, cf2_, g0, g1, g1_, g2, g2_
+    ADDFP254
+    // stack: g0_, g0, g1, g1_, g2, g2_
+    SWAP1
+    // stack: g0, g0_, g1, g1_, g2, g2_
+%endmacro
+
+/// cost: 1 i9 (9) + 16 dups + 17 swaps + 12 muls + 6 adds/subs = 60
+///
+/// G0 + G1t + G2t^2 = (a+bi)t^2 * (F0 + F1t + F2t^2)
+///                  = (c+di)F1 + (c+di)F2t + (a+bi)F0t^2
+/// where c+di = (a+bi)(9+i) = (9a-b) + (a+9b)i   (the identity relies on t^3 = 9+i)
+///
+/// G0 = (c+di)(f1+f1_i) = (cf1 - df1_) + (df1 + cf1_)i
+/// G1 = (c+di)(f2+f2_i) = (cf2 - df2_) + (df2 + cf2_)i
+/// G2 = (a+bi)(f0+f0_i) = (af0 - bf0_) + (bf0 + af0_)i
+
+%macro mul_fp2_fp6_sh2
+    // stack: a, b, f0, f0_, f1, f1_, f2, f2_
+    DUP4
+    DUP3
+    MULFP254
+    // stack: bf0_, a, b, f0, f0_, f1, f1_, f2, f2_
+    DUP4
+    DUP3
+    MULFP254
+    // stack: af0, bf0_, a, b, f0, f0_, f1, f1_, f2, f2_
+    SUBFP254
+    // stack: g2, a, b, f0, f0_, f1, f1_, f2, f2_
+    SWAP7
+    SWAP3
+    // stack: f0, a, b, f2, f0_, f1, f1_, g2, f2_
+    DUP3
+    MULFP254
+    // stack: bf0, a, b, f2, f0_, f1, f1_, g2, f2_
+    SWAP1
+    SWAP4
+    // stack: f0_, bf0, b, f2, a, f1, f1_, g2, f2_
+    DUP5
+    MULFP254
+    // stack: af0_, bf0, b, f2, a, f1, f1_, g2, f2_
+    ADDFP254
+    // stack: g2_, b, f2, a, f1, f1_, g2, f2_
+    SWAP7
+    SWAP3
+    // stack: a, b, f2, f2_, f1, f1_, g2, g2_
+    %i9
+    // stack: d, c, f2, f2_, f1, f1_, g2, g2_
+    DUP4
+    DUP2
+    MULFP254
+    // stack: df2_, d, c, f2, f2_, f1, f1_, g2, g2_
+    DUP4
+    DUP4
+    MULFP254
+    // stack: cf2, df2_, d, c, f2, f2_, f1, f1_, g2, g2_
+    SUBFP254
+    // stack: g1, d, c, f2, f2_, f1, f1_, g2, g2_
+    SWAP5
+    SWAP3
+    // stack: f2, d, c, f1, f2_, g1, f1_, g2, g2_
+    DUP2
+    MULFP254
+    // stack: df2, d, c, f1, f2_, g1, f1_, g2, g2_
+    SWAP1
+    SWAP4
+    // stack: f2_, df2, c, f1, d, g1, f1_, g2, g2_
+    DUP3
+    MULFP254
+    // stack: cf2_, df2, c, f1, d, g1, f1_, g2, g2_
+    ADDFP254
+    // stack: g1_, c, f1, d, g1, f1_, g2, g2_
+    SWAP5
+    // stack: f1_, c, f1, d, g1, g1_, g2, g2_
+    DUP1
+    DUP5
+    MULFP254
+    // stack: df1_, f1_, c, f1, d, g1, g1_, g2, g2_
+    DUP4
+    DUP4
+    MULFP254
+    // stack: cf1, df1_, f1_, c, f1, d, g1, g1_, g2, g2_
+    SUBFP254
+    // stack: g0, f1_, c, f1, d, g1, g1_, g2, g2_
+    SWAP3
+    // stack: f1, f1_, c, g0, d, g1, g1_, g2, g2_
+    SWAP2
+    MULFP254
+    // stack: cf1_, f1, g0, d, g1, g1_, g2, g2_
+    SWAP3
+    MULFP254
+    // stack: df1, g0, cf1_, g1, g1_, g2, g2_
+    SWAP1
+    SWAP2
+    // stack: cf1_, df1, g0, g1, g1_, g2, g2_
+    ADDFP254
+    // stack: g0_, g0, g1, g1_, g2, g2_
+    SWAP1
+    // stack: g0, g0_, g1, g1_, g2, g2_
+%endmacro
+
+// cost: 9; note this returns y, x for the output x + yi
+%macro i9
+    // stack: a, b
+    DUP2
+    // stack: b, a, b
+    DUP2
+    // stack: a, b, a, b
+    PUSH 9 MULFP254
+    // stack: 9a, b, a, b
+    SUBFP254
+    // stack: 9a - b, a, b
+    SWAP2
+    // stack: b, a, 9a - b
+    PUSH 9 MULFP254
+    // stack: 9b, a, 9a - b
+    ADDFP254
+    // stack: 9b + a, 9a - b
+%endmacro
\ No newline at end of file
diff --git a/evm/src/cpu/kernel/asm/fields/fp12_mul.asm b/evm/src/cpu/kernel/asm/fields/fp12_mul.asm
index 253103da..8f220e5b 100644
--- a/evm/src/cpu/kernel/asm/fields/fp12_mul.asm
+++ b/evm/src/cpu/kernel/asm/fields/fp12_mul.asm
@@ -329,7 +329,7 @@ global mul_fp12_sparse:
     // stack: inB, f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
     %mload_kernel_general
     // stack: g0 , f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
-    %mul_fp_fp12
+    %mul_fp_fp6
     // stack: g0 * f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
     %swap_fp6
     // stack: f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
@@ -337,7 +337,7 @@ global mul_fp12_sparse:
     // stack: inB2, f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
     %load_fp2
     // stack: G2 , f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
-    %mul_fp2_fp12_sh2
+    %mul_fp2_fp6_sh2
     // stack: G2 * sh2(f') , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
     %add_fp6
     // stack: G2 * sh2(f') + g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
@@ -347,7 +347,7 @@ global mul_fp12_sparse:
     // stack: inB1, f , inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
     %load_fp2
     // stack: G1 , f , inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
-    %mul_fp2_fp12_sh
+    %mul_fp2_fp6_sh
     // stack: G1 * sh(f), inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
     %add_fp6_hole
     // stack: G1 * sh(f) + G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
@@ -361,7 +361,7 @@ global mul_fp12_sparse:
     // stack: inB, f', f, inB, f', inA, inB, out
     %mload_kernel_general
     // stack: g0 , f', f, inB, f', inA, inB, out
-    %mul_fp_fp12
+    %mul_fp_fp6
     // stack: g0 * f', f, inB, f', inA, inB, out
     %swap_fp6
     // stack: f , g0 * f', inB, f', inA, inB, out
@@ -369,7 +369,7 @@ global mul_fp12_sparse:
     // stack: inB2, f , g0 * f', inB, f', inA, inB, out
     %load_fp2
     // stack: G2 , f , g0 * f', inB, f', inA, inB, out
-    %mul_fp2_fp12_sh
+    %mul_fp2_fp6_sh
     // stack: G2 * sh(f) , g0 * f', inB, f', inA, inB, out
     %add_fp6
     // stack: G2 * sh(f) + g0 * f', inB, f', inA, inB, out
@@ -379,7 +379,7 @@ global mul_fp12_sparse:
     // stack: inB1, f' , inB, G2 * sh(f) + g0 * f', inA, inB, out
     %load_fp2
     // stack: G1 , f' , inB, G2 * sh(f) + g0 * f', inA, inB, out
-    %mul_fp2_fp12_sh
+    %mul_fp2_fp6_sh
     // stack: G1 * sh(f'), inB, G2 * sh(f) + g0 * f', inA, inB, out
     %add_fp6_hole
     // stack: G1 * sh(f') + G2 * sh(f) + g0 * f', inA, inB, out