/// File: plonky2/evm/src/cpu/kernel/asm/fields/fp12_mul.asm

/// Note: uncomment this to test
/// global test_mul_fp12:
/// // stack: f, inA , f', g, inB , g', inB, out, inA, out
/// DUP7
/// // stack: inA, f, inA , f', g, inB , g', inB, out, inA, out
/// %store_fp6
/// // stack: inA , f', g, inB , g', inB, out, inA, out
/// %offset_fp6
/// // stack: inA', f', g, inB , g', inB, out, inA, out
/// %store_fp6
/// // stack: g, inB , g', inB, out, inA, out
/// DUP7
/// // stack: inB, g, inB , g', inB, out, inA, out
/// %store_fp6
/// // stack: inB , g', inB, out, inA, out
/// %offset_fp6
/// // stack: inB', g', inB, out, inA, out
/// %store_fp6
/// // stack: inB, out, inA, out
/// PUSH ret_stack
/// // stack: ret_stack, inB, out, inA, out
/// SWAP3
/// // stack: inA, inB, out, ret_stack, out
/// %jump(mul_fp12)
/// ret_stack:
/// // stack: out
/// DUP1 %offset_fp6
/// // stack: out', out
/// %load_fp6
/// // stack: h', out
/// DUP7
/// // stack: out, h', out
/// %load_fp6
/// // stack: h, h', out
/// %jump(0xdeadbeef)

/// fp6 functions:
/// fn | num | ops | cost
/// -------------------------
/// load | 8 | 40 | 320
/// store | 5 | 40 | 200
/// dup | 5 | 6 | 30
/// swap | 4 | 16 | 64
/// add | 4 | 16 | 64
/// subr | 1 | 17 | 17
/// mul | 3 | 157 | 471
/// i9 | 1 | 9 | 9
///
/// lone stack operations:
/// op | num
/// ------------
/// ADD | 3
/// SWAP | 2
/// DUP | 6
/// PUSH | 6
/// POP | 2
/// JUMP | 6
///
/// TOTAL: 1201

/// inputs:
/// F = f + f'z
/// G = g + g'z
///
/// output:
/// H = h + h'z = FG
///
/// h = fg + sh(f'g')
/// h' = (f+f')(g+g') - fg - f'g'
///
/// memory pointers [ind' = ind+6]
/// {inA: f, inA': f', inB: g, inB': g', out: h, out': h'}
///
/// f, f', g, g' consist of six elements on the stack

global mul_fp12:
    // Multiplies F = f + f'z (read through inA) by G = g + g'z (read through inB)
    // and writes h to out and h' to out' = %offset_fp6(out).
    // Uses three calls to mul_fp6: f'g', fg and (f+f')(g+g').
    // The stack item directly below `out` is consumed by the final JUMP as the
    // return destination.
    // The {addr: value} annotations in the stack comments track the fp6 values
    // parked by %store_fp6_sh(0), %store_fp6(6) and %store_fp6(12).
    // stack:                                                inA, inB, out
    DUP1  %offset_fp6
    // stack:                                          inA', inA, inB, out
    %load_fp6
    // stack:                                            f', inA, inB, out
    DUP8  %offset_fp6
    // stack:                                      inB', f', inA, inB, out
    %load_fp6
    // stack:                                        g', f', inA, inB, out
    PUSH ret_1
    // stack:                                 ret_1, g', f', inA, inB, out
    %dup_fp6_7
    // stack:                             f', ret_1, g', f', inA, inB, out
    %dup_fp6_7
    // stack:                         g', f', ret_1, g', f', inA, inB, out
    %jump(mul_fp6)
ret_1:
    // stack:                                f'g', g' , f', inA, inB, out
    %dup_fp6_0
    // stack:                          f'g', f'g', g' , f', inA, inB, out
    %store_fp6_sh(0)
    // stack:                                f'g', g' , f', inA, inB, out  {0: sh(f'g')}
    %store_fp6(6)
    // stack:                                      g' , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    DUP13
    // stack:                                 inA, g' , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    DUP15
    // stack:                            inB, inA, g' , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    %load_fp6
    // stack:                            g  , inA, g' , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    %swap_fp6_hole
    // stack:                            g' , inA, g  , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    %dup_fp6_7
    // stack:                         g, g' , inA, g  , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    %add_fp6
    // stack:                          g+g' , inA, g  , f', inA, inB, out  {0: sh(f'g'), 6: f'g'}
    %swap_fp6_hole
    // stack:                          g, inA, g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g'}
    PUSH ret_2
    // stack:                   ret_2, g, inA, g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g'}
    SWAP7
    // stack:                   inA, g, ret_2, g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g'}
    %load_fp6
    // stack:                     f, g, ret_2, g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g'}
    %jump(mul_fp6)
ret_2:
    // stack:                              fg, g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g'}
    %store_fp6(12)
    // stack:                                  g+g', f', inA, inB, out     {0: sh(f'g'), 6: f'g', 12: fg}
    %swap_fp6
    // stack:                                  f', g+g', inA, inB, out     {0: sh(f'g'), 6: f'g', 12: fg}
    PUSH ret_3
    // stack:                           ret_3, f', g+g', inA, inB, out     {0: sh(f'g'), 6: f'g', 12: fg}
    SWAP13
    // stack:                           inA, f', g+g', ret_3, inB, out     {0: sh(f'g'), 6: f'g', 12: fg}
    %load_fp6
    // stack:                           f, f', g+g', ret_3, inB, out       {0: sh(f'g'), 6: f'g', 12: fg}
    %add_fp6
    // stack:                           f+f', g+g', ret_3, inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %jump(mul_fp6)
ret_3:
    // stack:                                (f+f')(g+g'), inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %load_fp6(12)
    // stack:                            fg, (f+f')(g+g'), inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %swap_fp6
    // stack:                            (f+f')(g+g'), fg, inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %dup_fp6_6
    // stack:                        fg, (f+f')(g+g'), fg, inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %load_fp6(6)
    // stack:                  f'g', fg, (f+f')(g+g'), fg, inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %add_fp6
    // stack:                   f'g'+fg, (f+f')(g+g'), fg, inB, out        {0: sh(f'g'), 6: f'g', 12: fg}
    %subr_fp6
    // stack:               (f+f')(g+g') - (f'g'+fg), fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    DUP14  %offset_fp6
    // stack:         out', (f+f')(g+g') - (f'g'+fg), fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    %store_fp6
    // stack:                                         fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    %load_fp6(0)
    // stack:                              sh(f'g') , fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    %add_fp6
    // stack:                              sh(f'g') + fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    DUP8
    // stack:                         out, sh(f'g') + fg, inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    %store_fp6
    // stack:                                             inB, out         {0: sh(f'g'), 6: f'g', 12: fg}
    %pop2
    // The item now on top is the caller's return destination.
    JUMP

/// fp6 functions:
/// fn | num | ops | cost
/// -------------------------
/// load | 2 | 40 | 80
/// store | 2 | 40 | 80
/// dup | 2 | 6 | 12
/// swap | 2 | 16 | 32
/// add | 1 | 16 | 16
/// mul | 1 | 157 | 157
/// sq | 2 | |
/// dbl | 1 | 13 | 13
///
/// lone stack operations:
/// op | num
/// ------------
/// ADD | 3
/// SWAP | 4
/// DUP | 5
/// PUSH | 6
/// POP | 3
/// JUMP | 4
///
/// TOTAL:
/// input:
/// F = f + f'z
///
/// output:
/// H = h + h'z = FF
///
/// h = ff + sh(f'f')
/// h' = 2ff'
///
/// memory pointers [ind' = ind+6]
/// {inp: f, inp': f', out: h, out': h'}
///
/// f, f' consist of six elements on the stack
global square_fp12:
    // Squares F = f + f'z (read through inp) and writes
    //     h  = ff + sh(f'f')   to out
    //     h' = 2ff'            to out' = %offset_fp6(out)
    // The stack item directly below `out` is consumed by the final JUMP as the
    // return destination.
    // Addresses 0..5 (as addressed by %store_fp6_sh(0) / %load_fp6(0)) are used
    // as scratch to apply `sh` to f'f'.
    // stack:                                                   inp, out
    // Load f itself here: the pointer must NOT be offset, otherwise f' would be
    // loaded twice and f would never enter the computation.
    DUP1
    // stack:                                              inp, inp, out
    %load_fp6
    // stack:                                                f, inp, out
    PUSH post_sq2
    // stack:                                      post_sq2, f, inp, out
    SWAP7
    // stack:                                      inp, f, post_sq2, out
    PUSH post_sq1
    // stack:                            post_sq1, inp, f, post_sq2, out
    %dup_fp6_2
    // stack:                        f , post_sq1, inp, f, post_sq2, out
    DUP16  %offset_fp6
    // stack:                  out', f , post_sq1, inp, f, post_sq2, out
    PUSH post_mul
    // stack:        post_mul, out', f , post_sq1, inp, f, post_sq2, out
    DUP10  %offset_fp6
    // stack:  inp', post_mul, out', f , post_sq1, inp, f, post_sq2, out
    %load_fp6
    // stack:  f'  , post_mul, out', f , post_sq1, inp, f, post_sq2, out
    %swap_fp6_hole_2
    // stack:  f   , post_mul, out', f', post_sq1, inp, f, post_sq2, out
    %dup_fp6_8
    // stack:  f', f, post_mul, out', f', post_sq1, inp, f, post_sq2, out
    %jump(mul_fp6)
post_mul:
    // stack:             f'f, out', f', post_sq1, inp, f, post_sq2, out
    DUP7
    // stack:       out', f'f, out', f', post_sq1, inp, f, post_sq2, out
    %store_fp6_double
    // stack:                  out', f', post_sq1, inp, f, post_sq2, out
    POP
    // stack:                        f', post_sq1, inp, f, post_sq2, out
    %jump(square_fp6)
post_sq1:
    // stack:                                f'f', inp, f, post_sq2, out
    // Apply `sh` to f'f' by round-tripping through scratch memory, so that the
    // final sum is ff + sh(f'f') as the formula for h requires.
    %store_fp6_sh(0)
    // stack:                                      inp, f, post_sq2, out  {0: sh(f'f')}
    %load_fp6(0)
    // stack:                            sh(f'f'), inp, f, post_sq2, out
    %swap_fp6_hole
    // stack:                            f, inp, sh(f'f'), post_sq2, out
    // Exchange inp and post_sq2 so that the return label sits directly under f.
    SWAP6 SWAP13 SWAP6
    // stack:                            f, post_sq2, sh(f'f'), inp, out
    %jump(square_fp6)
post_sq2:
    // stack:                                   ff , sh(f'f'), inp, out
    %add_fp6
    // stack:                                   ff + sh(f'f'), inp, out
    DUP8
    // stack:                              out, ff + sh(f'f'), inp, out
    %store_fp6
    // stack:                                                  inp, out
    %pop2
    // The item now on top is the caller's return destination.
    JUMP
/// fp6 functions:
/// fn | num | ops | cost
/// -------------------------
/// load | 2 | 40 | 80
/// store | 2 | 40 | 80
/// dup | 2 | 6 | 12
/// swap | 2 | 16 | 32
/// add | 1 | 16 | 16
/// mul | 1 | 157 | 157
/// sq | 2 | |
/// dbl | 1 | 13 | 13
///
/// lone stack operations:
/// op | num
/// ------------
/// ADD | 3
/// SWAP | 4
/// DUP | 5
/// PUSH | 6
/// POP | 3
/// JUMP | 4
///
/// TOTAL:
/// input:
/// F = f + f'z
/// G = g0 + (G1)t + (G2)tz
///
/// output:
/// H = h + h'z = FG
/// = g0 * [f + f'z] + G1 * [sh(f) + sh(f')z] + G2 * [sh2(f') + sh(f)z]
///
/// h = g0 * f + G1 * sh(f ) + G2 * sh2(f')
/// h' = g0 * f' + G1 * sh(f') + G2 * sh (f )
///
/// memory pointers [ind' = ind+6, inB2 = inB1 + 2 = inB + 3]
/// { inA: f, inA': f', inB: g0, inB1: G1, inB2: G2, out: h, out': h'}
///
/// f, f' consist of six elements; G1, G2 consist of two elements; and g0 of one element
global mul_fp12_sparse:
    // Multiplies F = f + f'z (read through inA) by the sparse element
    // G = g0 + (G1)t + (G2)tz (g0 at inB, G1 at inB+1, G2 at inB+3) and writes
    //     h  = g0 * f  + G1 * sh(f ) + G2 * sh2(f')   to out
    //     h' = g0 * f' + G1 * sh(f') + G2 * sh (f )   to out' = %offset_fp6(out)
    // The stack item directly below `out` is consumed by the final JUMP as the
    // return destination.
    // NOTE(review): the DUP offsets below rely on %add_fp6_hole consuming the
    // single "hole" element between its two fp6 operands (f, X, g -> f+g), which
    // is what the stack comments show -- confirm against the macro definition.
    // stack:                                                                 inA, inB, out
    DUP1  %offset_fp6
    // stack:                                                           inA', inA, inB, out
    %load_fp6
    // stack:                                                             f', inA, inB, out
    DUP8
    // stack:                                                        inB, f', inA, inB, out
    DUP8
    // stack:                                                   inA, inB, f', inA, inB, out
    %load_fp6
    // stack:                                                     f, inB, f', inA, inB, out
    DUP16
    // stack:                                                out, f, inB, f', inA, inB, out
    %dup_fp6_8
    // stack:                                            f', out, f, inB, f', inA, inB, out
    DUP14
    // stack:                                       inB, f', out, f, inB, f', inA, inB, out
    %dup_fp6_8
    // stack:                                    f, inB, f', out, f, inB, f', inA, inB, out
    DUP7
    // stack:                               inB, f, inB, f', out, f, inB, f', inA, inB, out
    %dup_fp6_8
    // stack:                           f', inB, f, inB, f', out, f, inB, f', inA, inB, out
    %dup_fp6_7
    // stack:                        f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
    DUP13
    // stack:                   inB, f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
    %mload_kernel_general
    // stack:                   g0 , f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
    %mul_fp_fp6
    // stack:                   g0 * f, f', inB, f, inB, f', out, f, inB, f', inA, inB, out
    %swap_fp6
    // stack:                  f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
    DUP13  %add_const(3)
    // stack:            inB2, f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
    %load_fp2
    // stack:            G2  , f' , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
    %mul_fp2_fp6_sh2
    // stack:        G2 * sh2(f') , g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
    %add_fp6
    // stack:        G2 * sh2(f') + g0 * f, inB, f, inB, f', out, f, inB, f', inA, inB, out
    %swap_fp6_hole
    // stack:     f , inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    DUP7  %add_const(1)
    // stack: inB1, f , inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    %load_fp2
    // stack: G1 , f , inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    %mul_fp2_fp6_sh
    // stack: G1 * sh(f), inB, G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    %add_fp6_hole
    // stack:      G1 * sh(f) + G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    DUP14
    // stack: out, G1 * sh(f) + G2 * sh2(f') + g0 * f, inB, f', out, f, inB, f', inA, inB, out
    %store_fp6
    // stack:                                          inB, f', out, f, inB, f', inA, inB, out
    // Second half: h'. The copies left above are (inB, f', out, f), in that
    // order, so inB is consumed directly as the address of g0 and the spare
    // `out` serves as the hole element for the hole-aware macros below.
    %mload_kernel_general
    // stack:                                          g0 , f', out, f, inB, f', inA, inB, out
    %mul_fp_fp6
    // stack:                                          g0 * f', out, f, inB, f', inA, inB, out
    %swap_fp6_hole
    // stack:                                          f , out, g0 * f', inB, f', inA, inB, out
    DUP14  %add_const(3)
    // stack:                                    inB2, f , out, g0 * f', inB, f', inA, inB, out
    %load_fp2
    // stack:                                    G2  , f , out, g0 * f', inB, f', inA, inB, out
    %mul_fp2_fp6_sh
    // stack:                                  G2 * sh(f) , out, g0 * f', inB, f', inA, inB, out
    %add_fp6_hole
    // stack:                                  G2 * sh(f) + g0 * f', inB, f', inA, inB, out
    %swap_fp6_hole
    // stack:                                  f' , inB, G2 * sh(f) + g0 * f', inA, inB, out
    DUP7  %add_const(1)
    // stack:                            inB1, f' , inB, G2 * sh(f) + g0 * f', inA, inB, out
    %load_fp2
    // stack:                            G1  , f' , inB, G2 * sh(f) + g0 * f', inA, inB, out
    %mul_fp2_fp6_sh
    // stack:                         G1 * sh(f'), inB, G2 * sh(f) + g0 * f', inA, inB, out
    %add_fp6_hole
    // stack:                         G1 * sh(f') + G2 * sh(f) + g0 * f', inA, inB, out
    DUP9  %offset_fp6
    // stack:                   out', G1 * sh(f') + G2 * sh(f) + g0 * f', inA, inB, out
    %store_fp6
    // stack:                                                             inA, inB, out
    %pop3
    // Return to the caller; the item now on top is the return destination.
    JUMP
/// global mul_fp12_sparse_fast:
/// // stack: inA, inB, out
/// DUP2
/// // stack: inB, inA, inB, out
/// %load_fp12_sparse
/// // stack: g0, G1, G1', inA, inB, out
/// DUP6 %offset_fp6
/// // stack: inA', g0, G1, G1', inA, inB, out
/// %load_fp6
/// // stack: f', g0, G1, G1', inA, inB, out
/// DUP12
/// // stack: inA, f', g0, G1, G1', inA, inB, out
/// %load_fp6
/// // stack: f, f', g0, G1, G1', inA, inB, out
/// %clone_mul_fp_fp6
/// // stack: (g0)f, f, f', g0, G1, G1', inA, inB, out
/// %clone_mul_fp2_fp6_sh
/// // stack: (G1)sh(f) , (g0)f, f, f', g0, G1, G1', inA, inB, out
/// %add_fp6
/// // stack: (G1)sh(f) + (g0)f, f, f', g0, G1, G1', inA, inB, out
/// %clone_mul_fp2_fp6_sh2
/// // stack: (G1')sh2(f') , (G1)sh(f) + (g0)f, f, f', g0, G1, G1', inA, inB, out
/// %add_fp6
/// // stack: (G1')sh2(f') + (G1)sh(f) + (g0)f, f, f', g0, G1, G1', inA, inB, out
/// DUP26
/// // stack: out, (G1')sh2(f') + (G1)sh(f) + (g0)f, f, f', g0, G1, G1', inA, inB, out
/// %store_fp6
/// // stack: f, f', g0, G1, G1', inA, inB, out
/// %semiclone_mul_fp2_fp6_sh
/// // stack: (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// %clone_mul_fp2_fp6_sh
/// // stack: (G1)sh(f') , (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// %add_fp6
/// // stack: (G1)sh(f') + (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// %clone_mul_fp_fp6
/// // stack: (g0)f' , (G1)sh(f') + (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// %add_fp6
/// // stack: (g0)f' + (G1)sh(f') + (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// DUP20 %offset_fp6
/// // stack: out', (g0)f' + (G1)sh(f') + (G1')sh(f), f', g0, G1, G1', inA, inB, out
/// %store_fp6
/// // stack: f', g0, G1, G1', inA, inB, out
/// %pop14