448 lines
9.0 KiB
NASM
Raw Normal View History

2022-10-07 15:41:46 -07:00
// Push the six words found at $offset, $offset+1, ..., $offset+5 so that the
// word from $offset (x0) ends up on top of the stack and x5 at the bottom.
// The words are fetched from the highest index downwards, which means no
// reordering is needed once they are all on the stack.
// NOTE(review): %mload_kernel_general is defined outside this file; going by
// the stack comments it pops an offset and pushes the word read from there.
// cost: 6 loads + 6 offsets + 5 adds = 6*4 + 6*1 + 5*2 = 40
%macro load_fp6(offset)
    // stack: (nothing of ours yet)

    // x5 <- word at $offset + 5
    PUSH $offset
    %add_const(5)
    %mload_kernel_general
    // stack:                     x5

    // x4 <- word at $offset + 4
    PUSH $offset
    %add_const(4)
    %mload_kernel_general
    // stack:                 x4, x5

    // x3 <- word at $offset + 3
    PUSH $offset
    %add_const(3)
    %mload_kernel_general
    // stack:             x3, x4, x5

    // x2 <- word at $offset + 2
    PUSH $offset
    %add_const(2)
    %mload_kernel_general
    // stack:         x2, x3, x4, x5

    // x1 <- word at $offset + 1
    PUSH $offset
    %add_const(1)
    %mload_kernel_general
    // stack:     x1, x2, x3, x4, x5

    // x0 <- word at $offset (no add needed, hence only 5 adds in the cost)
    PUSH $offset
    %mload_kernel_general
    // stack: x0, x1, x2, x3, x4, x5
%endmacro
// Pop six words x0 (top) .. x5 and write x_k to $offset + k for k = 0..5.
// This is the mirror image of load_fp6: the top of the stack goes to the
// lowest index.
// NOTE(review): %mstore_kernel_general is defined outside this file; going by
// the stack comments it pops an offset and then the value to be written.
// cost: 40
%macro store_fp6(offset)
    // stack: x0, x1, x2, x3, x4, x5

    // word at $offset <- x0 (no add needed for the base index)
    PUSH $offset
    %mstore_kernel_general
    // stack:     x1, x2, x3, x4, x5

    // word at $offset + 1 <- x1
    PUSH $offset
    %add_const(1)
    %mstore_kernel_general
    // stack:         x2, x3, x4, x5

    // word at $offset + 2 <- x2
    PUSH $offset
    %add_const(2)
    %mstore_kernel_general
    // stack:             x3, x4, x5

    // word at $offset + 3 <- x3
    PUSH $offset
    %add_const(3)
    %mstore_kernel_general
    // stack:                 x4, x5

    // word at $offset + 4 <- x4
    PUSH $offset
    %add_const(4)
    %mstore_kernel_general
    // stack:                     x5

    // word at $offset + 5 <- x5
    PUSH $offset
    %add_const(5)
    %mstore_kernel_general
    // stack: (all six words consumed)
%endmacro
2022-10-18 11:23:43 -04:00
// cost: store (40) + i9 (9) = 49
2022-10-07 18:13:32 -07:00
// "Shifted" store: pop x0 (top) .. x5 and write them rotated by one
// coefficient pair.
//   (x0, x1)      -> $offset + 2, $offset + 3
//   (x2, x3)      -> $offset + 4, $offset + 5
//   i9(x4 + x5 i) -> $offset + 0, $offset + 1
// With the (X0, X1, X2) coefficient layout and the i9 wrap-around used by
// mul_fp6 in this file, this stores the product of the input with t.
%macro store_fp6_sh(offset)
    // stack: x0, x1, x2, x3, x4, x5

    // word at $offset + 2 <- x0
    PUSH $offset
    %add_const(2)
    %mstore_kernel_general
    // stack:     x1, x2, x3, x4, x5

    // word at $offset + 3 <- x1
    PUSH $offset
    %add_const(3)
    %mstore_kernel_general
    // stack:         x2, x3, x4, x5

    // word at $offset + 4 <- x2
    PUSH $offset
    %add_const(4)
    %mstore_kernel_general
    // stack:             x3, x4, x5

    // word at $offset + 5 <- x3
    PUSH $offset
    %add_const(5)
    %mstore_kernel_general
    // stack:                 x4, x5

    // The last pair wraps around to the front and picks up a factor 9 + i.
    // %i9 leaves the imaginary part on top: y5 = 9*x5 + x4, y4 = 9*x4 - x5.
    %i9
    // stack:                 y5, y4

    // word at $offset + 1 <- y5 (imaginary part)
    PUSH $offset
    %add_const(1)
    %mstore_kernel_general
    // stack:                     y4

    // word at $offset <- y4 (real part)
    PUSH $offset
    %mstore_kernel_general
    // stack: (all six words consumed)
%endmacro
2022-10-18 12:47:31 -04:00
// Multiply a + bi by 9 + i:
//   (a + bi)(9 + i) = (9a - b) + (9b + a)i
// Input : a (real, top), b (imaginary)
// Output: 9b + a (imaginary, top), 9a - b (real)
// Note the output order is swapped with respect to the input: the imaginary
// part is returned on top.
// cost: 9
%macro i9
    // stack:             a, b

    // Keep a copy of both inputs for the second half.
    DUP2
    DUP2
    // stack:       a, b, a, b

    // Real part: 9a - b (SUBFP254 yields top minus second).
    PUSH 9
    MULFP254
    SUBFP254
    // stack:    9a - b, a, b

    // Bring b to the top and park the finished real part underneath.
    SWAP2
    // stack:    b, a, 9a - b

    // Imaginary part: 9b + a.
    PUSH 9
    MULFP254
    ADDFP254
    // stack: 9b + a, 9a - b
%endmacro
2022-10-07 15:41:46 -07:00
// Duplicate the six-word element on top of the stack.
// Each DUP6 copies the word that is currently sixth from the top, so the six
// copies are produced from the last word (f5) to the first (f0) and end up in
// the original order.
// cost: 6
%macro dup1_fp6
    // stack:       f: 6
    DUP6
    DUP6
    DUP6
    DUP6
    DUP6
    DUP6
    // stack: f: 6, f: 6
%endmacro
// Copy the second six-word element (g, words 7..12 from the top) on top of
// the stack, leaving both f and the original g in place underneath.
// Each DUP12 copies the word that is currently twelfth from the top, so the
// copies are produced from g5 to g0 and end up in the original order.
// cost: 6
%macro dup2_fp6
    // stack:       f: 6, g: 6
    DUP12
    DUP12
    DUP12
    DUP12
    DUP12
    DUP12
    // stack: g: 6, f: 6, g: 6
%endmacro
// Exchange the two six-word elements on top of the stack, word by word.
// Word k of f sits at depth k and word k of g at depth k + 6, so each pair is
// exchanged with SWAPk / SWAP(k+6) / SWAPk; for k = 0 a single SWAP6 suffices.
// cost: 1 + 5*3 = 16
%macro swap_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5

    // f0 <-> g0
    SWAP6
    // stack: g0, f1, f2, f3, f4, f5, f0, g1, g2, g3, g4, g5

    // f1 <-> g1
    SWAP1
    SWAP7
    SWAP1
    // stack: g0, g1, f2, f3, f4, f5, f0, f1, g2, g3, g4, g5

    // f2 <-> g2
    SWAP2
    SWAP8
    SWAP2
    // stack: g0, g1, g2, f3, f4, f5, f0, f1, f2, g3, g4, g5

    // f3 <-> g3
    SWAP3
    SWAP9
    SWAP3
    // stack: g0, g1, g2, g3, f4, f5, f0, f1, f2, f3, g4, g5

    // f4 <-> g4
    SWAP4
    SWAP10
    SWAP4
    // stack: g0, g1, g2, g3, g4, f5, f0, f1, f2, f3, f4, g5

    // f5 <-> g5
    SWAP5
    SWAP11
    SWAP5
    // stack: g0, g1, g2, g3, g4, g5, f0, f1, f2, f3, f4, f5
%endmacro
// Component-wise addition of two six-word elements: h_k = f_k + g_k.
// f0 is used as a travelling placeholder: SWAP7 exchanges it with the next
// g_k, which lands on top of f_k; ADDFP254 forms h_k; SWAP6 then puts h_k into
// its final slot and brings f0 back to the top. f0 + g0 is added last.
// cost: 5*3 + 1 = 16
%macro add_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5

    // h1 = f1 + g1
    SWAP7
    ADDFP254
    SWAP6
    // stack: f0, f2, f3, f4, f5, g0, h1, g2, g3, g4, g5

    // h2 = f2 + g2
    SWAP7
    ADDFP254
    SWAP6
    // stack: f0, f3, f4, f5, g0, h1, h2, g3, g4, g5

    // h3 = f3 + g3
    SWAP7
    ADDFP254
    SWAP6
    // stack: f0, f4, f5, g0, h1, h2, h3, g4, g5

    // h4 = f4 + g4
    SWAP7
    ADDFP254
    SWAP6
    // stack: f0, f5, g0, h1, h2, h3, h4, g5

    // h5 = f5 + g5
    SWAP7
    ADDFP254
    SWAP6
    // stack: f0, g0, h1, h2, h3, h4, h5

    // h0 = f0 + g0
    ADDFP254
    // stack: h0, h1, h2, h3, h4, h5
%endmacro
2022-10-19 10:31:43 -04:00
// *reversed argument subtraction*: component-wise h_k = g_k - f_k, where f is
// the element on top of the stack and g the one below it ("bus" = "sub"
// spelled backwards).
// Same shuffling scheme as add_fp6: SWAP7 puts g_k on top of f_k, and since
// SUBFP254 yields top minus second this gives g_k - f_k. The final pair needs
// an extra SWAP1 to get g0 on top of f0, hence one opcode more than add_fp6.
// cost: 5*3 + 2 = 17
%macro bus_fp6
    // stack: f0, f1, f2, f3, f4, f5, g0, g1, g2, g3, g4, g5

    // h1 = g1 - f1
    SWAP7
    SUBFP254
    SWAP6
    // stack: f0, f2, f3, f4, f5, g0, h1, g2, g3, g4, g5

    // h2 = g2 - f2
    SWAP7
    SUBFP254
    SWAP6
    // stack: f0, f3, f4, f5, g0, h1, h2, g3, g4, g5

    // h3 = g3 - f3
    SWAP7
    SUBFP254
    SWAP6
    // stack: f0, f4, f5, g0, h1, h2, h3, g4, g5

    // h4 = g4 - f4
    SWAP7
    SUBFP254
    SWAP6
    // stack: f0, f5, g0, h1, h2, h3, h4, g5

    // h5 = g5 - f5
    SWAP7
    SUBFP254
    SWAP6
    // stack: f0, g0, h1, h2, h3, h4, h5

    // h0 = g0 - f0
    SWAP1
    SUBFP254
    // stack: h0, h1, h2, h3, h4, h5
%endmacro
2022-10-18 00:21:12 -04:00
// Multiply the two six-word elements on top of the stack: E = C * D, with the
// reduction rule t^3 = 9 + i (applied through the "i9" terms below).
// All twelve input words stay on the stack while the products are formed;
// each finished output word is swapped into the slot of an input word that is
// still needed, which brings that input to the top until it is consumed.
// cost: 57 (E0, E0_) + 57 (E1, E1_) + 42 (E2, E2_) = 156
%macro mul_fp6
    /// C = C0 + C1t + C2t^2
    ///   = (c0 + c0_i) + (c1 + c1_i)t + (c2 + c2_i)t^2
    ///
    /// D = D0 + D1t + D2t^2
    ///   = (d0 + d0_i) + (d1 + d1_i)t + (d2 + d2_i)t^2
    ///
    /// E = E0 + E1t + E2t^2 = CD
    ///
    /// initial stack: c0, c0_, c1, c1_, c2, c2_, d0, d0_, d1, d1_, d2, d2_
    /// final   stack: e0, e0_, e1, e1_, e2, e2_

    /// E0 = C0D0 + i9(C1D2 + C2D1)
    ///
    /// C0D0 = (c0d0 - c0_d0_) + (c0d0_ + c0_d0)i
    ///
    /// C1D2 = (c1d2 - c1_d2_) + (c1d2_ + c1_d2)i
    /// C2D1 = (c2d1 - c2_d1_) + (c2d1_ + c2_d1)i
    ///
    /// CDX  = C1D2 + C2D1
    ///      = (c1d2 + c2d1 - c1_d2_ - c2_d1_) + (c1d2_ + c1_d2 + c2d1_ + c2_d1)i
    ///
    /// i9(CDX) = (9CDX - CDX_) + (CDX + 9CDX_)i
    ///
    /// E0  = 9CDX - CDX_ + C0D0
    /// E0_ = 9CDX_ + CDX + C0D0_

    // make CDX_ = c1d2_ + c1_d2 + c2d1_ + c2_d1
    DUP12
    DUP4
    MULFP254
    DUP12
    DUP6
    MULFP254
    ADDFP254
    DUP11
    DUP7
    MULFP254
    ADDFP254
    DUP10
    DUP8
    MULFP254
    ADDFP254
    // stack: CDX_, C: 6, D: 6

    // make C0D0_ = c0d0_ + c0_d0
    DUP9
    DUP3
    MULFP254
    DUP9
    DUP5
    MULFP254
    ADDFP254
    // stack: C0D0_, CDX_, C: 6, D: 6

    // make CDX = c1d2 + c2d1 - c1_d2_ - c2_d1_
    // (the subtracted pair is built first, then the added pair, then SUBFP254)
    DUP12
    DUP9
    MULFP254
    DUP15
    DUP8
    MULFP254
    ADDFP254
    DUP14
    DUP7
    MULFP254
    DUP13
    DUP10
    MULFP254
    ADDFP254
    SUBFP254
    // stack: CDX, C0D0_, CDX_, C: 6, D: 6

    // make C0D0 = c0d0 - c0_d0_
    DUP11
    DUP6
    MULFP254
    DUP11
    DUP6
    MULFP254
    SUBFP254
    // stack: C0D0 , CDX , C0D0_, CDX_, C: 6, D: 6

    DUP4
    DUP3
    // stack: CDX , CDX_ , C0D0 , CDX , C0D0_, CDX_, C: 6, D: 6
    PUSH 9
    MULFP254
    SUBFP254
    ADDFP254
    // stack: E0 = 9CDX - CDX_ + C0D0 , CDX , C0D0_, CDX_, C: 6, D: 6

    // Park E0 in the bottom slot (d2_ comes up), then bring CDX_ to the top.
    SWAP15
    SWAP3
    // stack: CDX_ , CDX , C0D0_, d2_, C: 6, d0, d0_, d1, d1_, d2, E0
    PUSH 9
    MULFP254
    ADDFP254
    ADDFP254
    // stack: E0_ = 9CDX_ + CDX + C0D0_, d2_, C: 6, d0, d0_, d1, d1_, d2, E0

    // Park E0_ in the slot of d0_, which comes to the top.
    SWAP9
    // stack: d0_, d2_, c0, c0_, c1, c1_, c2, c2_, d0, E0_, d1, d1_, d2, E0

    /// E1 = C0D1 + C1D0 + i9(C2D2)
    ///
    /// C0D1 = (c0d1 - c0_d1_) + (c0d1_ + c0_d1)i
    /// C1D0 = (c1d0 - c1_d0_) + (c1d0_ + c1_d0)i
    ///
    /// CD01  = c0d1 + c1d0 - (c0_d1_ + c1_d0_)
    /// CD01_ = c0d1_ + c0_d1 + c1d0_ + c1_d0
    ///
    /// C2D2     = (c2d2 - c2_d2_) + (c2d2_ + c2_d2)i
    /// i9(C2D2) = (9C2D2 - C2D2_) + (C2D2 + 9C2D2_)i
    ///
    /// E1  = 9C2D2 - C2D2_ + CD01
    /// E1_ = C2D2 + 9C2D2_ + CD01_

    // make C2D2_ = c2d2_ + c2_d2
    DUP13
    DUP9
    MULFP254
    DUP3
    DUP9
    MULFP254
    ADDFP254

    // make C2D2 = c2d2 - c2_d2_
    DUP3
    DUP10
    MULFP254
    DUP15
    DUP10
    MULFP254
    SUBFP254

    // make CD01 = c0d1 + c1d0 - (c0_d1_ + c1_d0_)
    DUP3
    DUP9
    MULFP254
    DUP15
    DUP8
    MULFP254
    ADDFP254
    DUP12
    DUP9
    MULFP254
    DUP15
    DUP8
    MULFP254
    ADDFP254
    SUBFP254
    // stack: CD01, C2D2, C2D2_, d0_, d2_, C: 6, d0, E0_, d1, d1_, d2, E0

    DUP3
    DUP3
    // stack: C2D2 , C2D2_ , CD01, C2D2, C2D2_, ...
    PUSH 9
    MULFP254
    SUBFP254
    ADDFP254
    // stack: E1 = 9C2D2 - C2D2_ + CD01, C2D2, C2D2_, ...

    // Park E1 in the slot of d1 (d1 comes up), then bring C2D2_ to the top.
    SWAP13
    SWAP2
    // stack: C2D2_, C2D2, d1, d0_, d2_, C: 6, d0, E0_, E1, d1_, d2, E0
    PUSH 9
    MULFP254
    ADDFP254
    // stack: 9C2D2_ + C2D2, d1, d0_, d2_, C: 6, d0, E0_, E1, d1_, d2, E0

    // make CD01_ = c0d1_ + c0_d1 + c1d0_ + c1_d0
    DUP11
    DUP9
    MULFP254
    DUP4
    DUP9
    MULFP254
    ADDFP254
    DUP3
    DUP8
    MULFP254
    ADDFP254
    DUP15
    DUP7
    MULFP254
    ADDFP254
    // stack: CD01_ , 9C2D2_ + C2D2, ...
    ADDFP254
    // stack: E1_ = CD01_ + 9C2D2_ + C2D2, ...

    // Park E1_ in the slot of d1_, which comes to the top.
    SWAP13
    // stack: d1_, d1, d0_, d2_, c0, c0_, c1, c1_, c2, c2_, d0, E0_, E1, E1_, d2, E0

    /// E2 = C0D2 + C1D1 + C2D0
    ///
    /// C0D2 = (c0d2 - c0_d2_) + (c0d2_ + c0_d2)i
    /// C1D1 = (c1d1 - c1_d1_) + (c1d1_ + c1_d1)i
    /// C2D0 = (c2d0 - c2_d0_) + (c2d0_ + c2_d0)i
    ///
    /// E2  = c0d2 + c1d1 + c2d0 - (c0_d2_ + c1_d1_ + c2_d0_)
    /// E2_ = c0d2_ + c0_d2 + c1d1_ + c1_d1 + c2d0_ + c2_d0

    // make c0_d2_ + c1_d1_ + c2_d0_
    DUP3
    DUP11
    MULFP254
    DUP2
    DUP10
    MULFP254
    ADDFP254
    DUP5
    DUP8
    MULFP254
    ADDFP254

    // make c0d2 + c1d1 + c2d0
    DUP16
    DUP7
    MULFP254
    DUP4
    DUP10
    MULFP254
    ADDFP254
    DUP13
    DUP12
    MULFP254
    ADDFP254
    // stack: c0d2 + c1d1 + c2d0 , c0_d2_ + c1_d1_ + c2_d0_, ...
    SUBFP254
    // stack: E2 = c0d2 + c1d1 + c2d0 - (c0_d2_ + c1_d1_ + c2_d0_), ...

    // Park E2 in the slot of d2, which comes to the top.
    SWAP15
    // stack: d2, d1_, d1, d0_, d2_, c0, c0_, c1, c1_, c2, c2_, d0, E0_, E1, E1_, E2, E0

    // make c0d2_ + c0_d2 + c1d1_ + c1_d1 + c2d0_ + c2_d0
    // This is the last use of every input word, so they are consumed in place
    // with SWAPs instead of being copied with DUPs.
    SWAP7
    MULFP254
    // stack: c1d1_, d1, d0_, d2_, c0, c0_, d2, c1_, c2, c2_, d0, ...
    SWAP7
    MULFP254
    // stack: c1_d1, d0_, d2_, c0, c0_, d2, c1d1_, c2, c2_, d0, ...
    SWAP7
    MULFP254
    // stack: c2d0_, d2_, c0, c0_, d2, c1d1_, c1_d1, c2_, d0, ...
    SWAP2
    MULFP254
    ADDFP254
    // stack: c0d2_ + c2d0_, c0_, d2, c1d1_, c1_d1, c2_, d0, ...
    SWAP2
    MULFP254
    ADDFP254
    ADDFP254
    ADDFP254
    // stack: c0d2_ + c0_d2 + c1d1_ + c1_d1 + c2d0_, c2_, d0, E0_, E1, E1_, E2, E0
    SWAP2
    MULFP254
    ADDFP254
    // stack: E2_ = c0d2_ + c0_d2 + c1d1_ + c1_d1 + c2d0_ + c2_d0, E0_, E1, E1_, E2, E0

    // Exchange E2_ with E0 to reach the final order.
    SWAP5
    // stack: E0, E0_, E1, E1_, E2, E2_
%endmacro