Merge bitcoin-core/secp256k1#1000: Synthetic int128 type.

a340d9500a ci: add int128_struct tests (Jonas Nick)
dceaa1f579 int128: Tidy #includes of int128.h and int128_impl.h (Tim Ruffing)
2914bccbc0 Simulated int128 type. (Russell O'Connor)

Pull request description:

  Abstracts the int128 type and provides a native version, if available, or implements it using a pair of int64_t's.

  This is activated by setting the configuration flag `--with-test-override-wide-multiply=int128_struct`.

  The primary purpose of this PR is to take advantage of MSVC's [umulh](https://docs.microsoft.com/en-us/cpp/intrinsics/umulh?view=msvc-170) intrinsic, which we can use to simulate the int128 type that MSVC does not have (AFAIU). This PR lays the groundwork for this level of MSVC support, but doesn't yet include the configuration logic to enable it.
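  To sketch the idea (the real implementation lands in src/int128_struct_impl.h below; this fragment and its names are illustrative only):

      #include <stdint.h>
      #include <intrin.h>

      /* A 128-bit product kept as two 64-bit limbs. On x64 MSVC, _umul128
       * returns the low limb and writes the high limb through a pointer; on
       * ARM64 MSVC only __umulh exists, so the low limb comes from an
       * ordinary truncating 64-bit multiplication. */
      typedef struct { uint64_t lo, hi; } u128_sketch;

      static u128_sketch mul_64x64(uint64_t a, uint64_t b) {
          u128_sketch r;
      #if defined(_M_X64)
          r.lo = _umul128(a, b, &r.hi);
      #else /* e.g. _M_ARM64 */
          r.hi = __umulh(a, b);
          r.lo = a * b; /* low 64 bits, wraps modulo 2^64 */
      #endif
          return r;
      }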

  For completeness, implementations of `umulh` and `mulh` are also provided for compilers that support neither the intrinsic nor the int128 type (such as CompCert?).  This also opens up the possibility of removing the 32-bit field and scalar implementations, should that ever be desired.
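  The portable fallback rests on splitting each operand into 32-bit halves: with a = a1*2^32 + a0 and b = b1*2^32 + b0, we have a*b = a1*b1*2^64 + (a1*b0 + a0*b1)*2^32 + a0*b0, and only the carries need gathering. A self-contained check of that identity against a native __int128 where one is available (the helper name here is hypothetical; the PR's version is `secp256k1_umul128` in src/int128_struct_impl.h):

      #include <assert.h>
      #include <stdint.h>

      static uint64_t umul128_portable(uint64_t a, uint64_t b, uint64_t *hi) {
          uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;  /* a0*b0 */
          uint64_t lh = (uint32_t)a * (b >> 32);              /* a0*b1 */
          uint64_t hl = (a >> 32) * (uint32_t)b;              /* a1*b0 */
          uint64_t hh = (a >> 32) * (b >> 32);                /* a1*b1 */
          /* Partial sum at weight 2^32; at most 3*(2^32-1), so it fits in 64 bits. */
          uint64_t mid34 = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
          *hi = hh + (lh >> 32) + (hl >> 32) + (mid34 >> 32);
          return (mid34 << 32) + (uint32_t)ll;
      }

      int main(void) {
      #if defined(__SIZEOF_INT128__)
          uint64_t a = 0xDEADBEEFCAFEBABEULL, b = 0x0123456789ABCDEFULL, hi;
          unsigned __int128 want = (unsigned __int128)a * b;
          uint64_t lo = umul128_portable(a, b, &hi);
          assert(lo == (uint64_t)want && hi == (uint64_t)(want >> 64));
      #endif
          return 0;
      }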

ACKs for top commit:
  sipa:
    ACK a340d9500a
  jonasnick:
    ACK a340d9500a

Tree-SHA512: b4f2853fa3ab60ce9d77b4eaee1fd20c4b612850e19fcb3179d7e36986f420c6c4589ff72f0cf844f989584ace49a1cd23cca3f4e405dabefc8da647a0df679d
Tim Ruffing 2022-11-16 14:28:45 -05:00
commit ddf2b2910e
18 changed files with 814 additions and 297 deletions

.cirrus.yml

@ -68,6 +68,7 @@ task:
- env: {WIDEMUL: int64, RECOVERY: yes}
- env: {WIDEMUL: int64, ECDH: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128}
- env: {WIDEMUL: int128_struct}
- env: {WIDEMUL: int128, RECOVERY: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128, ECDH: yes, SCHNORRSIG: yes}
- env: {WIDEMUL: int128, ASM: x86_64}
@ -271,20 +272,22 @@ task:
EXPERIMENTAL: yes
SCHNORRSIG: yes
CTIMETEST: no
# Use a MinGW-w64 host to tell ./configure we're building for Windows.
# This will detect some MinGW-w64 tools but then make will need only
# the MSVC tools CC, AR and NM as specified below.
HOST: x86_64-w64-mingw32
CC: /opt/msvc/bin/x64/cl
AR: /opt/msvc/bin/x64/lib
NM: /opt/msvc/bin/x64/dumpbin -symbols -headers
# Set non-essential options that affect the CLI messages here.
# (They depend on the user's taste, so we don't want to set them automatically in configure.ac.)
CFLAGS: -nologo -diagnostics:caret
LDFLAGS: -XCClinker -nologo -XCClinker -diagnostics:caret
# Use a MinGW-w64 host to tell ./configure we're building for Windows.
# This will detect some MinGW-w64 tools but then make will need only
# the MSVC tools CC, AR and NM as specified below.
matrix:
- name: "x86_64 (MSVC): Windows (Debian stable, Wine)"
- name: "x86_64 (MSVC): Windows (Debian stable, Wine, int128_struct)"
env:
HOST: x86_64-w64-mingw32
CC: /opt/msvc/bin/x64/cl
AR: /opt/msvc/bin/x64/lib
NM: /opt/msvc/bin/x64/dumpbin -symbols -headers
WIDEMUL: int128_struct
- name: "i686 (MSVC): Windows (Debian stable, Wine)"
env:
HOST: i686-w64-mingw32

Makefile.am

@ -48,6 +48,12 @@ noinst_HEADERS += src/precomputed_ecmult.h
noinst_HEADERS += src/precomputed_ecmult_gen.h
noinst_HEADERS += src/assumptions.h
noinst_HEADERS += src/util.h
noinst_HEADERS += src/int128.h
noinst_HEADERS += src/int128_impl.h
noinst_HEADERS += src/int128_native.h
noinst_HEADERS += src/int128_native_impl.h
noinst_HEADERS += src/int128_struct.h
noinst_HEADERS += src/int128_struct_impl.h
noinst_HEADERS += src/scratch.h
noinst_HEADERS += src/scratch_impl.h
noinst_HEADERS += src/selftest.h

configure.ac

@ -175,7 +175,11 @@ AC_ARG_ENABLE(external_default_callbacks,
[SECP_SET_DEFAULT([enable_external_default_callbacks], [no], [no])])
# Test-only override of the (autodetected by the C code) "widemul" setting.
# Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default).
# Legal values are:
# * int64 (for [u]int64_t),
# * int128 (for [unsigned] __int128),
# * int128_struct (for int128 implemented as a structure),
# * and auto (the default).
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto])
AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto],
@ -285,6 +289,9 @@ fi
# Select wide multiplication implementation
case $set_widemul in
int128_struct)
AC_DEFINE(USE_FORCE_WIDEMUL_INT128_STRUCT, 1, [Define this symbol to force the use of the structure for simulating (unsigned) int128 based wide multiplication])
;;
int128)
AC_DEFINE(USE_FORCE_WIDEMUL_INT128, 1, [Define this symbol to force the use of the (unsigned) __int128 based wide multiplication implementation])
;;

src/assumptions.h

@ -10,6 +10,9 @@
#include <limits.h>
#include "util.h"
#if defined(SECP256K1_INT128_NATIVE)
#include "int128_native.h"
#endif
/* This library, like most software, relies on a number of compiler implementation defined (but not undefined)
behaviours. Although the behaviours we require are essentially universal we test them specifically here to
@ -55,7 +58,7 @@ struct secp256k1_assumption_checker {
/* To int64_t. */
((int64_t)(uint64_t)0xB123C456D789E012ULL == (int64_t)-(int64_t)0x4EDC3BA928761FEEULL) &&
#if defined(SECP256K1_WIDEMUL_INT128)
#if defined(SECP256K1_INT128_NATIVE)
((int64_t)(((uint128_t)0xA1234567B8901234ULL << 64) + 0xC5678901D2345678ULL) == (int64_t)-(int64_t)0x3A9876FE2DCBA988ULL) &&
(((int64_t)(int128_t)(((uint128_t)0xB1C2D3E4F5A6B7C8ULL << 64) + 0xD9E0F1A2B3C4D5E6ULL)) == (int64_t)(uint64_t)0xD9E0F1A2B3C4D5E6ULL) &&
(((int64_t)(int128_t)(((uint128_t)0xABCDEF0123456789ULL << 64) + 0x0123456789ABCDEFULL)) == (int64_t)(uint64_t)0x0123456789ABCDEFULL) &&
@ -71,7 +74,7 @@ struct secp256k1_assumption_checker {
((((int16_t)0xE9AC) >> 4) == (int16_t)(uint16_t)0xFE9A) &&
((((int32_t)0x937C918A) >> 9) == (int32_t)(uint32_t)0xFFC9BE48) &&
((((int64_t)0xA8B72231DF9CF4B9ULL) >> 19) == (int64_t)(uint64_t)0xFFFFF516E4463BF3ULL) &&
#if defined(SECP256K1_WIDEMUL_INT128)
#if defined(SECP256K1_INT128_NATIVE)
((((int128_t)(((uint128_t)0xCD833A65684A0DBCULL << 64) + 0xB349312F71EA7637ULL)) >> 39) == (int128_t)(((uint128_t)0xFFFFFFFFFF9B0674ULL << 64) + 0xCAD0941B79669262ULL)) &&
#endif
1) * 2 - 1];
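Aside (not part of the diff): the enclosing declaration is a C89-style static assertion. All assumptions are &&-ed into one boolean expression, and the struct member is an array of size expr * 2 - 1, i.e. size 1 when every assumption holds and the ill-formed size -1 otherwise. A standalone sketch of the trick, with a hypothetical name:

    /* Compiles only if cond is nonzero: array size 1 if true, -1 if false. */
    #define ASSERT_SKETCH(cond) struct { int check[(cond) * 2 - 1]; }
    typedef ASSERT_SKETCH(sizeof(int) >= 2) assert_ok;  /* accepted */
    /* typedef ASSERT_SKETCH(0) assert_bad;                rejected */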

src/field_5x52_int128_impl.h

@ -9,14 +9,18 @@
#include <stdint.h>
#include "int128.h"
#ifdef VERIFY
#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0)
#define VERIFY_BITS_128(x, n) VERIFY_CHECK(secp256k1_u128_check_bits((x), (n)))
#else
#define VERIFY_BITS(x, n) do { } while(0)
#define VERIFY_BITS_128(x, n) do { } while(0)
#endif
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
uint128_t c, d;
secp256k1_uint128 c, d;
uint64_t t3, t4, tx, u0;
uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
@ -40,121 +44,119 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
* Note that [x 0 0 0 0 0] = [x*R].
*/
d = (uint128_t)a0 * b[3]
+ (uint128_t)a1 * b[2]
+ (uint128_t)a2 * b[1]
+ (uint128_t)a3 * b[0];
VERIFY_BITS(d, 114);
secp256k1_u128_mul(&d, a0, b[3]);
secp256k1_u128_accum_mul(&d, a1, b[2]);
secp256k1_u128_accum_mul(&d, a2, b[1]);
secp256k1_u128_accum_mul(&d, a3, b[0]);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 0] = [p3 0 0 0] */
c = (uint128_t)a4 * b[4];
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a4, b[4]);
VERIFY_BITS_128(&c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 48);
secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
VERIFY_BITS_128(&d, 115);
VERIFY_BITS_128(&c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)a0 * b[4]
+ (uint128_t)a1 * b[3]
+ (uint128_t)a2 * b[2]
+ (uint128_t)a3 * b[1]
+ (uint128_t)a4 * b[0];
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a0, b[4]);
secp256k1_u128_accum_mul(&d, a1, b[3]);
secp256k1_u128_accum_mul(&d, a2, b[2]);
secp256k1_u128_accum_mul(&d, a3, b[1]);
secp256k1_u128_accum_mul(&d, a4, b[0]);
VERIFY_BITS_128(&d, 115);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
VERIFY_BITS_128(&d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
VERIFY_BITS_128(&d, 64);
/* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
/* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
c = (uint128_t)a0 * b[0];
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a0, b[0]);
VERIFY_BITS_128(&c, 112);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
d += (uint128_t)a1 * b[4]
+ (uint128_t)a2 * b[3]
+ (uint128_t)a3 * b[2]
+ (uint128_t)a4 * b[1];
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a1, b[4]);
secp256k1_u128_accum_mul(&d, a2, b[3]);
secp256k1_u128_accum_mul(&d, a3, b[2]);
secp256k1_u128_accum_mul(&d, a4, b[1]);
VERIFY_BITS_128(&d, 115);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = d & M; d >>= 52;
u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
/* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
/* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)u0 * (R >> 4);
VERIFY_BITS(c, 115);
secp256k1_u128_accum_mul(&c, u0, R >> 4);
VERIFY_BITS_128(&c, 115);
/* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
r[0] = c & M; c >>= 52;
r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[0], 52);
VERIFY_BITS(c, 61);
VERIFY_BITS_128(&c, 61);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)a0 * b[1]
+ (uint128_t)a1 * b[0];
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, b[1]);
secp256k1_u128_accum_mul(&c, a1, b[0]);
VERIFY_BITS_128(&c, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
d += (uint128_t)a2 * b[4]
+ (uint128_t)a3 * b[3]
+ (uint128_t)a4 * b[2];
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a2, b[4]);
secp256k1_u128_accum_mul(&d, a3, b[3]);
secp256k1_u128_accum_mul(&d, a4, b[2]);
VERIFY_BITS_128(&d, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 62);
/* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
r[1] = c & M; c >>= 52;
r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[1], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (uint128_t)a0 * b[2]
+ (uint128_t)a1 * b[1]
+ (uint128_t)a2 * b[0];
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, b[2]);
secp256k1_u128_accum_mul(&c, a1, b[1]);
secp256k1_u128_accum_mul(&c, a2, b[0]);
VERIFY_BITS_128(&c, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
d += (uint128_t)a3 * b[4]
+ (uint128_t)a4 * b[3];
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a3, b[4]);
secp256k1_u128_accum_mul(&d, a4, b[3]);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 50);
secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
secp256k1_u128_accum_u64(&c, t3);
VERIFY_BITS_128(&c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
VERIFY_BITS_128(&c, 48);
/* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += t4;
VERIFY_BITS(c, 49);
/* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[4] = c;
r[4] = secp256k1_u128_to_u64(&c) + t4;
VERIFY_BITS(r[4], 49);
/* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
}
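Aside on the new tail (my annotation, not part of the diff): the old code folded t4 into the 128-bit accumulator before the final store, while the rewrite adds it in plain 64-bit arithmetic. That is safe because at this point c has at most 48 bits and t4 (masked to M >> 4) at most 48 bits, so the sum cannot wrap a uint64_t; the same simplification appears in secp256k1_fe_sqr_inner below.

    /* Bound sketch: c < 2^48 (VERIFY_BITS_128(&c, 48)) and t4 < 2^48, so
     * secp256k1_u128_to_u64(&c) + t4 < 2^49 <= 2^64: no 64-bit overflow. */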
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
uint128_t c, d;
secp256k1_uint128 c, d;
uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
int64_t t3, t4, tx, u0;
const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
@ -170,107 +172,105 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
* Note that [x 0 0 0 0 0] = [x*R].
*/
d = (uint128_t)(a0*2) * a3
+ (uint128_t)(a1*2) * a2;
VERIFY_BITS(d, 114);
secp256k1_u128_mul(&d, a0*2, a3);
secp256k1_u128_accum_mul(&d, a1*2, a2);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 0] = [p3 0 0 0] */
c = (uint128_t)a4 * a4;
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a4, a4);
VERIFY_BITS_128(&c, 112);
/* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
d += (uint128_t)R * (uint64_t)c; c >>= 64;
VERIFY_BITS(d, 115);
VERIFY_BITS(c, 48);
secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
VERIFY_BITS_128(&d, 115);
VERIFY_BITS_128(&c, 48);
/* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
t3 = d & M; d >>= 52;
t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t3, 52);
VERIFY_BITS(d, 63);
VERIFY_BITS_128(&d, 63);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
a4 *= 2;
d += (uint128_t)a0 * a4
+ (uint128_t)(a1*2) * a3
+ (uint128_t)a2 * a2;
VERIFY_BITS(d, 115);
secp256k1_u128_accum_mul(&d, a0, a4);
secp256k1_u128_accum_mul(&d, a1*2, a3);
secp256k1_u128_accum_mul(&d, a2, a2);
VERIFY_BITS_128(&d, 115);
/* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
d += (uint128_t)(R << 12) * (uint64_t)c;
VERIFY_BITS(d, 116);
secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
VERIFY_BITS_128(&d, 116);
/* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
t4 = d & M; d >>= 52;
t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(t4, 52);
VERIFY_BITS(d, 64);
VERIFY_BITS_128(&d, 64);
/* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
tx = (t4 >> 48); t4 &= (M >> 4);
VERIFY_BITS(tx, 4);
VERIFY_BITS(t4, 48);
/* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
c = (uint128_t)a0 * a0;
VERIFY_BITS(c, 112);
secp256k1_u128_mul(&c, a0, a0);
VERIFY_BITS_128(&c, 112);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
d += (uint128_t)a1 * a4
+ (uint128_t)(a2*2) * a3;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a1, a4);
secp256k1_u128_accum_mul(&d, a2*2, a3);
VERIFY_BITS_128(&d, 114);
/* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = d & M; d >>= 52;
u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
VERIFY_BITS(u0, 52);
VERIFY_BITS(d, 62);
VERIFY_BITS_128(&d, 62);
/* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
/* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
u0 = (u0 << 4) | tx;
VERIFY_BITS(u0, 56);
/* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
c += (uint128_t)u0 * (R >> 4);
VERIFY_BITS(c, 113);
secp256k1_u128_accum_mul(&c, u0, R >> 4);
VERIFY_BITS_128(&c, 113);
/* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
r[0] = c & M; c >>= 52;
r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[0], 52);
VERIFY_BITS(c, 61);
VERIFY_BITS_128(&c, 61);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
a0 *= 2;
c += (uint128_t)a0 * a1;
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, a1);
VERIFY_BITS_128(&c, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
d += (uint128_t)a2 * a4
+ (uint128_t)a3 * a3;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a2, a4);
secp256k1_u128_accum_mul(&d, a3, a3);
VERIFY_BITS_128(&d, 114);
/* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (d & M) * R; d >>= 52;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 62);
secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 62);
/* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
r[1] = c & M; c >>= 52;
r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[1], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
c += (uint128_t)a0 * a2
+ (uint128_t)a1 * a1;
VERIFY_BITS(c, 114);
secp256k1_u128_accum_mul(&c, a0, a2);
secp256k1_u128_accum_mul(&c, a1, a1);
VERIFY_BITS_128(&c, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
d += (uint128_t)a3 * a4;
VERIFY_BITS(d, 114);
secp256k1_u128_accum_mul(&d, a3, a4);
VERIFY_BITS_128(&d, 114);
/* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)R * (uint64_t)d; d >>= 64;
VERIFY_BITS(c, 115);
VERIFY_BITS(d, 50);
secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
VERIFY_BITS_128(&c, 115);
VERIFY_BITS_128(&d, 50);
/* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[2] = c & M; c >>= 52;
r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[2], 52);
VERIFY_BITS(c, 63);
VERIFY_BITS_128(&c, 63);
/* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += (uint128_t)(R << 12) * (uint64_t)d + t3;
VERIFY_BITS(c, 100);
secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
secp256k1_u128_accum_u64(&c, t3);
VERIFY_BITS_128(&c, 100);
/* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[3] = c & M; c >>= 52;
r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
VERIFY_BITS(r[3], 52);
VERIFY_BITS(c, 48);
VERIFY_BITS_128(&c, 48);
/* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
c += t4;
VERIFY_BITS(c, 49);
/* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
r[4] = c;
r[4] = secp256k1_u128_to_u64(&c) + t4;
VERIFY_BITS(r[4], 49);
/* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
}

src/int128.h (new file, 79 lines)

@ -0,0 +1,79 @@
#ifndef SECP256K1_INT128_H
#define SECP256K1_INT128_H
#include "util.h"
#if defined(SECP256K1_WIDEMUL_INT128)
# if defined(SECP256K1_INT128_NATIVE)
# include "int128_native.h"
# elif defined(SECP256K1_INT128_STRUCT)
# include "int128_struct.h"
# else
# error "Please select int128 implementation"
# endif
/* Multiply two unsigned 64-bit values a and b and write the result to r. */
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b);
/* Multiply two unsigned 64-bit values a and b and add the result to r.
* The final result is taken modulo 2^128.
*/
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b);
/* Add an unsigned 64-bit value a to r.
* The final result is taken modulo 2^128.
*/
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a);
/* Unsigned (logical) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n);
/* Return the low 64-bits of a 128-bit value as an unsigned 64-bit value. */
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a);
/* Return the high 64-bits of a 128-bit value as an unsigned 64-bit value. */
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a);
/* Write an unsigned 64-bit value to r. */
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a);
/* Tests if r is strictly less than 2^n.
* n must be strictly less than 128.
*/
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n);
/* Multiply two signed 64-bit values a and b and write the result to r. */
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b);
/* Multiply two signed 64-bit values a and b and add the result to r.
* Overflow or underflow from the addition is undefined behaviour.
*/
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b);
/* Compute a*d - b*c from signed 64-bit values and write the result to r. */
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d);
/* Signed (arithmetic) right shift.
* Non-constant time in b.
*/
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int b);
/* Return the low 64-bits of a 128-bit value interpreted as a signed 64-bit value. */
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a);
/* Write a signed 64-bit value to r. */
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a);
/* Compare two 128-bit values for equality. */
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b);
/* Tests if r is equal to 2^n.
* n must be strictly less than 127.
*/
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n);
#endif
#endif
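A usage sketch of this API, in the accumulate-and-extract pattern used by the field code above (the function name is hypothetical and assumes the library's internal headers are available):

    #include "int128_impl.h" /* selects the native or struct implementation */

    static uint64_t low52_of_dot2(uint64_t a0, uint64_t b0, uint64_t a1, uint64_t b1) {
        const uint64_t M52 = 0xFFFFFFFFFFFFFULL;
        secp256k1_uint128 acc;
        secp256k1_u128_mul(&acc, a0, b0);         /* acc  = a0*b0             */
        secp256k1_u128_accum_mul(&acc, a1, b1);   /* acc += a1*b1 (mod 2^128) */
        return secp256k1_u128_to_u64(&acc) & M52; /* low 52 bits; callers then
                                                     secp256k1_u128_rshift(&acc, 52) */
    }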

src/int128_impl.h (new file, 18 lines)

@ -0,0 +1,18 @@
#ifndef SECP256K1_INT128_IMPL_H
#define SECP256K1_INT128_IMPL_H
#include "util.h"
#include "int128.h"
#if defined(SECP256K1_WIDEMUL_INT128)
# if defined(SECP256K1_INT128_NATIVE)
# include "int128_native_impl.h"
# elif defined(SECP256K1_INT128_STRUCT)
# include "int128_struct_impl.h"
# else
# error "Please select int128 implementation"
# endif
#endif
#endif

src/int128_native.h (new file, 19 lines)

@ -0,0 +1,19 @@
#ifndef SECP256K1_INT128_NATIVE_H
#define SECP256K1_INT128_NATIVE_H
#include <stdint.h>
#include "util.h"
#if !defined(UINT128_MAX) && defined(__SIZEOF_INT128__)
SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
SECP256K1_GNUC_EXT typedef __int128 int128_t;
# define UINT128_MAX ((uint128_t)(-1))
# define INT128_MAX ((int128_t)(UINT128_MAX >> 1))
# define INT128_MIN (-INT128_MAX - 1)
/* No (U)INT128_C macros because compilers providing __int128 do not support 128-bit literals. */
#endif
typedef uint128_t secp256k1_uint128;
typedef int128_t secp256k1_int128;
#endif

src/int128_native_impl.h (new file, 79 lines)

@ -0,0 +1,79 @@
#ifndef SECP256K1_INT128_NATIVE_IMPL_H
#define SECP256K1_INT128_NATIVE_IMPL_H
#include "int128.h"
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
*r = (uint128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
*r += (uint128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a) {
*r += a;
}
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
*r >>= n;
}
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a) {
return (uint64_t)(*a);
}
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a) {
return (uint64_t)(*a >> 64);
}
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a) {
*r = a;
}
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
return (*r >> n == 0);
}
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
*r = (int128_t)a * b;
}
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int128_t ab = (int128_t)a * b;
VERIFY_CHECK(0 <= ab ? *r <= INT128_MAX - ab : INT128_MIN - ab <= *r);
*r += ab;
}
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d) {
int128_t ad = (int128_t)a * d;
int128_t bc = (int128_t)b * c;
VERIFY_CHECK(0 <= bc ? INT128_MIN + bc <= ad : ad <= INT128_MAX + bc);
*r = ad - bc;
}
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
*r >>= n;
}
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) {
return *a;
}
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) {
*r = a;
}
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b) {
return *a == *b;
}
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 127);
return (*r == (int128_t)1 << n);
}
#endif

src/int128_struct.h (new file, 14 lines)

@ -0,0 +1,14 @@
#ifndef SECP256K1_INT128_STRUCT_H
#define SECP256K1_INT128_STRUCT_H
#include <stdint.h>
#include "util.h"
typedef struct {
uint64_t lo;
uint64_t hi;
} secp256k1_uint128;
typedef secp256k1_uint128 secp256k1_int128;
#endif
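The encoding is value = hi*2^64 + lo; the signed type reuses the same two limbs, with hi holding the two's-complement high word (secp256k1_i128_from_i64 below sign-extends via a >> 63). Example encodings, as an illustration:

    /* value = hi*2^64 + lo, two's complement for the signed view. */
    static const secp256k1_int128 minus_one = { 0xFFFFFFFFFFFFFFFFULL,   /* lo */
                                                0xFFFFFFFFFFFFFFFFULL }; /* hi */
    static const secp256k1_uint128 two_pow_64 = { 0, 1 }; /* lo = 0, hi = 1 */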

src/int128_struct_impl.h (new file, 177 lines)

@ -0,0 +1,177 @@
#ifndef SECP256K1_INT128_STRUCT_IMPL_H
#define SECP256K1_INT128_STRUCT_IMPL_H
#include "int128.h"
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) /* MSVC */
# include <intrin.h>
# if defined(_M_X64)
/* On x86_64 MSVC, use native _(u)mul128 for 64x64->128 multiplications. */
# define secp256k1_umul128 _umul128
# define secp256k1_mul128 _mul128
# else
/* On ARM64 MSVC, use __(u)mulh for the upper half of 64x64 multiplications. */
static SECP256K1_INLINE uint64_t secp256k1_umul128(uint64_t a, uint64_t b, uint64_t* hi) {
*hi = __umulh(a, b);
return a * b;
}
static SECP256K1_INLINE int64_t secp256k1_mul128(int64_t a, int64_t b, int64_t* hi) {
*hi = __mulh(a, b);
return a * b;
}
# endif
#else
/* On other systems, emulate 64x64->128 multiplications using 32x32->64 multiplications. */
static SECP256K1_INLINE uint64_t secp256k1_umul128(uint64_t a, uint64_t b, uint64_t* hi) {
uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;
uint64_t lh = (uint32_t)a * (b >> 32);
uint64_t hl = (a >> 32) * (uint32_t)b;
uint64_t hh = (a >> 32) * (b >> 32);
uint64_t mid34 = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
*hi = hh + (lh >> 32) + (hl >> 32) + (mid34 >> 32);
return (mid34 << 32) + (uint32_t)ll;
}
static SECP256K1_INLINE int64_t secp256k1_mul128(int64_t a, int64_t b, int64_t* hi) {
uint64_t ll = (uint64_t)(uint32_t)a * (uint32_t)b;
int64_t lh = (uint32_t)a * (b >> 32);
int64_t hl = (a >> 32) * (uint32_t)b;
int64_t hh = (a >> 32) * (b >> 32);
uint64_t mid34 = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
*hi = hh + (lh >> 32) + (hl >> 32) + (mid34 >> 32);
return (mid34 << 32) + (uint32_t)ll;
}
#endif
static SECP256K1_INLINE void secp256k1_u128_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
r->lo = secp256k1_umul128(a, b, &r->hi);
}
static SECP256K1_INLINE void secp256k1_u128_accum_mul(secp256k1_uint128 *r, uint64_t a, uint64_t b) {
uint64_t lo, hi;
lo = secp256k1_umul128(a, b, &hi);
r->lo += lo;
r->hi += hi + (r->lo < lo);
}
static SECP256K1_INLINE void secp256k1_u128_accum_u64(secp256k1_uint128 *r, uint64_t a) {
r->lo += a;
r->hi += r->lo < a;
}
/* Unsigned (logical) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_u128_rshift(secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
if (n >= 64) {
r->lo = r->hi >> (n-64);
r->hi = 0;
} else if (n > 0) {
r->lo = ((1U * r->hi) << (64-n)) | r->lo >> n;
r->hi >>= n;
}
}
static SECP256K1_INLINE uint64_t secp256k1_u128_to_u64(const secp256k1_uint128 *a) {
return a->lo;
}
static SECP256K1_INLINE uint64_t secp256k1_u128_hi_u64(const secp256k1_uint128 *a) {
return a->hi;
}
static SECP256K1_INLINE void secp256k1_u128_from_u64(secp256k1_uint128 *r, uint64_t a) {
r->hi = 0;
r->lo = a;
}
static SECP256K1_INLINE int secp256k1_u128_check_bits(const secp256k1_uint128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
return n >= 64 ? r->hi >> (n - 64) == 0
: r->hi == 0 && r->lo >> n == 0;
}
static SECP256K1_INLINE void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
r->lo = (uint64_t)secp256k1_mul128(a, b, &hi);
r->hi = (uint64_t)hi;
}
static SECP256K1_INLINE void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
uint64_t lo = (uint64_t)secp256k1_mul128(a, b, &hi);
r->lo += lo;
hi += r->lo < lo;
/* Verify no overflow.
* If r represents a positive value (the sign bit is not set) and the value we are adding is a positive value (the sign bit is not set),
* then we require that the resulting value also be positive (the sign bit is not set).
* Note that (X <= Y) means (X implies Y) when X and Y are boolean values (i.e. 0 or 1).
*/
VERIFY_CHECK((r->hi <= 0x7fffffffffffffffu && (uint64_t)hi <= 0x7fffffffffffffffu) <= (r->hi + (uint64_t)hi <= 0x7fffffffffffffffu));
/* Verify no underflow.
* If r represents a negative value (the sign bit is set) and the value we are adding is a negative value (the sign bit is set),
* then we require that the resulting value also be negative (the sign bit is set).
*/
VERIFY_CHECK((r->hi > 0x7fffffffffffffffu && (uint64_t)hi > 0x7fffffffffffffffu) <= (r->hi + (uint64_t)hi > 0x7fffffffffffffffu));
r->hi += hi;
}
static SECP256K1_INLINE void secp256k1_i128_dissip_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
int64_t hi;
uint64_t lo = (uint64_t)secp256k1_mul128(a, b, &hi);
hi += r->lo < lo;
/* Verify no overflow.
* If r represents a positive value (the sign bit is not set) and the value we are subtracting is a negative value (the sign bit is set),
* then we require that the resulting value also be positive (the sign bit is not set).
*/
VERIFY_CHECK((r->hi <= 0x7fffffffffffffffu && (uint64_t)hi > 0x7fffffffffffffffu) <= (r->hi - (uint64_t)hi <= 0x7fffffffffffffffu));
/* Verify no underflow.
* If r represents a negative value (the sign bit is set) and the value we are subtracting is a positive value (the sign bit is not set),
* then we require that the resulting value also be negative (the sign bit is set).
*/
VERIFY_CHECK((r->hi > 0x7fffffffffffffffu && (uint64_t)hi <= 0x7fffffffffffffffu) <= (r->hi - (uint64_t)hi > 0x7fffffffffffffffu));
r->hi -= hi;
r->lo -= lo;
}
static SECP256K1_INLINE void secp256k1_i128_det(secp256k1_int128 *r, int64_t a, int64_t b, int64_t c, int64_t d) {
secp256k1_i128_mul(r, a, d);
secp256k1_i128_dissip_mul(r, b, c);
}
/* Signed (arithmetic) right shift.
* Non-constant time in n.
*/
static SECP256K1_INLINE void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 128);
if (n >= 64) {
r->lo = (uint64_t)((int64_t)(r->hi) >> (n-64));
r->hi = (uint64_t)((int64_t)(r->hi) >> 63);
} else if (n > 0) {
r->lo = ((1U * r->hi) << (64-n)) | r->lo >> n;
r->hi = (uint64_t)((int64_t)(r->hi) >> n);
}
}
static SECP256K1_INLINE int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) {
return (int64_t)a->lo;
}
static SECP256K1_INLINE void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) {
r->hi = (uint64_t)(a >> 63);
r->lo = (uint64_t)a;
}
static SECP256K1_INLINE int secp256k1_i128_eq_var(const secp256k1_int128 *a, const secp256k1_int128 *b) {
return a->hi == b->hi && a->lo == b->lo;
}
static SECP256K1_INLINE int secp256k1_i128_check_pow2(const secp256k1_int128 *r, unsigned int n) {
VERIFY_CHECK(n < 127);
return n >= 64 ? r->hi == (uint64_t)1 << (n - 64) && r->lo == 0
: r->hi == 0 && r->lo == (uint64_t)1 << n;
}
#endif

src/modinv64_impl.h

@ -7,10 +7,9 @@
#ifndef SECP256K1_MODINV64_IMPL_H
#define SECP256K1_MODINV64_IMPL_H
#include "int128.h"
#include "modinv64.h"
#include "util.h"
/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
* modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
*
@ -18,6 +17,15 @@
* implementation for N=62, using 62-bit signed limbs represented as int64_t.
*/
/* Data type for transition matrices (see section 3 of explanation).
*
* t = [ u v ]
* [ q r ]
*/
typedef struct {
int64_t u, v, q, r;
} secp256k1_modinv64_trans2x2;
#ifdef VERIFY
/* Helper function to compute the absolute value of an int64_t.
* (we don't use abs/labs/llabs as they depend on the int sizes). */
@ -32,15 +40,17 @@ static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
/* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
int128_t c = 0;
secp256k1_int128 c, d;
int i;
secp256k1_i128_from_i64(&c, 0);
for (i = 0; i < 4; ++i) {
if (i < alen) c += (int128_t)a->v[i] * factor;
r->v[i] = (int64_t)c & M62; c >>= 62;
if (i < alen) secp256k1_i128_accum_mul(&c, a->v[i], factor);
r->v[i] = secp256k1_i128_to_i64(&c) & M62; secp256k1_i128_rshift(&c, 62);
}
if (4 < alen) c += (int128_t)a->v[4] * factor;
VERIFY_CHECK(c == (int64_t)c);
r->v[4] = (int64_t)c;
if (4 < alen) secp256k1_i128_accum_mul(&c, a->v[4], factor);
secp256k1_i128_from_i64(&d, secp256k1_i128_to_i64(&c));
VERIFY_CHECK(secp256k1_i128_eq_var(&c, &d));
r->v[4] = secp256k1_i128_to_i64(&c);
}
/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
@ -60,6 +70,13 @@ static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, i
}
return 0;
}
/* Check if the determinant of t is equal to 1 << n. */
static int secp256k1_modinv64_det_check_pow2(const secp256k1_modinv64_trans2x2 *t, unsigned int n) {
secp256k1_int128 a;
secp256k1_i128_det(&a, t->u, t->v, t->q, t->r);
return secp256k1_i128_check_pow2(&a, n);
}
#endif
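For intuition, with hypothetical values: the matrix t = {u=2, v=0, q=0, r=1} has determinant u*r - v*q = 2 = 2^1, so the check accepts it for n = 1 (the helper is only compiled and used under VERIFY):

    static void det_example(void) {
        secp256k1_modinv64_trans2x2 t = { 2, 0, 0, 1 }; /* u, v, q, r */
        VERIFY_CHECK(secp256k1_modinv64_det_check_pow2(&t, 1)); /* det = 2^1 */
    }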
/* Take as input a signed62 number in range (-2*modulus,modulus), and add a multiple of the modulus
@ -136,15 +153,6 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6
#endif
}
/* Data type for transition matrices (see section 3 of explanation).
*
* t = [ u v ]
* [ q r ]
*/
typedef struct {
int64_t u, v, q, r;
} secp256k1_modinv64_trans2x2;
/* Compute the transition matrix and eta for 59 divsteps (where zeta=-(delta+1/2)).
* Note that the transformation matrix is scaled by 2^62 and not 2^59.
*
@ -203,13 +211,15 @@ static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_
t->v = (int64_t)v;
t->q = (int64_t)q;
t->r = (int64_t)r;
#ifdef VERIFY
/* The determinant of t must be a power of two. This guarantees that multiplication with t
* does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
* will be divided out again). As each divstep's individual matrix has determinant 2, the
* aggregate of 59 of them will have determinant 2^59. Multiplying with the initial
* 8*identity (which has determinant 2^6) means the overall output has determinant
* 2^65. */
VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 65);
VERIFY_CHECK(secp256k1_modinv64_det_check_pow2(t, 65));
#endif
return zeta;
}
@ -286,11 +296,13 @@ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint
t->v = (int64_t)v;
t->q = (int64_t)q;
t->r = (int64_t)r;
#ifdef VERIFY
/* The determinant of t must be a power of two. This guarantees that multiplication with t
* does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which
* will be divided out again). As each divstep's individual matrix has determinant 2, the
* aggregate of 62 of them will have determinant 2^62. */
VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 62);
VERIFY_CHECK(secp256k1_modinv64_det_check_pow2(t, 62));
#endif
return eta;
}
@ -307,7 +319,7 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4];
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int64_t md, me, sd, se;
int128_t cd, ce;
secp256k1_int128 cd, ce;
#ifdef VERIFY
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@ -324,54 +336,64 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
md = (u & sd) + (v & se);
me = (q & sd) + (r & se);
/* Begin computing t*[d,e]. */
cd = (int128_t)u * d0 + (int128_t)v * e0;
ce = (int128_t)q * d0 + (int128_t)r * e0;
secp256k1_i128_mul(&cd, u, d0);
secp256k1_i128_accum_mul(&cd, v, e0);
secp256k1_i128_mul(&ce, q, d0);
secp256k1_i128_accum_mul(&ce, r, e0);
/* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */
md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
md -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&cd) + md) & M62;
me -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&ce) + me) & M62;
/* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
cd += (int128_t)modinfo->modulus.v[0] * md;
ce += (int128_t)modinfo->modulus.v[0] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[0], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[0], me);
/* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62;
VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cd) & M62) == 0); secp256k1_i128_rshift(&cd, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&ce) & M62) == 0); secp256k1_i128_rshift(&ce, 62);
/* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
cd += (int128_t)u * d1 + (int128_t)v * e1;
ce += (int128_t)q * d1 + (int128_t)r * e1;
secp256k1_i128_accum_mul(&cd, u, d1);
secp256k1_i128_accum_mul(&cd, v, e1);
secp256k1_i128_accum_mul(&ce, q, d1);
secp256k1_i128_accum_mul(&ce, r, e1);
if (modinfo->modulus.v[1]) { /* Optimize for the case where this limb of the modulus is zero. */
cd += (int128_t)modinfo->modulus.v[1] * md;
ce += (int128_t)modinfo->modulus.v[1] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[1], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[1], me);
}
d->v[0] = (int64_t)cd & M62; cd >>= 62;
e->v[0] = (int64_t)ce & M62; ce >>= 62;
d->v[0] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[0] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */
cd += (int128_t)u * d2 + (int128_t)v * e2;
ce += (int128_t)q * d2 + (int128_t)r * e2;
secp256k1_i128_accum_mul(&cd, u, d2);
secp256k1_i128_accum_mul(&cd, v, e2);
secp256k1_i128_accum_mul(&ce, q, d2);
secp256k1_i128_accum_mul(&ce, r, e2);
if (modinfo->modulus.v[2]) { /* Optimize for the case where this limb of the modulus is zero. */
cd += (int128_t)modinfo->modulus.v[2] * md;
ce += (int128_t)modinfo->modulus.v[2] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[2], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[2], me);
}
d->v[1] = (int64_t)cd & M62; cd >>= 62;
e->v[1] = (int64_t)ce & M62; ce >>= 62;
d->v[1] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[1] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
cd += (int128_t)u * d3 + (int128_t)v * e3;
ce += (int128_t)q * d3 + (int128_t)r * e3;
secp256k1_i128_accum_mul(&cd, u, d3);
secp256k1_i128_accum_mul(&cd, v, e3);
secp256k1_i128_accum_mul(&ce, q, d3);
secp256k1_i128_accum_mul(&ce, r, e3);
if (modinfo->modulus.v[3]) { /* Optimize for the case where this limb of the modulus is zero. */
cd += (int128_t)modinfo->modulus.v[3] * md;
ce += (int128_t)modinfo->modulus.v[3] * me;
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[3], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[3], me);
}
d->v[2] = (int64_t)cd & M62; cd >>= 62;
e->v[2] = (int64_t)ce & M62; ce >>= 62;
d->v[2] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[2] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */
cd += (int128_t)u * d4 + (int128_t)v * e4;
ce += (int128_t)q * d4 + (int128_t)r * e4;
cd += (int128_t)modinfo->modulus.v[4] * md;
ce += (int128_t)modinfo->modulus.v[4] * me;
d->v[3] = (int64_t)cd & M62; cd >>= 62;
e->v[3] = (int64_t)ce & M62; ce >>= 62;
secp256k1_i128_accum_mul(&cd, u, d4);
secp256k1_i128_accum_mul(&cd, v, e4);
secp256k1_i128_accum_mul(&ce, q, d4);
secp256k1_i128_accum_mul(&ce, r, e4);
secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[4], md);
secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[4], me);
d->v[3] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
e->v[3] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
/* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */
d->v[4] = (int64_t)cd;
e->v[4] = (int64_t)ce;
d->v[4] = secp256k1_i128_to_i64(&cd);
e->v[4] = secp256k1_i128_to_i64(&ce);
#ifdef VERIFY
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0); /* d < modulus */
@ -389,36 +411,46 @@ static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp
const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4];
const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4];
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int128_t cf, cg;
secp256k1_int128 cf, cg;
/* Start computing t*[f,g]. */
cf = (int128_t)u * f0 + (int128_t)v * g0;
cg = (int128_t)q * f0 + (int128_t)r * g0;
secp256k1_i128_mul(&cf, u, f0);
secp256k1_i128_accum_mul(&cf, v, g0);
secp256k1_i128_mul(&cg, q, f0);
secp256k1_i128_accum_mul(&cg, r, g0);
/* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
/* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */
cf += (int128_t)u * f1 + (int128_t)v * g1;
cg += (int128_t)q * f1 + (int128_t)r * g1;
f->v[0] = (int64_t)cf & M62; cf >>= 62;
g->v[0] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f1);
secp256k1_i128_accum_mul(&cf, v, g1);
secp256k1_i128_accum_mul(&cg, q, f1);
secp256k1_i128_accum_mul(&cg, r, g1);
f->v[0] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[0] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 2 of t*[f,g], and store it as output limb 1. */
cf += (int128_t)u * f2 + (int128_t)v * g2;
cg += (int128_t)q * f2 + (int128_t)r * g2;
f->v[1] = (int64_t)cf & M62; cf >>= 62;
g->v[1] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f2);
secp256k1_i128_accum_mul(&cf, v, g2);
secp256k1_i128_accum_mul(&cg, q, f2);
secp256k1_i128_accum_mul(&cg, r, g2);
f->v[1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 3 of t*[f,g], and store it as output limb 2. */
cf += (int128_t)u * f3 + (int128_t)v * g3;
cg += (int128_t)q * f3 + (int128_t)r * g3;
f->v[2] = (int64_t)cf & M62; cf >>= 62;
g->v[2] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f3);
secp256k1_i128_accum_mul(&cf, v, g3);
secp256k1_i128_accum_mul(&cg, q, f3);
secp256k1_i128_accum_mul(&cg, r, g3);
f->v[2] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[2] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* Compute limb 4 of t*[f,g], and store it as output limb 3. */
cf += (int128_t)u * f4 + (int128_t)v * g4;
cg += (int128_t)q * f4 + (int128_t)r * g4;
f->v[3] = (int64_t)cf & M62; cf >>= 62;
g->v[3] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, f4);
secp256k1_i128_accum_mul(&cf, v, g4);
secp256k1_i128_accum_mul(&cg, q, f4);
secp256k1_i128_accum_mul(&cg, r, g4);
f->v[3] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[3] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
/* What remains is limb 5 of t*[f,g]; store it as output limb 4. */
f->v[4] = (int64_t)cf;
g->v[4] = (int64_t)cg;
f->v[4] = secp256k1_i128_to_i64(&cf);
g->v[4] = secp256k1_i128_to_i64(&cg);
}
/* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
@ -431,30 +463,34 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign
const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
int64_t fi, gi;
int128_t cf, cg;
secp256k1_int128 cf, cg;
int i;
VERIFY_CHECK(len > 0);
/* Start computing t*[f,g]. */
fi = f->v[0];
gi = g->v[0];
cf = (int128_t)u * fi + (int128_t)v * gi;
cg = (int128_t)q * fi + (int128_t)r * gi;
secp256k1_i128_mul(&cf, u, fi);
secp256k1_i128_accum_mul(&cf, v, gi);
secp256k1_i128_mul(&cg, q, fi);
secp256k1_i128_accum_mul(&cg, r, gi);
/* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
/* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
* down by 62 bits). */
for (i = 1; i < len; ++i) {
fi = f->v[i];
gi = g->v[i];
cf += (int128_t)u * fi + (int128_t)v * gi;
cg += (int128_t)q * fi + (int128_t)r * gi;
f->v[i - 1] = (int64_t)cf & M62; cf >>= 62;
g->v[i - 1] = (int64_t)cg & M62; cg >>= 62;
secp256k1_i128_accum_mul(&cf, u, fi);
secp256k1_i128_accum_mul(&cf, v, gi);
secp256k1_i128_accum_mul(&cg, q, fi);
secp256k1_i128_accum_mul(&cg, r, gi);
f->v[i - 1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
g->v[i - 1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
}
/* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
f->v[len - 1] = (int64_t)cf;
g->v[len - 1] = (int64_t)cg;
f->v[len - 1] = secp256k1_i128_to_i64(&cf);
g->v[len - 1] = secp256k1_i128_to_i64(&cg);
}
/* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */

src/precompute_ecmult.c

@ -14,10 +14,13 @@
#endif
#include "../include/secp256k1.h"
#include "assumptions.h"
#include "util.h"
#include "field_impl.h"
#include "group_impl.h"
#include "int128_impl.h"
#include "ecmult.h"
#include "ecmult_compute_table_impl.h"

src/precompute_ecmult_gen.c

@ -8,9 +8,12 @@
#include <stdio.h>
#include "../include/secp256k1.h"
#include "assumptions.h"
#include "util.h"
#include "group.h"
#include "int128_impl.h"
#include "ecmult_gen.h"
#include "ecmult_gen_compute_table_impl.h"

src/scalar_4x64_impl.h

@ -7,6 +7,7 @@
#ifndef SECP256K1_SCALAR_REPR_IMPL_H
#define SECP256K1_SCALAR_REPR_IMPL_H
#include "int128.h"
#include "modinv64_impl.h"
/* Limbs of the secp256k1 order. */
@ -69,50 +70,61 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
}
SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar *r, unsigned int overflow) {
uint128_t t;
secp256k1_uint128 t;
VERIFY_CHECK(overflow <= 1);
t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1;
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[2] + overflow * SECP256K1_N_C_2;
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint64_t)r->d[3];
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
secp256k1_u128_from_u64(&t, r->d[0]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_0);
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_1);
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2]);
secp256k1_u128_accum_u64(&t, overflow * SECP256K1_N_C_2);
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3]);
r->d[3] = secp256k1_u128_to_u64(&t);
return overflow;
}
static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) {
int overflow;
uint128_t t = (uint128_t)a->d[0] + b->d[0];
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[1] + b->d[1];
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[2] + b->d[2];
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)a->d[3] + b->d[3];
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
overflow = t + secp256k1_scalar_check_overflow(r);
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, a->d[0]);
secp256k1_u128_accum_u64(&t, b->d[0]);
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[1]);
secp256k1_u128_accum_u64(&t, b->d[1]);
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[2]);
secp256k1_u128_accum_u64(&t, b->d[2]);
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, a->d[3]);
secp256k1_u128_accum_u64(&t, b->d[3]);
r->d[3] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
overflow = secp256k1_u128_to_u64(&t) + secp256k1_scalar_check_overflow(r);
VERIFY_CHECK(overflow == 0 || overflow == 1);
secp256k1_scalar_reduce(r, overflow);
return overflow;
}
static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) {
uint128_t t;
secp256k1_uint128 t;
VERIFY_CHECK(bit < 256);
bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */
t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[2] + (((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F));
r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
t += (uint128_t)r->d[3] + (((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F));
r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL;
secp256k1_u128_from_u64(&t, r->d[0]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F));
r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3]);
secp256k1_u128_accum_u64(&t, ((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F));
r->d[3] = secp256k1_u128_to_u64(&t);
#ifdef VERIFY
VERIFY_CHECK((t >> 64) == 0);
VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
VERIFY_CHECK(secp256k1_u128_hi_u64(&t) == 0);
#endif
}
@ -141,14 +153,19 @@ SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a)
static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) {
uint64_t nonzero = 0xFFFFFFFFFFFFFFFFULL * (secp256k1_scalar_is_zero(a) == 0);
uint128_t t = (uint128_t)(~a->d[0]) + SECP256K1_N_0 + 1;
r->d[0] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[1]) + SECP256K1_N_1;
r->d[1] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[2]) + SECP256K1_N_2;
r->d[2] = t & nonzero; t >>= 64;
t += (uint128_t)(~a->d[3]) + SECP256K1_N_3;
r->d[3] = t & nonzero;
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, ~a->d[0]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_0 + 1);
r->d[0] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[1]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_1);
r->d[1] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[2]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_2);
r->d[2] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, ~a->d[3]);
secp256k1_u128_accum_u64(&t, SECP256K1_N_3);
r->d[3] = secp256k1_u128_to_u64(&t) & nonzero;
}
SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
@ -172,14 +189,19 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
* if we are flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate */
uint64_t mask = !flag - 1;
uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1;
uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask);
r->d[0] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask);
r->d[1] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask);
r->d[2] = t & nonzero; t >>= 64;
t += (uint128_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask);
r->d[3] = t & nonzero;
secp256k1_uint128 t;
secp256k1_u128_from_u64(&t, r->d[0] ^ mask);
secp256k1_u128_accum_u64(&t, (SECP256K1_N_0 + 1) & mask);
r->d[0] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[1] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_1 & mask);
r->d[1] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[2] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_2 & mask);
r->d[2] = secp256k1_u128_to_u64(&t) & nonzero; secp256k1_u128_rshift(&t, 64);
secp256k1_u128_accum_u64(&t, r->d[3] ^ mask);
secp256k1_u128_accum_u64(&t, SECP256K1_N_3 & mask);
r->d[3] = secp256k1_u128_to_u64(&t) & nonzero;
return 2 * (mask == 0) - 1;
}
@ -189,9 +211,10 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
#define muladd(a,b) { \
uint64_t tl, th; \
{ \
uint128_t t = (uint128_t)a * b; \
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = t; \
secp256k1_uint128 t; \
secp256k1_u128_mul(&t, a, b); \
th = secp256k1_u128_hi_u64(&t); /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = secp256k1_u128_to_u64(&t); \
} \
c0 += tl; /* overflow is handled on the next line */ \
th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \
@ -204,9 +227,10 @@ static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
#define muladd_fast(a,b) { \
uint64_t tl, th; \
{ \
uint128_t t = (uint128_t)a * b; \
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = t; \
secp256k1_uint128 t; \
secp256k1_u128_mul(&t, a, b); \
th = secp256k1_u128_hi_u64(&t); /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = secp256k1_u128_to_u64(&t); \
} \
c0 += tl; /* overflow is handled on the next line */ \
th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \
@ -484,8 +508,8 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
: "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
: "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
#else
uint128_t c;
uint64_t c0, c1, c2;
secp256k1_uint128 c128;
uint64_t c, c0, c1, c2;
uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
uint64_t m0, m1, m2, m3, m4, m5;
uint32_t m6;
@ -542,14 +566,18 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
/* Reduce 258 bits into 256. */
/* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p2 + (uint128_t)p4;
r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
c += p3;
r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
secp256k1_u128_from_u64(&c128, p0);
secp256k1_u128_accum_mul(&c128, SECP256K1_N_C_0, p4);
r->d[0] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p1);
secp256k1_u128_accum_mul(&c128, SECP256K1_N_C_1, p4);
r->d[1] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p2);
secp256k1_u128_accum_u64(&c128, p4);
r->d[2] = secp256k1_u128_to_u64(&c128); secp256k1_u128_rshift(&c128, 64);
secp256k1_u128_accum_u64(&c128, p3);
r->d[3] = secp256k1_u128_to_u64(&c128);
c = secp256k1_u128_hi_u64(&c128);
#endif
/* Final reduction of r. */

src/secp256k1.c

@ -22,6 +22,7 @@
#include "assumptions.h"
#include "util.h"
#include "field_impl.h"
#include "scalar_impl.h"
#include "group_impl.h"
@ -31,6 +32,7 @@
#include "ecdsa_impl.h"
#include "eckey_impl.h"
#include "hash_impl.h"
#include "int128_impl.h"
#include "scratch_impl.h"
#include "selftest.h"

src/tests.c

@ -26,6 +26,7 @@
#include "modinv32_impl.h"
#ifdef SECP256K1_WIDEMUL_INT128
#include "modinv64_impl.h"
#include "int128_impl.h"
#endif
#define CONDITIONAL_TEST(cnt, nam) if (count < (cnt)) { printf("Skipping %s (iteration count too low)\n", nam); } else
@ -430,6 +431,47 @@ void run_scratch_tests(void) {
secp256k1_context_destroy(none);
}
#ifdef SECP256K1_WIDEMUL_INT128
void run_int128_tests(void) {
{ /* secp256k1_u128_accum_mul */
secp256k1_uint128 res;
/* Check secp256k1_u128_accum_mul overflow */
secp256k1_u128_from_u64(&res, 0);
secp256k1_u128_accum_mul(&res, UINT64_MAX, UINT64_MAX);
secp256k1_u128_accum_mul(&res, UINT64_MAX, UINT64_MAX);
CHECK(secp256k1_u128_to_u64(&res) == 2);
CHECK(secp256k1_u128_hi_u64(&res) == 18446744073709551612U);
}
{ /* secp256k1_u128_accum_mul */
secp256k1_int128 res;
/* Compute INT128_MAX = 2^127 - 1 with secp256k1_i128_accum_mul */
secp256k1_i128_from_i64(&res, 0);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MAX);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MAX);
CHECK(secp256k1_i128_to_i64(&res) == 2);
secp256k1_i128_accum_mul(&res, 4, 9223372036854775807);
secp256k1_i128_accum_mul(&res, 1, 1);
CHECK((uint64_t)secp256k1_i128_to_i64(&res) == UINT64_MAX);
secp256k1_i128_rshift(&res, 64);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MAX);
/* Compute INT128_MIN = - 2^127 with secp256k1_i128_accum_mul */
secp256k1_i128_from_i64(&res, 0);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MIN);
secp256k1_i128_accum_mul(&res, INT64_MAX, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == 0);
secp256k1_i128_accum_mul(&res, 2, INT64_MIN);
CHECK(secp256k1_i128_to_i64(&res) == 0);
secp256k1_i128_rshift(&res, 64);
CHECK(secp256k1_i128_to_i64(&res) == INT64_MIN);
}
}
#endif
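The arithmetic behind these checks, as an annotation (not part of the test):

    /* Unsigned: 2*(2^64-1)^2 = 2^129 - 2^66 + 2
     *                        = (2^64-4)*2^64 + 2   (mod 2^128)
     *           -> hi = 2^64-4 = 18446744073709551612, lo = 2.
     * Signed:   2*(2^63-1)^2 + 4*(2^63-1) + 1 = 2^127 - 1 = INT128_MAX,
     * and       2*(2^63-1)*(-2^63) + 2*(-2^63) = -2^127   = INT128_MIN,
     * reached without tripping the overflow VERIFY_CHECKs. */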
void run_ctz_tests(void) {
static const uint32_t b32[] = {1, 0xffffffff, 0x5e56968f, 0xe0d63129};
static const uint64_t b64[] = {1, 0xffffffffffffffff, 0xbcd02462139b3fc3, 0x98b5f80c769693ef};
@ -7100,6 +7142,9 @@ int main(int argc, char **argv) {
run_rand_bits();
run_rand_int();
#ifdef SECP256K1_WIDEMUL_INT128
run_int128_tests();
#endif
run_ctz_tests();
run_modinv_tests();
run_inverse_tests();

src/util.h

@ -230,28 +230,23 @@ static SECP256K1_INLINE void secp256k1_int_cmov(int *r, const int *a, int flag)
*r = (int)(r_masked | a_masked);
}
/* If USE_FORCE_WIDEMUL_{INT128,INT64} is set, use that wide multiplication implementation.
/* If USE_FORCE_WIDEMUL_{INT128, INT128_STRUCT, INT64} is set, use that wide multiplication implementation.
* Otherwise use the presence of __SIZEOF_INT128__ to decide.
*/
#if defined(USE_FORCE_WIDEMUL_INT128)
#if defined(USE_FORCE_WIDEMUL_INT128_STRUCT)
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_STRUCT 1
#elif defined(USE_FORCE_WIDEMUL_INT128)
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_NATIVE 1
#elif defined(USE_FORCE_WIDEMUL_INT64)
# define SECP256K1_WIDEMUL_INT64 1
#elif defined(UINT128_MAX) || defined(__SIZEOF_INT128__)
# define SECP256K1_WIDEMUL_INT128 1
# define SECP256K1_INT128_NATIVE 1
#else
# define SECP256K1_WIDEMUL_INT64 1
#endif
#if defined(SECP256K1_WIDEMUL_INT128)
# if !defined(UINT128_MAX) && defined(__SIZEOF_INT128__)
SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
SECP256K1_GNUC_EXT typedef __int128 int128_t;
#define UINT128_MAX ((uint128_t)(-1))
#define INT128_MAX ((int128_t)(UINT128_MAX >> 1))
#define INT128_MIN (-INT128_MAX - 1)
/* No (U)INT128_C macros because compilers providing __int128 do not support 128-bit literals. */
# endif
#endif
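Net effect of the selection above, summarized (the typedefs that used to live in this block move to src/int128_native.h):

    /* --with-test-override-wide-multiply=int128_struct
     *     -> SECP256K1_WIDEMUL_INT128 + SECP256K1_INT128_STRUCT
     * --with-test-override-wide-multiply=int128
     *     -> SECP256K1_WIDEMUL_INT128 + SECP256K1_INT128_NATIVE
     * --with-test-override-wide-multiply=int64
     *     -> SECP256K1_WIDEMUL_INT64
     * auto -> native int128 if UINT128_MAX or __SIZEOF_INT128__ is
     *         available, otherwise int64. */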
#ifndef __has_builtin
#define __has_builtin(x) 0