aboutsummaryrefslogtreecommitdiff
path: root/src/field_5x52_asm_impl.h
diff options
context:
space:
mode:
authorPieter Wuille <pieter.wuille@gmail.com>2014-12-11 01:58:25 +0100
committerPieter Wuille <pieter.wuille@gmail.com>2014-12-11 01:58:25 +0100
commitecae2acb06d44509425a9cdad38ed9a5bce15cbe (patch)
tree12f4a7d577d562ac452bf647da3dd380af105971 /src/field_5x52_asm_impl.h
parent87bddb7a3a83aaad96b5b54b4bac34d8a71b3810 (diff)
Squashed 'src/secp256k1/' changes from b0210a9..bccaf86
bccaf86 Merge pull request #150 2a53a47 Merge pull request #151 5f5a31f Merge pull request #149 3907277 Merge pull request #142 a3e0611 Enable tests in x86 travis builds 45da235 x86 builder 8bb0e93 Merge pull request #155 971fe81 build: fix openssl detection for cross builds f22d73e Explicitly access %0..%2 as 64-bit so we use the right registers for x32 ABI e66d4d6 Avoid the stack in assembly and use explicit registers cf7b2b4 Fix ECDSA message hashes to 32 bytes 056ad31 Really compile with -O3 by default 74ad63a Merge pull request #146 9000458 Merge pull request #145 1f46b00 build: fix __builtin_expect detection for clang aaba2e0 Merge pull request #136 8a0775c Merge pull request #144 ee1eaa7 Merge pull request #141 c88e2b8 Compile with -O3 by default 6558a26 Make the benchmarks print out stats 000bdf6 Rename bench_verify to bench_recovery 7c6fed2 Add a few more additional tests. 992e03b travis: add clang to the test matrix b43b79a Merge pull request #143 e06a924 Include time.h header for time(). 8d11164 Add some additional tests. 3545627 Merge pull request #118 6a9901e Merge pull request #137 376b28b Merge pull request #128 1728806 Merge pull request #138 a5759c5 Check return value of malloc 39bd94d Variable time normalize ad86bdf Merge pull request #140 54b768c Another redundant secp256k1_fe_normalize 69dcaab Merge pull request #139 1c29f2e Remove redundant secp256k1_fe_normalize from secp256k1_gej_add_ge_var. 2b9388b Remove unused secp256k1_fe_inv_all f461b76 Allocate precomputation arrays on the heap b2c9681 Make {mul,sqr}_inner use the same argument order as {mul,sqr} 6793505 Convert YASM code into inline assembly f048615 Rewrite field assembly to match the C version 3ce74b1 Tweak precomputed table size for G git-subtree-dir: src/secp256k1 git-subtree-split: bccaf86caa9c44166e5a66600b742c516e03c3f0
Diffstat (limited to 'src/field_5x52_asm_impl.h')
-rw-r--r--src/field_5x52_asm_impl.h495
1 files changed, 492 insertions, 3 deletions
diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h
index f29605b11b..98cc004bf0 100644
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@@ -1,13 +1,502 @@
/**********************************************************************
- * Copyright (c) 2013 Pieter Wuille *
+ * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille *
* Distributed under the MIT software license, see the accompanying *
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
**********************************************************************/
+/**
+ * Changelog:
+ * - March 2013, Diederik Huys: original version
+ * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
+ * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
+ */
+
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_
-void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
-void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
+/**
+ * Multiply the 5-limb operands a and b and store the reduced 5-limb product in r.
+ *
+ * Each operand is read as five 64-bit words (byte offsets 0..32). The partial
+ * products a[i]*b[j] are summed column by column into two 128-bit accumulators
+ * (c and d), split at 52 bits with M = 0xfffffffffffff, and the upper columns
+ * are folded into the lower ones by multiplying with R = 0x1000003d10.
+ * NOTE(review): R is presumably 2^260 mod p for the secp256k1 field prime, and
+ * the input limbs are presumably bounded so the 128-bit accumulators cannot
+ * overflow; neither is visible here - confirm against the portable C version.
+ *
+ * r: output; r[0..3] are stored masked with M, r[4] is stored unmasked.
+ * a: input; all five limbs are loaded into r10-r14 before anything is stored.
+ * b: input; re-read from memory for every product, including after r[0] and
+ *    r[1] have been stored, so b must not overlap r.
+ *
+ * Registers: rdx:rax = multiplication accumulator
+ * r9:r8 = c
+ * r15:rcx = d
+ * r10-r14 = a0-a4
+ * rbx = b
+ * rdi = r
+ * rsi = a on entry, then scratch for t3 / t4 / u0
+ *
+ * Stack temporaries: tmp1 = t3, tmp2 = t4 (low 48 bits), tmp3 = tx (top 4 bits of t4)
+ */
+ uint64_t tmp1, tmp2, tmp3;
+__asm__ __volatile__(
+ "movq 0(%%rsi),%%r10\n"
+ "movq 8(%%rsi),%%r11\n"
+ "movq 16(%%rsi),%%r12\n"
+ "movq 24(%%rsi),%%r13\n"
+ "movq 32(%%rsi),%%r14\n"
+
+ /* d = a3 * b0 (first term: initializes both halves of d) */
+ "movq 0(%%rbx),%%rax\n"
+ "mulq %%r13\n"
+ "movq %%rax,%%rcx\n"
+ "movq %%rdx,%%r15\n"
+ /* d += a2 * b1 */
+ "movq 8(%%rbx),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a1 * b2 */
+ "movq 16(%%rbx),%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a0 * b3 */
+ "movq 24(%%rbx),%%rax\n"
+ "mulq %%r10\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* c = a4 * b4 */
+ "movq 32(%%rbx),%%rax\n"
+ "mulq %%r14\n"
+ "movq %%rax,%%r8\n"
+ "movq %%rdx,%%r9\n"
+ /* d += (c & M) * R (rax still holds the low half of c) */
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* c >>= 52 (%%r8 only; r9 is left stale, it is overwritten at "c = a0 * b0") */
+ "shrdq $52,%%r9,%%r8\n"
+ /* t3 (tmp1) = d & M (rsi no longer points at a from here on) */
+ "movq %%rcx,%%rsi\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rsi\n"
+ "movq %%rsi,%q1\n"
+ /* d >>= 52 */
+ "shrdq $52,%%r15,%%rcx\n"
+ "xorq %%r15,%%r15\n"
+ /* d += a4 * b0 */
+ "movq 0(%%rbx),%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a3 * b1 */
+ "movq 8(%%rbx),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a2 * b2 */
+ "movq 16(%%rbx),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a1 * b3 */
+ "movq 24(%%rbx),%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a0 * b4 */
+ "movq 32(%%rbx),%%rax\n"
+ "mulq %%r10\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += c * R (c is the shifted value held in r8 alone) */
+ "movq %%r8,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* t4 = d & M (%%rsi) */
+ "movq %%rcx,%%rsi\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rsi\n"
+ /* d >>= 52 */
+ "shrdq $52,%%r15,%%rcx\n"
+ "xorq %%r15,%%r15\n"
+ /* tx = t4 >> 48 (tmp3): the top 4 bits of the 52-bit t4 */
+ "movq %%rsi,%%rax\n"
+ "shrq $48,%%rax\n"
+ "movq %%rax,%q3\n"
+ /* t4 &= (M >> 4) (tmp2): keep the low 48 bits */
+ "movq $0xffffffffffff,%%rax\n"
+ "andq %%rax,%%rsi\n"
+ "movq %%rsi,%q2\n"
+ /* c = a0 * b0 */
+ "movq 0(%%rbx),%%rax\n"
+ "mulq %%r10\n"
+ "movq %%rax,%%r8\n"
+ "movq %%rdx,%%r9\n"
+ /* d += a4 * b1 */
+ "movq 8(%%rbx),%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a3 * b2 */
+ "movq 16(%%rbx),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a2 * b3 */
+ "movq 24(%%rbx),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a1 * b4 */
+ "movq 32(%%rbx),%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* u0 = d & M (%%rsi) */
+ "movq %%rcx,%%rsi\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rsi\n"
+ /* d >>= 52 */
+ "shrdq $52,%%r15,%%rcx\n"
+ "xorq %%r15,%%r15\n"
+ /* u0 = (u0 << 4) | tx (%%rsi) */
+ "shlq $4,%%rsi\n"
+ "movq %q3,%%rax\n"
+ "orq %%rax,%%rsi\n"
+ /* c += u0 * (R >> 4) (0x1000003d1 == 0x1000003d10 >> 4, matching the 4-bit shift of u0) */
+ "movq $0x1000003d1,%%rax\n"
+ "mulq %%rsi\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* r[0] = c & M (first store to r; b is still read from memory below) */
+ "movq %%r8,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq %%rax,0(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* c += a1 * b0 */
+ "movq 0(%%rbx),%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* c += a0 * b1 */
+ "movq 8(%%rbx),%%rax\n"
+ "mulq %%r10\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d += a4 * b2 */
+ "movq 16(%%rbx),%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a3 * b3 */
+ "movq 24(%%rbx),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a2 * b4 */
+ "movq 32(%%rbx),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* c += (d & M) * R */
+ "movq %%rcx,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d >>= 52 */
+ "shrdq $52,%%r15,%%rcx\n"
+ "xorq %%r15,%%r15\n"
+ /* r[1] = c & M */
+ "movq %%r8,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq %%rax,8(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* c += a2 * b0 */
+ "movq 0(%%rbx),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* c += a1 * b1 */
+ "movq 8(%%rbx),%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* c += a0 * b2 (last use of %%r10 = a0) */
+ "movq 16(%%rbx),%%rax\n"
+ "mulq %%r10\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
+ "movq %q2,%%rsi\n"
+ "movq %q1,%%r10\n"
+ /* d += a4 * b3 */
+ "movq 24(%%rbx),%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* d += a3 * b4 */
+ "movq 32(%%rbx),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rcx\n"
+ "adcq %%rdx,%%r15\n"
+ /* c += (d & M) * R */
+ "movq %%rcx,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d >>= 52 (%%rcx only; r15 is not read again, so it is not cleared) */
+ "shrdq $52,%%r15,%%rcx\n"
+ /* r[2] = c & M */
+ "movq %%r8,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq %%rax,16(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* c += t3 (low half only: no carry into r9 is propagated here) */
+ "addq %%r10,%%r8\n"
+ /* c += d * R */
+ "movq %%rcx,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* r[3] = c & M */
+ "movq %%r8,%%rax\n"
+ "movq $0xfffffffffffff,%%rdx\n"
+ "andq %%rdx,%%rax\n"
+ "movq %%rax,24(%%rdi)\n"
+ /* c >>= 52 (%%r8 only) */
+ "shrdq $52,%%r9,%%r8\n"
+ /* c += t4 (%%r8 only) */
+ "addq %%rsi,%%r8\n"
+ /* r[4] = c (stored without masking) */
+ "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) /* a is read-write because rsi is reused as scratch */
+: "b"(b), "D"(r)
+: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
+
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
+/**
+ * Square the 5-limb operand a and store the reduced 5-limb result in r.
+ *
+ * a is read as five 64-bit words (byte offsets 0..32). Symmetric cross products
+ * a[i]*a[j] (i != j) are computed once with one factor doubled. Columns are
+ * summed into two 128-bit accumulators (c and d), split at 52 bits with
+ * M = 0xfffffffffffff, and the upper columns are folded into the lower ones by
+ * multiplying with R = 0x1000003d10.
+ * NOTE(review): R is presumably 2^260 mod p for the secp256k1 field prime, and
+ * the input limbs are presumably bounded so that doubling a limb and the 128-bit
+ * accumulators cannot overflow; neither is visible here - confirm against the
+ * portable C version.
+ *
+ * r: output; r[0..3] are stored masked with M, r[4] is stored unmasked.
+ * a: input; all five limbs are loaded into r10-r14 before anything is stored.
+ *
+ * Registers: rdx:rax = multiplication accumulator
+ * r9:r8 = c
+ * rcx:rbx = d
+ * r10-r14 = a0-a4 (r14 is doubled in place at "a4 *= 2", r10 at "a0 *= 2")
+ * r15 = M (0xfffffffffffff)
+ * rdi = r
+ * rsi = a on entry, then scratch for t3 / t4 / u0
+ *
+ * Stack temporaries: tmp1 = t3, tmp2 = t4 (low 48 bits), tmp3 = tx (top 4 bits of t4)
+ */
+ uint64_t tmp1, tmp2, tmp3;
+__asm__ __volatile__(
+ "movq 0(%%rsi),%%r10\n"
+ "movq 8(%%rsi),%%r11\n"
+ "movq 16(%%rsi),%%r12\n"
+ "movq 24(%%rsi),%%r13\n"
+ "movq 32(%%rsi),%%r14\n"
+ "movq $0xfffffffffffff,%%r15\n"
+
+ /* d = (a0*2) * a3 (leaq forms 2*a0 in rax and leaves r10 unchanged) */
+ "leaq (%%r10,%%r10,1),%%rax\n"
+ "mulq %%r13\n"
+ "movq %%rax,%%rbx\n"
+ "movq %%rdx,%%rcx\n"
+ /* d += (a1*2) * a2 */
+ "leaq (%%r11,%%r11,1),%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* c = a4 * a4 */
+ "movq %%r14,%%rax\n"
+ "mulq %%r14\n"
+ "movq %%rax,%%r8\n"
+ "movq %%rdx,%%r9\n"
+ /* d += (c & M) * R (rax still holds the low half of c) */
+ "andq %%r15,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* c >>= 52 (%%r8 only; r9 is left stale, it is overwritten at "c = a0 * a0") */
+ "shrdq $52,%%r9,%%r8\n"
+ /* t3 (tmp1) = d & M (rsi no longer points at a from here on) */
+ "movq %%rbx,%%rsi\n"
+ "andq %%r15,%%rsi\n"
+ "movq %%rsi,%q1\n"
+ /* d >>= 52 */
+ "shrdq $52,%%rcx,%%rbx\n"
+ "xorq %%rcx,%%rcx\n"
+ /* a4 *= 2 (a4*a4 is already done; every later use of r14 wants 2*a4) */
+ "addq %%r14,%%r14\n"
+ /* d += a0 * a4 (a4 already doubled) */
+ "movq %%r10,%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* d += (a1*2) * a3 */
+ "leaq (%%r11,%%r11,1),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* d += a2 * a2 */
+ "movq %%r12,%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* d += c * R (c is the shifted value held in r8 alone) */
+ "movq %%r8,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* t4 = d & M (%%rsi) */
+ "movq %%rbx,%%rsi\n"
+ "andq %%r15,%%rsi\n"
+ /* d >>= 52 */
+ "shrdq $52,%%rcx,%%rbx\n"
+ "xorq %%rcx,%%rcx\n"
+ /* tx = t4 >> 48 (tmp3): the top 4 bits of the 52-bit t4 */
+ "movq %%rsi,%%rax\n"
+ "shrq $48,%%rax\n"
+ "movq %%rax,%q3\n"
+ /* t4 &= (M >> 4) (tmp2): keep the low 48 bits */
+ "movq $0xffffffffffff,%%rax\n"
+ "andq %%rax,%%rsi\n"
+ "movq %%rsi,%q2\n"
+ /* c = a0 * a0 */
+ "movq %%r10,%%rax\n"
+ "mulq %%r10\n"
+ "movq %%rax,%%r8\n"
+ "movq %%rdx,%%r9\n"
+ /* d += a1 * a4 (a4 already doubled) */
+ "movq %%r11,%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* d += (a2*2) * a3 */
+ "leaq (%%r12,%%r12,1),%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* u0 = d & M (%%rsi) */
+ "movq %%rbx,%%rsi\n"
+ "andq %%r15,%%rsi\n"
+ /* d >>= 52 */
+ "shrdq $52,%%rcx,%%rbx\n"
+ "xorq %%rcx,%%rcx\n"
+ /* u0 = (u0 << 4) | tx (%%rsi) */
+ "shlq $4,%%rsi\n"
+ "movq %q3,%%rax\n"
+ "orq %%rax,%%rsi\n"
+ /* c += u0 * (R >> 4) (0x1000003d1 == 0x1000003d10 >> 4, matching the 4-bit shift of u0) */
+ "movq $0x1000003d1,%%rax\n"
+ "mulq %%rsi\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* r[0] = c & M (first store to r; a was fully loaded into registers at the top) */
+ "movq %%r8,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq %%rax,0(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* a0 *= 2 (a0*a0 is already done; every later use of r10 wants 2*a0) */
+ "addq %%r10,%%r10\n"
+ /* c += a0 * a1 (a0 already doubled) */
+ "movq %%r10,%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d += a2 * a4 (a4 already doubled) */
+ "movq %%r12,%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* d += a3 * a3 */
+ "movq %%r13,%%rax\n"
+ "mulq %%r13\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* c += (d & M) * R */
+ "movq %%rbx,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d >>= 52 */
+ "shrdq $52,%%rcx,%%rbx\n"
+ "xorq %%rcx,%%rcx\n"
+ /* r[1] = c & M */
+ "movq %%r8,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq %%rax,8(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* c += a0 * a2 (a0 already doubled; last use of %%r10) */
+ "movq %%r10,%%rax\n"
+ "mulq %%r12\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
+ "movq %q2,%%rsi\n"
+ "movq %q1,%%r10\n"
+ /* c += a1 * a1 */
+ "movq %%r11,%%rax\n"
+ "mulq %%r11\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d += a3 * a4 (a4 already doubled) */
+ "movq %%r13,%%rax\n"
+ "mulq %%r14\n"
+ "addq %%rax,%%rbx\n"
+ "adcq %%rdx,%%rcx\n"
+ /* c += (d & M) * R */
+ "movq %%rbx,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* d >>= 52 (%%rbx only; rcx is not read again, so it is not cleared) */
+ "shrdq $52,%%rcx,%%rbx\n"
+ /* r[2] = c & M */
+ "movq %%r8,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq %%rax,16(%%rdi)\n"
+ /* c >>= 52 */
+ "shrdq $52,%%r9,%%r8\n"
+ "xorq %%r9,%%r9\n"
+ /* c += t3 (low half only: no carry into r9 is propagated here) */
+ "addq %%r10,%%r8\n"
+ /* c += d * R */
+ "movq %%rbx,%%rax\n"
+ "movq $0x1000003d10,%%rdx\n"
+ "mulq %%rdx\n"
+ "addq %%rax,%%r8\n"
+ "adcq %%rdx,%%r9\n"
+ /* r[3] = c & M */
+ "movq %%r8,%%rax\n"
+ "andq %%r15,%%rax\n"
+ "movq %%rax,24(%%rdi)\n"
+ /* c >>= 52 (%%r8 only) */
+ "shrdq $52,%%r9,%%r8\n"
+ /* c += t4 (%%r8 only) */
+ "addq %%rsi,%%r8\n"
+ /* r[4] = c (stored without masking) */
+ "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) /* a is read-write because rsi is reused as scratch */
+: "D"(r)
+: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
#endif