12 files changed, 759 insertions, 0 deletions
diff --git a/src/minisketch/doc/example.c b/src/minisketch/doc/example.c
new file mode 100644
index 0000000000..7279165845
--- /dev/null
+++ b/src/minisketch/doc/example.c
@@ -0,0 +1,51 @@
+/**********************************************************************
+ * Copyright (c) 2018 Pieter Wuille, Greg Maxwell, Gleb Naumenko      *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file LICENSE or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#include <stdio.h>
+#include <assert.h>
+#include "../include/minisketch.h"
+
+int main(void) { // Demo: sketch two overlapping sets, ship one sketch as bytes, decode the difference
+
+  minisketch *sketch_a = minisketch_create(12, 0, 4); // Alice's sketch: 12-bit elements, capacity 4
+
+  for (int i = 3000; i < 3010; ++i) { // Alice's set is 3000..3009
+    minisketch_add_uint64(sketch_a, i);
+  }
+
+  size_t sersize = minisketch_serialized_size(sketch_a);
+  assert(sersize == 12 * 4 / 8); // 4 12-bit values is 6 bytes.
+  unsigned char *buffer_a = malloc(sersize); // NOTE(review): allocation result is not checked for NULL
+  minisketch_serialize(sketch_a, buffer_a);
+  minisketch_destroy(sketch_a); // Original is dropped; it is rebuilt from buffer_a below
+
+  minisketch *sketch_b = minisketch_create(12, 0, 4); // Bob's own sketch
+  for (int i = 3002; i < 3012; ++i) { // Bob's set is 3002..3011
+    minisketch_add_uint64(sketch_b, i);
+  }
+
+  sketch_a = minisketch_create(12, 0, 4);     // Alice's sketch
+  minisketch_deserialize(sketch_a, buffer_a); // Load Alice's sketch
+  free(buffer_a);
+
+  // Merge the elements from sketch_a into sketch_b. The result is a sketch_b
+  // which contains all elements that occurred in Alice's or Bob's sets, but not
+  // in both.
+  minisketch_merge(sketch_b, sketch_a);
+
+  uint64_t differences[4]; // Room for as many elements as the sketch capacity
+  ssize_t num_differences = minisketch_decode(sketch_b, 4, differences); // Negative is reported below as exceeding capacity
+  minisketch_destroy(sketch_a);
+  minisketch_destroy(sketch_b);
+  if (num_differences < 0) {
+    printf("More than 4 differences!\n");
+  } else {
+    ssize_t i;
+    for (i = 0; i < num_differences; ++i) {
+      printf("%u is in only one of the two sets\n", (unsigned)differences[i]); // Cast matches %u; elements here are 12-bit
+    }
+  }
+}
diff --git a/src/minisketch/doc/gen_basefpbits.sage b/src/minisketch/doc/gen_basefpbits.sage
new file mode 100644
index 0000000000..d1e75a6e29
--- /dev/null
+++ b/src/minisketch/doc/gen_basefpbits.sage
@@ -0,0 +1,78 @@
+# Require exact values up to
+FPBITS = 256
+
+# Overkill accuracy
+F = RealField(400)
+
+def BaseFPBits(bits, capacity):
+    return bits * capacity - int(ceil(F(log(sum(binomial(2**bits - 1, i) for i in range(capacity+1)), 2))))
+
+def Log2Factorial(capacity):
+    return int(floor(log(factorial(capacity), 2)))
+
+print("uint64_t BaseFPBits(uint32_t bits, uint32_t capacity) {")
+print("    // Correction table for low bits/capacities")
+TBLS={}
+FARS={}
+SKIPS={}
+for bits in range(1, 32):
+    TBL = []
+    for capacity in range(1, min(2**bits, FPBITS)):
+        exact = BaseFPBits(bits, capacity)
+        approx = Log2Factorial(capacity)
+        TBL.append((exact, approx))
+    MIN = 10000000000
+    while len(TBL) and ((TBL[-1][0] == TBL[-1][1]) or (TBL[-1][0] >= FPBITS and TBL[-1][1] >= FPBITS)):
+        MIN = min(MIN, TBL[-1][0] - TBL[-1][1])
+        TBL.pop()
+    while len(TBL) and (TBL[-1][0] - TBL[-1][1] == MIN):
+        TBL.pop()
+    SKIP = 0
+    while SKIP < len(TBL) and TBL[SKIP][0] == TBL[SKIP][1]:
+        SKIP += 1
+    DIFFS = [TBL[i][0] - TBL[i][1] for i in range(SKIP, len(TBL))]
+    if len(DIFFS) > 0 and len(DIFFS) * Integer(max(DIFFS)).nbits() > 64:
+        print("    static constexpr uint8_t ADD%i[] = {%s};" % (bits, ", ".join(("%i" % (TBL[i][0] - TBL[i][1])) for i in range(SKIP, len(TBL)))))
+    TBLS[bits] = DIFFS
+    FARS[bits] = MIN
+    SKIPS[bits] = SKIP
+print("")
+print("    if (capacity == 0) return 0;")
+print("    uint64_t ret = 0;")
+print("    if (bits < 32 && capacity >= (1U << bits)) {")
+print("        ret = uint64_t{bits} * (capacity - (1U << bits) + 1);")
+print("        capacity = (1U << bits) - 1;")
+print("    }")
+print("    ret += Log2Factorial(capacity);")
+print("    switch (bits) {")
+for bits in sorted(TBLS.keys()):
+    if len(TBLS[bits]) == 0:
+        continue
+    width = Integer(max(TBLS[bits])).nbits()
+    if len(TBLS[bits]) == 1:
+        add = "%i" % TBLS[bits][0]
+    elif len(TBLS[bits]) * width <= 64:
+        code = sum((2**(width*i) * TBLS[bits][i]) for i in range(len(TBLS[bits])))
+        if width == 1:
+            add = "(0x%x >> (capacity - %i)) & 1" % (code, 1 + SKIPS[bits])
+        else:
+            add = "(0x%x >> %i * (capacity - %i)) & %i" % (code, width, 1 + SKIPS[bits], 2**width - 1)
+    else:
+        add = "ADD%i[capacity - %i]" % (bits, 1 + SKIPS[bits])
+    if len(TBLS[bits]) + SKIPS[bits] == 2**bits - 1:
+        print("        case %i: return ret + (capacity <= %i ? 0 : %s);" % (bits, SKIPS[bits], add))
+    else:
+        print("        case %i: return ret + (capacity <= %i ? 0 : capacity > %i ? %i : %s);" % (bits, SKIPS[bits], len(TBLS[bits]) + SKIPS[bits], FARS[bits], add))
+print("        default: return ret;")
+print("    }")
+print("}")
+
+print("void TestBaseFPBits() {")
+print("    static constexpr uint16_t TBL[20][100] = {%s};" % (", ".join("{" + ", ".join(("%i" % BaseFPBits(bits, capacity)) for capacity in range(0, 100)) + "}" for bits in range(1, 21))))
+print("    for (int bits = 1; bits <= 20; ++bits) {")
+print("        for (int capacity = 0; capacity < 100; ++capacity) {")
+print("        uint64_t computed = BaseFPBits(bits, capacity), exact = TBL[bits - 1][capacity];")
+print("            CHECK(exact == computed || (exact >= 256 && computed >= 256));")
+print("        }")
+print("    }")
+print("}")
diff --git a/src/minisketch/doc/gen_params.sage b/src/minisketch/doc/gen_params.sage
new file mode 100755
index 0000000000..1cf036adb4
--- /dev/null
+++ b/src/minisketch/doc/gen_params.sage
@@ -0,0 +1,333 @@
+#!/usr/bin/env sage
+r"""
+Generate finite field parameters for minisketch.
+
+This script selects the finite fields used by minisketch
+ for various sizes and generates the required tables for
+ the implementation.
+
+The output (after formatting) can be found in src/fields/*.cpp.
+
+"""
+B.<b> = GF(2)
+P.<p> = B[]
+
+def apply_map(m, v):  # XOR together m[i] for every set bit i of the integer v
+    r = 0
+    i = 0
+    while v != 0:
+        if (v & 1):
+            r ^^= m[i]
+        i += 1
+        v >>= 1
+    return r
+
+def recurse_moduli(acc, maxweight, maxdegree):
+    for pos in range(maxweight, maxdegree + 1, 1):
+        poly = acc + p^pos
+        if maxweight == 1:
+            if poly.is_irreducible():
+                return (pos, poly)
+        else:
+            (deg, ret) = recurse_moduli(poly, maxweight - 1, pos - 1)
+            if ret is not None:
+                return (pos, ret)
+    return (None, None)
+
+def compute_moduli(bits):
+    # Return all optimal irreducible polynomials for GF(2^bits)
+    # The result is a list of tuples (weight, degree of second-highest nonzero coefficient, polynomial)
+    maxdegree = bits - 1
+    result = []
+    for weight in range(1, bits, 2):
+        deg, res = None, None
+        while True:
+            ret = recurse_moduli(p^bits + 1, weight, maxdegree)
+            if ret[0] is not None:
+                (deg, res) = ret
+                maxdegree = deg - 1
+            else:
+                break
+        if res is not None:
+            result.append((weight + 2, deg, res))
+    return result
+
+def bits_to_int(vals):  # Returns sum of Integer(vals[i]) * 2**i, i.e. vals[0] is the lowest bit
+    ret = 0
+    base = 1
+    for val in vals:
+        ret += Integer(val) * base
+        base *= 2
+    return ret
+
+def sqr_table(f, bits, n=1):
+    ret = []
+    for i in range(bits):
+        ret.append((f^(2^n*i)).integer_representation())
+    return ret
+
+# Compute x**(2**n)
+def pow2(x, n):
+    for i in range(n):
+        x = x**2
+    return x
+
+def qrt_table(F, f, bits):  # NOTE: F is accepted but not used in the body
+    # Table for solving x^2 + x = a
+    # This implements the technique from https://www.raco.cat/index.php/PublicacionsMatematiques/article/viewFile/37927/40412, Lemma 1
+    for i in range(bits):
+        if (f**i).trace() != 0:
+            u = f**i  # overwritten each time, so u ends as the highest such power of f
+    ret = []
+    for i in range(0, bits):
+        d = f^i
+        y = sum(pow2(d, j) * sum(pow2(u, k) for k in range(j)) for j in range(1, bits))
+        ret.append(y.integer_representation() ^^ (y.integer_representation() & 1))  # entry is stored with its lowest bit cleared
+    return ret
+
+def conv_tables(F, NF, bits):
+    # Generate a F(2) linear projection that maps elements from one field
+    #  to an isomorphic field with a different modulus.
+    f = F.gen()
+    fp = f.minimal_polynomial()
+    assert(fp == F.modulus())
+    nfp = fp.change_ring(NF)
+    nf = sorted(nfp.roots(multiplicities=False))[0]
+    ret = []
+    matrepr = [[B(0) for x in range(bits)] for y in range(bits)]
+    for i in range(bits):
+        val = (nf**i).integer_representation()
+        ret.append(val)
+        for j in range(bits):
+            matrepr[j][i] = B((val >> j) & 1)
+    mat = Matrix(matrepr).inverse().transpose()
+    ret2 = []
+    for i in range(bits):
+        ret2.append(bits_to_int(mat[i]))
+
+    for t in range(100):
+        f1a = F.random_element()
+        f1b = F.random_element()
+        f1r = f1a * f1b
+        f2a = NF.fetch_int(apply_map(ret, f1a.integer_representation()))
+        f2b = NF.fetch_int(apply_map(ret, f1b.integer_representation()))
+        f2r = NF.fetch_int(apply_map(ret, f1r.integer_representation()))
+        f2s = f2a * f2b
+        assert(f2r == f2s)
+
+    for t in range(100):
+        f2a = NF.random_element()
+        f2b = NF.random_element()
+        f2r = f2a * f2b
+        f1a = F.fetch_int(apply_map(ret2, f2a.integer_representation()))
+        f1b = F.fetch_int(apply_map(ret2, f2b.integer_representation()))
+        f1r = F.fetch_int(apply_map(ret2, f2r.integer_representation()))
+        f1s = f1a * f1b
+        assert(f1r == f1s)
+
+    return (ret, ret2)
+
+def fmt(i,typ):  # Format i as a hex literal ("0" for zero); typ is not used
+    if i == 0:
+        return "0"
+    else:
+        return "0x%x" % i
+
+def lintranstype(typ, bits, maxtbl):
+    gsize = min(maxtbl, bits)
+    array_size = (bits + gsize - 1) // gsize
+    bits_list = []
+    total = 0
+    for i in range(array_size):
+        rsize = (bits - total + array_size - i - 1) // (array_size - i)
+        total += rsize
+        bits_list.append(rsize)
+    return "RecLinTrans<%s, %s>" % (typ, ", ".join("%i" % x for x in bits_list))
+
+INT=0
+CLMUL=1
+CLMUL_TRI=2
+MD=3
+
+def print_modulus_md(mod):
+    ret = ""
+    pos = mod.degree()
+    for c in reversed(list(mod)):
+        if c:
+            if ret:
+                ret += " + "
+            if pos == 0:
+                ret += "1"
+            elif pos == 1:
+                ret += "x"
+            else:
+                ret += "x<sup>%i</sup>" % pos
+        pos -= 1
+    return ret
+
+def pick_modulus(bits, style):
+    # Choose the lexicographically-first lowest-weight modulus
+    #  optionally subject to implementation specific constraints.
+    moduli = compute_moduli(bits)
+    if style == INT or style == MD:
+        multi_sqr = False  # multi_sqr / need_trans are assigned in every branch but never read in this function
+        need_trans = False
+    elif style == CLMUL:
+        # Fast CLMUL reduction requires that bits + the highest
+        #  set bit are less than 66. NOTE(review): the filter below accepts sums up to and including 66 - confirm.
+        moduli = list(filter((lambda x: bits+x[1] <= 66), moduli)) + moduli  # qualifying moduli first, full list kept as fallback
+        multi_sqr = True
+        need_trans = True
+        if not moduli or moduli[0][2].change_ring(ZZ)(2) == 3 + 2**bits:
+            # For modulus 3, CLMUL_TRI is obviously better.
+            return None
+    elif style == CLMUL_TRI:
+        moduli = list(filter(lambda x: bits+x[1] <= 66, moduli)) + moduli
+        moduli = list(filter(lambda x: x[0] == 3, moduli))  # keep only weight-3 entries (x[0] is the weight from compute_moduli)
+        multi_sqr = True
+        need_trans = True
+    else:
+        assert(False)
+    if not moduli:
+        return None
+    return moduli[0][2]
+
+def print_result(bits, style):
+    if style == INT:
+        multi_sqr = False
+        need_trans = False
+        table_id = "%i" % bits
+    elif style == MD:
+        pass
+    elif style == CLMUL:
+        multi_sqr = True
+        need_trans = True
+        table_id = "%i" % bits
+    elif style == CLMUL_TRI:
+        multi_sqr = True
+        need_trans = True
+        table_id = "TRI%i" % bits
+    else:
+        assert(False)
+
+    nmodulus = pick_modulus(bits, INT)
+    modulus = pick_modulus(bits, style)
+    if modulus is None:
+        return
+
+    if style == MD:
+        print("* *%s*" % print_modulus_md(modulus))
+        return
+
+    if bits > 32:
+        typ = "uint64_t"
+    elif bits > 16:
+        typ = "uint32_t"
+    elif bits > 8:
+        typ = "uint16_t"
+    else:
+        typ = "uint8_t"
+
+    ttyp = lintranstype(typ, bits, 4)
+    rtyp = lintranstype(typ, bits, 6)
+
+    F.<f> = GF(2**bits, modulus=modulus)
+
+    include_table = True
+    if style != INT and style != CLMUL:
+        cmodulus = pick_modulus(bits, CLMUL)
+        if cmodulus == modulus:
+            include_table = False
+            table_id = "%i" % bits
+
+    if include_table:
+        print("typedef %s StatTable%s;" % (rtyp, table_id))
+        rtyp = "StatTable%s" % table_id
+        if (style == INT):
+            print("typedef %s DynTable%s;" % (ttyp, table_id))
+            ttyp = "DynTable%s" % table_id
+
+    if need_trans:
+        if modulus != nmodulus:
+            # If the bitstream modulus is not the best modulus for
+            #  this implementation a conversion table will be needed.
+            ctyp = rtyp
+            NF.<nf> = GF(2**bits, modulus=nmodulus)
+            ctables = conv_tables(NF, F, bits)
+            loadtbl = "&LOAD_TABLE_%s" % table_id
+            savetbl = "&SAVE_TABLE_%s" % table_id
+            if include_table:
+                print("constexpr %s LOAD_TABLE_%s({%s});" % (ctyp, table_id, ", ".join([fmt(x,typ) for x in ctables[0]])))
+                print("constexpr %s SAVE_TABLE_%s({%s});" % (ctyp, table_id, ", ".join([fmt(x,typ) for x in ctables[1]])))
+        else:
+            ctyp = "IdTrans"
+            loadtbl = "&ID_TRANS"
+            savetbl = "&ID_TRANS"
+    else:
+        assert(modulus == nmodulus)
+
+    if include_table:
+        print("constexpr %s SQR_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in sqr_table(f, bits, 1)])))
+    if multi_sqr:
+        # Repeated squaring is a linearised polynomial so in F(2^n) it is
+        #  F(2) linear and can be computed by a simple bit-matrix.
+        # Repeated squaring is especially useful in powering ladders such as
+        #  for inversion.
+        # When certain repeated squaring tables are not in use, use the QRT
+        # table instead to make the C++ compiler happy (it always has the
+        # same type).
+        sqr2 = "&QRT_TABLE_%s" % table_id
+        sqr4 = "&QRT_TABLE_%s" % table_id
+        sqr8 = "&QRT_TABLE_%s" % table_id
+        sqr16 = "&QRT_TABLE_%s" % table_id
+        if ((bits - 1) >= 4):
+            if include_table:
+                print("constexpr %s SQR2_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in sqr_table(f, bits, 2)])))
+            sqr2 = "&SQR2_TABLE_%s" % table_id
+        if ((bits - 1) >= 8):
+            if include_table:
+                print("constexpr %s SQR4_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in sqr_table(f, bits, 4)])))
+            sqr4 = "&SQR4_TABLE_%s" % table_id
+        if ((bits - 1) >= 16):
+            if include_table:
+                print("constexpr %s SQR8_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in sqr_table(f, bits, 8)])))
+            sqr8 = "&SQR8_TABLE_%s" % table_id
+        if ((bits - 1) >= 32):
+            if include_table:
+                print("constexpr %s SQR16_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in sqr_table(f, bits, 16)])))
+            sqr16 = "&SQR16_TABLE_%s" % table_id
+    if include_table:
+        print("constexpr %s QRT_TABLE_%s({%s});" % (rtyp, table_id, ", ".join([fmt(x,typ) for x in qrt_table(F, f, bits)])))
+
+    modulus_weight = modulus.hamming_weight()
+    modulus_degree = (modulus - p**bits).degree()
+    modulus_int = (modulus - p**bits).change_ring(ZZ)(2)
+
+    lfsr = ""
+
+    if style == INT:
+        print("typedef Field<%s, %i, %i, %s, %s, &SQR_TABLE_%s, &QRT_TABLE_%s%s> Field%i;" % (typ, bits, modulus_int, rtyp, ttyp, table_id, table_id, lfsr, bits))
+    elif style == CLMUL:
+        print("typedef Field<%s, %i, %i, %s, &SQR_TABLE_%s, %s, %s, %s, %s, &QRT_TABLE_%s, %s, %s, %s%s> Field%i;" % (typ, bits, modulus_int, rtyp, table_id, sqr2, sqr4, sqr8, sqr16, table_id, ctyp, loadtbl, savetbl, lfsr, bits))
+    elif style == CLMUL_TRI:
+        print("typedef FieldTri<%s, %i, %i, %s, &SQR_TABLE_%s, %s, %s, %s, %s, &QRT_TABLE_%s, %s, %s, %s> FieldTri%i;" % (typ, bits, modulus_degree, rtyp, table_id, sqr2, sqr4, sqr8, sqr16, table_id, ctyp, loadtbl, savetbl, bits))
+    else:
+        assert(False)
+
+for bits in range(2, 65):
+    print("#ifdef ENABLE_FIELD_INT_%i" % bits)
+    print("// %i bit field" % bits)
+    print_result(bits, INT)
+    print("#endif")
+    print("")
+
+for bits in range(2, 65):
+    print("#ifdef ENABLE_FIELD_INT_%i" % bits)
+    print("// %i bit field" % bits)
+    print_result(bits, CLMUL)
+    print_result(bits, CLMUL_TRI)
+    print("#endif")
+    print("")
+
+for bits in range(2, 65):
+    print_result(bits, MD)
diff --git a/src/minisketch/doc/log2_factorial.sage b/src/minisketch/doc/log2_factorial.sage
new file mode 100644
index 0000000000..afc6d66c57
--- /dev/null
+++ b/src/minisketch/doc/log2_factorial.sage
@@ -0,0 +1,85 @@
+import bisect
+
+INPUT_BITS = 32
+TABLE_BITS = 5
+INT_BITS = 64
+EXACT_FPBITS = 256
+
+F = RealField(100) # overkill
+
+def BestOverApproxInvLog2(mulof, maxd):
+    """
+    Compute denominator of an approximation of 1/log(2).
+
+    Specifically, find the value of d (<= maxd, and a multiple of mulof)
+    such that ceil(d/log(2))/d is the best approximation of 1/log(2).
+    """
+    dist=1
+    best=0
+    # Precomputed denominators that lead to good approximations of 1/log(2)
+    for d in [1, 2, 9, 70, 131, 192, 445, 1588, 4319, 11369, 18419, 25469, 287209, 836158, 3057423, 8336111, 21950910, 35565709, 49180508, 161156323, 273132138, 385107953, 882191721]:
+        kd = lcm(mulof, d)
+        if kd <= maxd:
+            n = ceil(kd / log(2))
+            dis = F((n / kd) - 1 / log(2))
+            if dis < dist:
+                dist = dis
+                best = kd
+    return best
+
+
+LOG2_TABLE = []
+A = 0
+B = 0
+C = 0
+D = 0
+K = 0
+
+def Setup(k):
+    global LOG2_TABLE, A, B, C, D, K
+    K = k
+    LOG2_TABLE = []
+    for i in range(2 ** TABLE_BITS):
+        LOG2_TABLE.append(int(floor(F(K * log(1 + i / 2**TABLE_BITS, 2)))))
+
+    # Maximum for (2*x+1)*LogK2(x)
+    max_T = (2^(INPUT_BITS + 1) - 1) * (INPUT_BITS*K - 1)
+    # Maximum for A
+    max_A = (2^INT_BITS - 1) // max_T
+    D = BestOverApproxInvLog2(2 * K, max_A * 2 * K)
+    A = D // (2 * K)
+    B = int(ceil(F(D/log(2))))
+    C = int(floor(F(D*log(2*pi,2)/2)))
+
+def LogK2(n):
+    assert(n >= 1 and n < (1 << INPUT_BITS))
+    bits = Integer(n).nbits()
+    return K * (bits - 1) + LOG2_TABLE[((n << (INPUT_BITS - bits)) >> (INPUT_BITS - TABLE_BITS - 1)) - 2**TABLE_BITS]
+
+def Log2Fact(n):
+    # Use formula (A*(2*x+1)*LogK2(x) - B*x + C) / D
+    return (A*(2*n+1)*LogK2(n) - B*n + C) // D + (n < 3)
+
+RES = [int(F(log(factorial(i),2))) for i in range(EXACT_FPBITS * 10)]
+
+best_worst_ratio = 0
+
+for K in range(1, 10000):
+    Setup(K)
+    assert(LogK2(1) == 0)
+    assert(LogK2(2) == K)
+    assert(LogK2(4) == 2 * K)
+    good = True
+    worst_ratio = 1
+    for i in range(1, EXACT_FPBITS * 10):
+        exact = RES[i]
+        approx = Log2Fact(i)
+        if not (approx <= exact and ((approx == exact) or (approx >= EXACT_FPBITS and exact >= EXACT_FPBITS))):
+            good = False
+            break
+        if worst_ratio * exact > approx:
+            worst_ratio = approx / exact
+    if good and worst_ratio > best_worst_ratio:
+        best_worst_ratio = worst_ratio
+        print("Formula: (%i*(2*x+1)*floor(%i*log2(x)) - %i*x + %i) / %i; log(max_ratio)=%f" % (A, K, B, C, D, RR(-log(worst_ratio))))
+        print("LOG2K_TABLE: %r" % LOG2_TABLE)
diff --git a/src/minisketch/doc/math.md b/src/minisketch/doc/math.md
new file mode 100644
index 0000000000..cf46f193ab
--- /dev/null
+++ b/src/minisketch/doc/math.md
@@ -0,0 +1,117 @@
+# The mathematics of Minisketch sketches
+
+This is an unconventional mathematical overview of the PinSketch algorithm without references to coding theory<sup>[[1]](#myfootnote1)</sup>.
+
+## Set sketches
+
+A sketch, for the purpose of this description, can be seen as a "set checksum" with two peculiar properties:
+
+* Sketches have a predetermined capacity, and when the number of elements in the set is not higher than the capacity, minisketch will always recover the entire set from the sketch. A sketch of *b*-bit elements with capacity *c* can be stored in *bc* bits.
+* The sketches of two sets can be combined by adding them (XOR) to obtain a sketch of the [symmetric difference](https://en.wikipedia.org/wiki/Symmetric_difference) between the two sets (*i.e.*, all elements that occur in one but not both input sets).
+
+This overview explains how sets can be converted into a sketch and how a set can be recovered from a sketch. 
+
+## From field elements to sketches
+
+**Data entries as field elements**
+
+Every integer in the range *[1...2<sup>b</sup>-1]* (the acceptable data elements for a Minisketch sketch with field size *b*) can be mapped to a nonzero field element of *GF(2<sup>b</sup>)*. In this [finite field](https://en.wikipedia.org/wiki/Finite_field), we can add and multiply elements together, with many of the expected properties for those operations. Addition (and subtraction!) of field elements corresponds to bitwise XOR of the integers they correspond to, though multiplication is more involved.
+
+**Sets as power series**
+
+We define a function *S* which maps field elements *m* to the following [formal power series](https://en.wikipedia.org/wiki/Formal_power_series) (similar to a polynomial, except there can be an infinite number of terms, and we don't care about concepts like convergence as we're never going to actually evaluate it for a specific value of *x*):
+
+* *S(m) = 1 + mx + m<sup>2</sup>x<sup>2</sup> + m<sup>3</sup>x<sup>3</sup> + ...*.
+
+We then extend this function to operate on sets of field elements, by adding together the images of every set element. If *M = {m<sub>1</sub>, m<sub>2</sub>, ... }*:
+
+* *S(M) = S({m<sub>1</sub>,m<sub>2</sub>,...}) = S(m<sub>1</sub>) + S(m<sub>2</sub>) + ... = (1 + 1 + ...) + (m<sub>1</sub> + m<sub>2</sub> + ...)x + (m<sub>1</sub><sup>2</sup> + m<sub>2</sub><sup>2</sup> + ...)x<sup>2</sup> + (m<sub>1</sub><sup>3</sup> + m<sub>2</sub><sup>3</sup> + ...)x<sup>3</sup> + ...*
+
+Because in our field addition corresponds to XOR of integers, it holds for every *a* that *a + a = 0*. This carries over to the *S* function, meaning that *S(a) + S(a) = 0* for every *a*. This means that the coefficients of these power series have the second of the properties we
+desire from a sketch, namely that an efficient operation exists to
+combine two sketches such that the result is a sketch of the symmetric
+difference of the sets. It holds that
+*S({m<sub>1</sub>,m<sub>2</sub>}) + S({m<sub>2</sub>,m<sub>3</sub>}) = S(m<sub>1</sub>) + (S(m<sub>2</sub>) + S(m<sub>2</sub>)) + S(m<sub>3</sub>) = S(m<sub>1</sub>) + S(m<sub>3</sub>) = S({m<sub>1</sub>,m<sub>3</sub>})*. The question is whether we can also efficiently recover the elements from their power series' coefficients.
+
+**An infinity of coefficients is hard**
+
+To make reasoning about these power series easier, notice that the series for a single element is in fact a [geometric series](https://en.wikipedia.org/wiki/Geometric_series). If we were working over real numbers rather than a finite field and *|mx| < 1*, it would converge to *(1 - mx)<sup>-1</sup>*. Convergence has no meaning in formal power series, however it is still the case that:
+
+* *(1 - mx) S(m) = 1*
+
+You can verify this by seeing that every coefficient except the constant one gets cancelled out by the multiplication. This can be generalized to the series for multiple set elements. For two elements we have:
+
+* *(1 - m<sub>1</sub>x) (1 - m<sub>2</sub>x) S({m<sub>1</sub>,m<sub>2</sub>}) = (1 - m<sub>1</sub>x) (1 - m<sub>2</sub>x) (S(m<sub>1</sub>) + S(m<sub>2</sub>)) = (1 - m<sub>2</sub>x) + (1 - m<sub>1</sub>x)*
+
+And for three:
+
+* *(1 - m<sub>1</sub>x) (1 - m<sub>2</sub>x) (1 - m<sub>3</sub>x) S({m<sub>1</sub>,m<sub>2</sub>,m<sub>3</sub>}) = (1 - m<sub>1</sub>x) (1 - m<sub>2</sub>x) (1 - m<sub>3</sub>x) (S(m<sub>1</sub>) + S(m<sub>2</sub>) + S(m<sub>3</sub>)) = (1 - m<sub>2</sub>x)(1 - m<sub>3</sub>x) + (1 - m<sub>1</sub>x)(1 - m<sub>3</sub>x) + (1 - m<sub>1</sub>x)(1 - m<sub>2</sub>x)*
+
+In each case, we notice that multiplying *S(M)* with *(1 - m<sub>i</sub>x)* for each element *m<sub>i</sub> &isin; M* results in a polynomial of degree *n-1*.
+
+**Solving for the set elements**
+
+The above insight lets us build a solver that extracts the set elements from the coefficients of a power series. If we can find a polynomial *L* that is the product of *n* different *(1 - m<sub>i</sub>x)* factors for various values of *m<sub>i</sub>*, such that *P = S(M)L* is an *n-1* degree polynomial, then those values *m<sub>i</sub>* are the elements of *M*.
+
+The coefficients of *P* are nontrivial expressions of the set elements themselves. However, we can just focus on the coefficients of degree *n* and higher in *P*, as those are all 0. Let *s<sub>i</sub>* be the coefficients of *S(M)*, and *l<sub>i</sub>* the coefficients of L. In other words, *S(M) = s<sub>0</sub> + s<sub>1</sub>x + s<sub>2</sub>x<sup>2</sup> + s<sub>3</sub>x<sup>3</sup> + ...* and *L = l<sub>0</sub> + l<sub>1</sub>x + l<sub>2</sub>x<sup>2</sup> + l<sub>3</sub>x<sup>3</sup> + ... + l<sub>n</sub>x<sup>n</sup>*. Note that *l<sub>0</sub> = 1*, as it is the product of all the *1* terms in the *(1 - m<sub>i</sub>x)* factors.
+
+Here are the equations for the coefficients of *S(M)L* of degree *n+1* through *2n*:
+* *s<sub>n+1</sub> + s<sub>n+0</sub>l<sub>1</sub> + s<sub>n-1</sub>l<sub>2</sub> + s<sub>n-2</sub>l<sub>3</sub> + ... + s<sub>1</sub>l<sub>n</sub> = 0*
+* *s<sub>n+2</sub> + s<sub>n+1</sub>l<sub>1</sub> + s<sub>n+0</sub>l<sub>2</sub> + s<sub>n-1</sub>l<sub>3</sub> + ... + s<sub>2</sub>l<sub>n</sub> = 0*
+* *s<sub>n+3</sub> + s<sub>n+2</sub>l<sub>1</sub> + s<sub>n+1</sub>l<sub>2</sub> + s<sub>n+0</sub>l<sub>3</sub> + ... + s<sub>3</sub>l<sub>n</sub> = 0*
+* ...
+* *s<sub>2n</sub> + s<sub>2n-1</sub>l<sub>1</sub> + s<sub>2n-2</sub>l<sub>2</sub> + s<sub>2n-3</sub>l<sub>3</sub> + ... + s<sub>n</sub>l<sub>n</sub> = 0*
+
+These are *n* linear equations with *n* unknowns (the *l<sub>i</sub>*
+values, for *i=1..n*), which can be solved using [Gaussian elimination](https://en.wikipedia.org/wiki/Gaussian_elimination). After doing so,
+we have the coefficients of *L*, which can then be [factored](https://en.wikipedia.org/wiki/Factorization_of_polynomials_over_finite_fields)
+into first degree factors of the form *(1 - m<sub>i</sub>x)*. The resulting *m* values are our set elements.
+
+**Putting it all together**
+
+Interestingly, only *2n* coefficients of *S(M)* were needed for solving
+the set of equations above. This means we have our answer: the
+coefficients *1* through *2n* of *S(M)*, or the list
+*[m<sub>1</sub> + m<sub>2</sub> + ..., m<sub>1</sub><sup>2</sup> + m<sub>2</sub><sup>2</sup> + ..., ..., m<sub>1</sub><sup>2n</sup> + m<sub>2</sub><sup>2n</sup> + ...]*
+functions as a sketch, satisfying the two properties we want:
+
+* Sketches can be combined to form the sketch of their symmetric difference, by simply pairwise adding the list elements together.
+* With *2n* list elements we can efficiently recover *n* elements from a sketch.
+
+**Capacity and difference**
+
+The approach above only works when the number of elements *n* in the sketch is known. Of course we want to support cases where only an upper bound on the number of elements in the sketch is known, the capacity *c*. Given that we can reconstruct a set of size *c* from a sketch with *2c* terms, we should be able to reconstruct a set of size *n* too as long as *n &le; c*. This is simply a matter of trying to solve the above set of equations assuming values of *n* that count down from *c* until a solution is found for one. This is known as the [Peterson-Gorenstein-Zierler algorithm](https://en.wikipedia.org/wiki/BCH_code#Peterson%E2%80%93Gorenstein%E2%80%93Zierler_algorithm).
+
+## Optimizations
+
+**Halving the sketch size**
+
+We can in fact only include the odd terms in the sketch, and reconstruct the even ones before solving the equation to find *L*. This means the size of a sketch becomes just *c* field elements, the same size as would be needed to send its contents naively.
+
+To see how this is possible, we need the [Frobenius endomorphism](https://en.wikipedia.org/wiki/Frobenius_endomorphism), which in short states that in fields where *x + x = 0* it holds that *(x + y)<sup>2</sup> = x<sup>2</sup> + y<sup>2</sup>* for every *x* and *y* (the dream of every high school math student!). This means that:
+
+* *s<sub>2</sub> = m<sub>1</sub><sup>2</sup> + m<sub>2</sub><sup>2</sup> + ... = (m<sub>1</sub> + m<sub>2</sub> + ...)<sup>2</sup> = s<sub>1</sub><sup>2</sup>*.
+* *s<sub>4</sub> = m<sub>1</sub><sup>4</sup> + m<sub>2</sub><sup>4</sup> + ... = (m<sub>1</sub><sup>2</sup> + m<sub>2</sub><sup>2</sup> + ...)<sup>2</sup> = s<sub>2</sub><sup>2</sup>*.
+* *s<sub>6</sub> = m<sub>1</sub><sup>6</sup> + m<sub>2</sub><sup>6</sup> + ... = (m<sub>1</sub><sup>3</sup> + m<sub>2</sub><sup>3</sup> + ...)<sup>2</sup> = s<sub>3</sub><sup>2</sup>*.
+* ...
+
+In other words, we only need to send *s<sub>1</sub>, s<sub>3</sub>, s<sub>5</sub>, ..., s<sub>2n-1</sub>* to recover all *2n* *s<sub>i</sub>* values, and proceed with reconstruction.
+
+**Quadratic performance rather than cubic**
+
+Using Gaussian elimination to solve the set of equations above for the *l<sub>i</sub>* values requires *O(n<sup>3</sup>)* field operations. However, due to the special structure in the equations (look at the repeated *s<sub>i</sub>* values), it can be solved in *O(n<sup>2</sup>)* time using a number of techniques, including the [Berlekamp-Massey algorithm](https://en.wikipedia.org/wiki/Berlekamp%E2%80%93Massey_algorithm) (BM).
+
+**Roots instead of factorization**
+
+As explained above, the polynomial *L* can be factored into *(1 - m<sub>i</sub>x)* factors, where the values *m<sub>i</sub>* are the set elements. However, since we know that a decodable sketch must result in a polynomial that is fully factorizable into degree-*1* factors, we can instead use a more efficient root-finding algorithm rather than a factorization algorithm. As the root of each *(1 - m<sub>i</sub>x)* factor is *m<sub>i</sub><sup>-1</sup>*, we conclude that the set elements are in fact the inverses of the roots of *L*.
+
+**Avoiding inversions**
+
+As inversions are a relatively expensive operation, it would be useful to avoid them.
+
+Say that we're trying to find the inverses of the roots of *L = 1 + l<sub>1</sub>x + l<sub>2</sub>x<sup>2</sup> + ... + l<sub>n</sub>x<sup>n</sup>*, then we're really interested in the solutions *y* for *1 + l<sub>1</sub>y<sup>-1</sup> + l<sub>2</sub>y<sup>-2</sup> + ... + l<sub>n</sub>y<sup>-n</sup> = 0*. By multiplying both sides of the equation by *y<sup>n</sup>*, we find *l<sub>n</sub> + l<sub>n-1</sub>y + l<sub>n-2</sub>y<sup>2</sup> + ... + y<sup>n</sup> = 0*.
+
+In other words, we can find the inverses of the roots of *L* by instead finding the roots of the polynomial with the coefficients of *L* in reverse order.
+
+* <a name="myfootnote1">[1]</a> For those familiar with coding theory: PinSketch communicates a set difference by encoding the set members as errors in a binary [BCH](https://en.wikipedia.org/wiki/BCH_code) codeword 2<sup>bits</sup> in size and sends the syndromes.
+  The linearity of the syndromes provides all the properties needed for a sketch. Sketch decoding is simply finding the error locations. Decode is much faster than an ordinary BCH decoder for such a large codeword because the need to take a discrete log is avoided by storing the set in the roots directly instead of in an exponent (logically permuting the bits of the codeword).
diff --git a/src/minisketch/doc/minisketch-vs.png b/src/minisketch/doc/minisketch-vs.png
new file mode 100644
index 0000000000..aed810de8a
--- /dev/null
+++ b/src/minisketch/doc/minisketch-vs.png
diff --git a/src/minisketch/doc/moduli.md b/src/minisketch/doc/moduli.md
new file mode 100644
index 0000000000..379ac481b3
--- /dev/null
+++ b/src/minisketch/doc/moduli.md
@@ -0,0 +1,65 @@
+These are the irreducible polynomials over *GF(2)* used to represent field elements:
+
+* *x<sup>2</sup> + x + 1*
+* *x<sup>3</sup> + x + 1*
+* *x<sup>4</sup> + x + 1*
+* *x<sup>5</sup> + x<sup>2</sup> + 1*
+* *x<sup>6</sup> + x + 1*
+* *x<sup>7</sup> + x + 1*
+* *x<sup>8</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>9</sup> + x + 1*
+* *x<sup>10</sup> + x<sup>3</sup> + 1*
+* *x<sup>11</sup> + x<sup>2</sup> + 1*
+* *x<sup>12</sup> + x<sup>3</sup> + 1*
+* *x<sup>13</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>14</sup> + x<sup>5</sup> + 1*
+* *x<sup>15</sup> + x + 1*
+* *x<sup>16</sup> + x<sup>5</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>17</sup> + x<sup>3</sup> + 1*
+* *x<sup>18</sup> + x<sup>3</sup> + 1*
+* *x<sup>19</sup> + x<sup>5</sup> + x<sup>2</sup> + x + 1*
+* *x<sup>20</sup> + x<sup>3</sup> + 1*
+* *x<sup>21</sup> + x<sup>2</sup> + 1*
+* *x<sup>22</sup> + x + 1*
+* *x<sup>23</sup> + x<sup>5</sup> + 1*
+* *x<sup>24</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>25</sup> + x<sup>3</sup> + 1*
+* *x<sup>26</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>27</sup> + x<sup>5</sup> + x<sup>2</sup> + x + 1*
+* *x<sup>28</sup> + x + 1*
+* *x<sup>29</sup> + x<sup>2</sup> + 1*
+* *x<sup>30</sup> + x + 1*
+* *x<sup>31</sup> + x<sup>3</sup> + 1*
+* *x<sup>32</sup> + x<sup>7</sup> + x<sup>3</sup> + x<sup>2</sup> + 1*
+* *x<sup>33</sup> + x<sup>10</sup> + 1*
+* *x<sup>34</sup> + x<sup>7</sup> + 1*
+* *x<sup>35</sup> + x<sup>2</sup> + 1*
+* *x<sup>36</sup> + x<sup>9</sup> + 1*
+* *x<sup>37</sup> + x<sup>6</sup> + x<sup>4</sup> + x + 1*
+* *x<sup>38</sup> + x<sup>6</sup> + x<sup>5</sup> + x + 1*
+* *x<sup>39</sup> + x<sup>4</sup> + 1*
+* *x<sup>40</sup> + x<sup>5</sup> + x<sup>4</sup> + x<sup>3</sup> + 1*
+* *x<sup>41</sup> + x<sup>3</sup> + 1*
+* *x<sup>42</sup> + x<sup>7</sup> + 1*
+* *x<sup>43</sup> + x<sup>6</sup> + x<sup>4</sup> + x<sup>3</sup> + 1*
+* *x<sup>44</sup> + x<sup>5</sup> + 1*
+* *x<sup>45</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>46</sup> + x + 1*
+* *x<sup>47</sup> + x<sup>5</sup> + 1*
+* *x<sup>48</sup> + x<sup>5</sup> + x<sup>3</sup> + x<sup>2</sup> + 1*
+* *x<sup>49</sup> + x<sup>9</sup> + 1*
+* *x<sup>50</sup> + x<sup>4</sup> + x<sup>3</sup> + x<sup>2</sup> + 1*
+* *x<sup>51</sup> + x<sup>6</sup> + x<sup>3</sup> + x + 1*
+* *x<sup>52</sup> + x<sup>3</sup> + 1*
+* *x<sup>53</sup> + x<sup>6</sup> + x<sup>2</sup> + x + 1*
+* *x<sup>54</sup> + x<sup>9</sup> + 1*
+* *x<sup>55</sup> + x<sup>7</sup> + 1*
+* *x<sup>56</sup> + x<sup>7</sup> + x<sup>4</sup> + x<sup>2</sup> + 1*
+* *x<sup>57</sup> + x<sup>4</sup> + 1*
+* *x<sup>58</sup> + x<sup>19</sup> + 1*
+* *x<sup>59</sup> + x<sup>7</sup> + x<sup>4</sup> + x<sup>2</sup> + 1*
+* *x<sup>60</sup> + x + 1*
+* *x<sup>61</sup> + x<sup>5</sup> + x<sup>2</sup> + x + 1*
+* *x<sup>62</sup> + x<sup>29</sup> + 1*
+* *x<sup>63</sup> + x + 1*
+* *x<sup>64</sup> + x<sup>4</sup> + x<sup>3</sup> + x + 1*
diff --git a/src/minisketch/doc/plot_bits.png b/src/minisketch/doc/plot_bits.png
new file mode 100644
index 0000000000..6e907d6b20
--- /dev/null
+++ b/src/minisketch/doc/plot_bits.png
diff --git a/src/minisketch/doc/plot_capacity.png b/src/minisketch/doc/plot_capacity.png
new file mode 100644
index 0000000000..b4f760da36
--- /dev/null
+++ b/src/minisketch/doc/plot_capacity.png
diff --git a/src/minisketch/doc/plot_diff.png b/src/minisketch/doc/plot_diff.png
new file mode 100644
index 0000000000..08ab6a86b9
--- /dev/null
+++ b/src/minisketch/doc/plot_diff.png
diff --git a/src/minisketch/doc/plot_size.png b/src/minisketch/doc/plot_size.png
new file mode 100644
index 0000000000..b21921776a
--- /dev/null
+++ b/src/minisketch/doc/plot_size.png
diff --git a/src/minisketch/doc/protocoltips.md b/src/minisketch/doc/protocoltips.md
new file mode 100644
index 0000000000..610407ebc2
--- /dev/null
+++ b/src/minisketch/doc/protocoltips.md
@@ -0,0 +1,30 @@
+# Tips for designing protocols using `libminisketch`
+
+Sending a sketch is less efficient than just sending your whole set with efficient entropy coding if the number of differences is larger than *log<sub>2</sub>( 2<sup>b</sup> choose set_size ) / b*.
+
+In most applications your set can be hashed to entries just large enough to make the probability of collision negligible. This can provide a considerable speedup and bandwidth savings. Short hashes (<128 bits) should be salted with an unpredictable value to prevent malicious inputs from intentionally causing collisions. Salting also allows an entry missed due to a collision to be reconciled on a later run with a different salt. Pre-hashing may not be possible in some applications, such as where there is only one-way communication, where the confidentiality of entry origin matters, or where security depends on the total absence of collisions.
+
+Some element sizes are faster to decode than others; see the benchmarks in the readme.
+
+Almost all the computational burden of reconciliation is in `minisketch_decode()`. Denial-of-service attacks can be mitigated by arranging the protocol flow so that a party requests a sketch and then decodes it, rather than using a construction in which participants decode unsolicited sketches. Decode times can be constrained by limiting sketch capacity or via the `max_count` argument to `minisketch_decode()`.
+
+In most cases you don't actually know the size of the set difference in advance, but often you know a lower bound on it (the difference in set sizes).
+
+* There are difference size estimation techniques such as min-wise hashing<sup>[[1]](#myfootnote1)</sup> or random projections<sup>[[2]](#myfootnote2)</sup>, but complex estimators can end up using more bandwidth than they save.
+
+* It may be useful to always overestimate the sketch size needed to amortize communications overheads (*e.g.* packet headers, round trip delays).
+
+* If the total data sent would end up exceeding the cost of simply sending the whole set (per the threshold above), then you can send the set in response to a failure but leave out as many elements as the size of the previously sent sketch. The receiver can decode the partial set and use the data they already have to complete it, reducing bandwidth waste.
+
+* Additional sketch elements can be sent as few as one at a time, with little decode cost, until enough data is received to decode. This is most easily implemented by always computing the largest sketch size and sending it incrementally as needed.
+
+* Because sketches are linear you can adaptively subdivide to decode an overfull set. The sender uses a hash function to select approximately half their set members and sends a sketch of those members. The receiver can do the same and combine the result with the initially sent sketch to get two sketches with roughly half the number of members and attempt to decode them. Repeat recursively on failure. This adaptive subdivision procedure makes decode time essentially linear at the cost of communications inefficiency.  Minisketches can also be used as the cells in an IBLT for similar reasons.
+
+Less efficient reconciliation techniques like IBLT or adaptive subdivision, or overheads like complex estimators, effectively lower the threshold at which sending the whole set efficiently would use less bandwidth.
+
+When the number of differences is more than 2<sup>b/2-1</sup> an alternative sketch encoding is possible that is somewhat smaller, but requires a table of size 2<sup>b</sup>; contact the authors if you have an application where that might be useful.
+
+## References
+
+* <a name="myfootnote1">[1]</a> Broder, A. *On the Resemblance and Containment of Documents* Proceedings of the Compression and Complexity of Sequences 1997 [[PDF]](https://www.cs.princeton.edu/courses/archive/spring13/cos598C/broder97resemblance.pdf)
+* <a name="myfootnote2">[2]</a> Feigenbaum, Joan and Kannan, Sampath and Strauss, Martin J. and Viswanathan, Mahesh. *An Approximate L1-Difference Algorithm for Massive Data Streams* SIAM J. Comput. 2003 [[PDF]](http://www.cs.yale.edu/homes/jf/FKSV1.pdf)