/// <summary>
/// Multiplies this value by <paramref name="other"/> with a pair of 4-bit lookup tables
/// (modified Lopez-Dahab method) and hands the raw product to <c>ReduceResult</c>.
/// </summary>
/// <param name="other">The second operand.</param>
/// <param name="m">Forwarded unchanged to <c>ReduceResult</c>.</param>
/// <param name="ks">Reduction coefficients, forwarded unchanged to <c>ReduceResult</c>.</param>
/// <returns>
/// The value produced by <c>ReduceResult</c>. On the shortcut paths no new object is built:
/// an operand whose degree is 0 is returned as-is, and when the smaller operand is the single
/// word 1 the larger operand is returned as-is.
/// </returns>
public LongArray ModMultiply(LongArray other, int m, int[] ks)
{
    // Zero cases: a degree of 0 means the product is that operand itself.
    int thisDeg = Degree();
    if (thisDeg == 0)
    {
        return this;
    }

    int otherDeg = other.Degree();
    if (otherDeg == 0)
    {
        return other;
    }

    // Order the operands so that 'small' never has the larger degree (ties keep 'this' as small).
    bool keepOrder = thisDeg <= otherDeg;
    LongArray small = keepOrder ? this : other;
    LongArray large = keepOrder ? other : this;
    int smallDeg = keepOrder ? thisDeg : otherDeg;
    int largeDeg = keepOrder ? otherDeg : thisDeg;

    // Lengths, in 64-bit words, of both operands and of the raw (unreduced) product.
    int smallLen = (int)((uint)(smallDeg + 63) >> 6);
    int largeLen = (int)((uint)(largeDeg + 63) >> 6);
    int productLen = (int)((uint)(smallDeg + largeDeg + 62) >> 6);

    if (smallLen == 1)
    {
        long onlyWord = small.m_ints[0];
        if (onlyWord == 1L)
        {
            // Multiplying by one: the larger operand is already the answer.
            return large;
        }

        // Single-word fast path; the cost depends only on the number of set bits in the word.
        long[] raw = new long[productLen];
        MultiplyWord(onlyWord, large.m_ints, largeLen, raw, 0);
        return ReduceResult(raw, 0, productLen, m, ks);
    }

    // Words needed per table entry: the larger operand may grow by up to 7 bits while shifted.
    int slot = (int)((uint)(largeDeg + 7 + 63) >> 6);

    // slotOffset[n] is where the entry for the 4-bit value n starts; entry 0 stays all-zero.
    int[] slotOffset = new int[16];

    // First table: the larger operand times every 4-bit value.
    long[] lowTable = new long[slot << 4];
    slotOffset[1] = slot;
    Array.Copy(large.m_ints, 0, lowTable, slot, largeLen);
    for (int n = 2; n < 16; ++n)
    {
        int dst = n * slot;
        slotOffset[n] = dst;

        if ((n & 1) != 0)
        {
            // Odd n: combine entry 1 with entry n - 1.
            Add(lowTable, slot, lowTable, dst - slot, lowTable, dst, slot);
        }
        else
        {
            // Even n: entry n / 2 shifted up by one bit.
            ShiftUp(lowTable, slotOffset[n >> 1], lowTable, dst, slot, 1);
        }
    }

    // Second table: the whole first table shifted up by 4 bits, serving the high nibble of a byte.
    long[] highTable = new long[lowTable.Length];
    ShiftUp(lowTable, 0, highTable, 0, lowTable.Length, 4);

    long[] multiplier = small.m_ints;

    // Eight rows of productLen words; row r collects the contribution of byte r of every word.
    long[] rows = new long[productLen << 3];

    const int nibbleMask = 0xF;

    /*
     * Lopez-Dahab (Modified) algorithm: consume each multiplier word one byte at a time, looking
     * up the low nibble in the first table and the high nibble in the second.
     */
    for (int wordPos = 0; wordPos < smallLen; ++wordPos)
    {
        ulong bits = (ulong)multiplier[wordPos];

        // Each step moves to the next row; stop as soon as no set bits remain in the word.
        for (int rowOff = wordPos; ; rowOff += productLen)
        {
            int lo = (int)bits & nibbleMask;
            bits >>= 4;
            int hi = (int)bits & nibbleMask;
            bits >>= 4;

            AddBoth(rows, rowOff, lowTable, slotOffset[lo], highTable, slotOffset[hi], slot);

            if (bits == 0UL)
            {
                break;
            }
        }
    }

    // Fold the rows from the top down: each row is combined, shifted by 8 bits, into the row below.
    for (int src = rows.Length - productLen; src != 0; src -= productLen)
    {
        AddShiftedUp(rows, src - productLen, rows, src, productLen, 8);
    }

    // Row 0 now holds the raw answer; reduce it against the reduction coefficients.
    return ReduceResult(rows, 0, productLen, m, ks);
}
/// <summary>
/// Alternative multiplication of this value by <paramref name="other"/> using an interleaved
/// window algorithm over a single scratch buffer; the raw product is handed to
/// <c>ReduceResult</c>.
/// </summary>
/// <param name="other">The second operand.</param>
/// <param name="m">Forwarded unchanged to <c>ReduceResult</c>.</param>
/// <param name="ks">Reduction coefficients, forwarded unchanged to <c>ReduceResult</c>.</param>
/// <returns>
/// The value produced by <c>ReduceResult</c>. On the shortcut paths no new object is built:
/// an operand whose degree is 0 is returned as-is, and when the smaller operand is the single
/// word 1 the larger operand is returned as-is.
/// </returns>
public LongArray ModMultiplyAlt(LongArray other, int m, int[] ks)
{
    // Zero cases: a degree of 0 means the product is that operand itself.
    int thisDeg = Degree();
    if (thisDeg == 0)
    {
        return this;
    }

    int otherDeg = other.Degree();
    if (otherDeg == 0)
    {
        return other;
    }

    // 'narrow' is the operand of smaller degree (ties keep 'this' as narrow).
    LongArray narrow, wide;
    int narrowDeg, wideDeg;
    if (thisDeg > otherDeg)
    {
        narrow = other;
        narrowDeg = otherDeg;
        wide = this;
        wideDeg = thisDeg;
    }
    else
    {
        narrow = this;
        narrowDeg = thisDeg;
        wide = other;
        wideDeg = otherDeg;
    }

    // Lengths, in 64-bit words, of both operands and of the raw (unreduced) product.
    int narrowLen = (int)((uint)(narrowDeg + 63) >> 6);
    int wideLen = (int)((uint)(wideDeg + 63) >> 6);
    int productLen = (int)((uint)(narrowDeg + wideDeg + 62) >> 6);

    if (narrowLen == 1)
    {
        long onlyWord = narrow.m_ints[0];
        if (onlyWord == 1L)
        {
            // Multiplying by one: the larger operand is already the answer.
            return wide;
        }

        // Single-word fast path; the cost depends only on the number of set bits in the word.
        long[] raw = new long[productLen];
        MultiplyWord(onlyWord, wide.m_ints, wideLen, raw, 0);
        return ReduceResult(raw, 0, productLen, m, ks);
    }

    // NOTE: A common-multiplicand special case for a two-word narrow operand (saving ~1/4 of the
    // adds) works, but is slower than width 4 processing, so it is not used.

    /*
     * Parameters of the interleaved window algorithm: the 'width' in bits processed together, the
     * number of evaluation 'positions' implied by that width, the 'top' position at which the
     * regular window algorithm stops, and the number of 'banks' (shifted copies of the wide
     * operand).
     *
     * NOTE: width 4 is the fastest over the entire range of sizes used in current crypto.
     * Other settings are:
     *
     *   width  positions  top  banks
     *     1       64       64    4
     *     2       32       64    4
     *     3       21       63    3
     *     5       13       65    7
     *     7        9       63    9
     *     8        8       64    8
     */
    int width = 4, positions = 16, top = 64, banks = 8;

    // How far the wide operand is shifted in total, and so how many words one bank needs.
    int shifts = positions;
    if (top >= 64)
    {
        --shifts;
    }
    int bankLen = (int)((uint)(wideDeg + shifts + 63) >> 6);
    int allBanksLen = bankLen * banks;
    int stride = width * banks;

    /*
     * One scratch buffer holds everything. Layout, with offsets[] as the index:
     *   [0, narrowLen)            the narrow operand in interleaved form
     *   offsets[0]                start of the banks (working copies of the wide operand)
     *   offsets[1]                the result area, productLen words
     *   offsets[2 .. 2^width - 1] one productLen-word accumulator per remaining window value
     */
    int[] offsets = new int[1 << width];
    offsets[0] = narrowLen;
    offsets[1] = narrowLen + allBanksLen;
    for (int i = 2; i < offsets.Length; ++i)
    {
        offsets[i] = offsets[i - 1] + productLen;
    }

    // NOTE: The extra word is a safe dump for "high zeroes", since adds use 'bankLen' words and
    // not the exact length of the wide operand.
    long[] buf = new long[offsets[offsets.Length - 1] + productLen + 1];

    // Prepare the narrow operand in interleaved form, according to the chosen width.
    Interleave(narrow.m_ints, 0, buf, 0, narrowLen, width);

    // Working copies of the wide operand: bank 0 is a plain copy, bank n is bank 0 shifted up n bits.
    Array.Copy(wide.m_ints, 0, buf, narrowLen, wideLen);
    for (int bank = 1; bank < banks; ++bank)
    {
        ShiftUp(buf, narrowLen, buf, narrowLen + bank * bankLen, bankLen, bank);
    }

    /*
     * Main loop, breadth-first: for the current bit position, look at one window per bank in every
     * interleaved word, and for each non-zero window add that bank into the accumulator selected by
     * the window value. Because the narrow operand is interleaved, the bits of a window stand for
     * the current wide operand shifted by 0, 'positions', 'positions' * 2, ...,
     * 'positions' * ('width' - 1).
     */
    int windowMask = (1 << width) - 1;
    int bitPos = 0;
    while (true)
    {
        // narrowLen is at least 2 here, so this always runs at least once.
        for (int wordPos = 0; wordPos < narrowLen; ++wordPos)
        {
            ulong windows = (ulong)buf[wordPos] >> bitPos;
            int bankOff = narrowLen;

            for (int bank = 0; bank < banks; ++bank)
            {
                int index = (int)windows & windowMask;
                if (index != 0)
                {
                    Add(buf, wordPos + offsets[index], buf, bankOff, bankLen);
                }

                bankOff += bankLen;
                windows >>= width;
            }
        }

        bitPos += stride;
        if (bitPos >= top)
        {
            if (bitPos >= 64)
            {
                break;
            }

            // Window setups with top == 63: the final bit (if any) is processed as the top bit
            // of a window.
            bitPos = 64 - width;
            windowMask &= windowMask << (top - bitPos);
        }

        // Every word has been checked at this position; shift all banks up by one place.
        ShiftUp(buf, narrowLen, allBanksLen, banks);
    }

    // Collapse the accumulators, highest window value first, down into the result area.
    for (int idx = offsets.Length - 1; idx > 1; --idx)
    {
        if ((idx & 1) != 0)
        {
            // Odd value: 'distribute' its contents to the result and to the next-lowest value.
            Distribute(buf, offsets[idx], offsets[idx - 1], offsets[1], productLen);
        }
        else
        {
            // Even value: shift its contents by 'positions' and add them to the half-value slot.
            AddShiftedUp(buf, offsets[idx >> 1], buf, offsets[idx], productLen, positions);
        }
    }

    // The raw answer now sits at offsets[1]; reduce it against the reduction coefficients.
    return ReduceResult(buf, offsets[1], productLen, m, ks);
}