btcec: Optimize and correct normalize.

This modifies the normalize function of the internal field value to
both optimize it and address an issue where the reduction could
lead to an incorrect result with a small range of values.  It also adds
tests to ensure the behavior is correct.

The following benchmark shows the relative speedups as a result of the
optimization on my system.  In particular, the changes result in
approximately a 14% speedup in Normalize, which ultimately translates to
a 2% speedup in signature verification.

benchmark                        old ns/op     new ns/op     delta
--------------------------------------------------------------------
BenchmarkAddJacobian             1364          1289          -5.50%
BenchmarkAddJacobianNotZOne      3150          3091          -1.87%
BenchmarkScalarBaseMult          134117        132816        -0.97%
BenchmarkScalarBaseMultLarge     135067        132966        -1.56%
BenchmarkScalarMult              411218        402217        -2.19%
BenchmarkSigVerify               671585        657833        -2.05%
BenchmarkFieldNormalize          36.0          31.0          -13.89%
This commit is contained in:
Dave Collins 2017-06-07 04:31:11 -05:00
parent 711e7dbb2e
commit 1238b7e55a
No known key found for this signature in database
GPG Key ID: B8904D9D9C93D1F2
2 changed files with 143 additions and 124 deletions

View File

@ -100,10 +100,6 @@ const (
// fieldPrimeWordOne is word one of the secp256k1 prime in the // fieldPrimeWordOne is word one of the secp256k1 prime in the
// internal field representation. It is used during negation. // internal field representation. It is used during negation.
fieldPrimeWordOne = 0x3ffffbf fieldPrimeWordOne = 0x3ffffbf
// primeLowBits is the lower 2*fieldBase bits of the secp256k1 prime in
// its standard normalized form. It is used during modular reduction.
primeLowBits = 0xffffefffffc2f
) )
// fieldVal implements optimized fixed-precision arithmetic over the // fieldVal implements optimized fixed-precision arithmetic over the
@ -250,39 +246,15 @@ func (f *fieldVal) SetHex(hexString string) *fieldVal {
// performs fast modular reduction over the secp256k1 prime by making use of the // performs fast modular reduction over the secp256k1 prime by making use of the
// special form of the prime. // special form of the prime.
func (f *fieldVal) Normalize() *fieldVal { func (f *fieldVal) Normalize() *fieldVal {
// The field representation leaves 6 bits of overflow in each // The field representation leaves 6 bits of overflow in each word so
// word so intermediate calculations can be performed without needing // intermediate calculations can be performed without needing to
// to propagate the carry to each higher word during the calculations. // propagate the carry to each higher word during the calculations. In
// In order to normalize, first we need to "compact" the full 256-bit // order to normalize, we need to "compact" the full 256-bit value to
// value to the right and treat the additional 64 leftmost bits as // the right while propagating any carries through to the high order
// the magnitude. // word.
m := f.n[0] //
t0 := m & fieldBaseMask // Since this field is doing arithmetic modulo the secp256k1 prime, we
m = (m >> fieldBase) + f.n[1] // also need to perform modular reduction over the prime.
t1 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[2]
t2 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[3]
t3 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[4]
t4 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[5]
t5 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[6]
t6 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[7]
t7 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[8]
t8 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[9]
t9 := m & fieldMSBMask
m = m >> fieldMSBBits
// At this point, if the magnitude is greater than 0, the overall value
// is greater than the max possible 256-bit value. In particular, it is
// "how many times larger" than the max value it is. Since this field
// is doing arithmetic modulo the secp256k1 prime, we need to perform
// modular reduction over the prime.
// //
// Per [HAC] section 14.3.4: Reduction method of moduli of special form, // Per [HAC] section 14.3.4: Reduction method of moduli of special form,
// when the modulus is of the special form m = b^t - c, highly efficient // when the modulus is of the special form m = b^t - c, highly efficient
@ -298,98 +270,87 @@ func (f *fieldVal) Normalize() *fieldVal {
// //
// The algorithm presented in the referenced section typically repeats // The algorithm presented in the referenced section typically repeats
// until the quotient is zero. However, due to our field representation // until the quotient is zero. However, due to our field representation
// we already know at least how many times we would need to repeat as // we already know to within one reduction how many times we would need
// it's the value currently in m. Thus we can simply multiply the // to repeat as it's the uppermost bits of the high order word. Thus we
// magnitude by the field representation of the prime and do a single // can simply multiply the magnitude by the field representation of the
// iteration. Notice that nothing will be changed when the magnitude is // prime and do a single iteration. After this step there might be an
// zero, so we could skip this in that case, however always running // additional carry to bit 256 (bit 22 of the high order word).
// regardless allows it to run in constant time. t9 := f.n[9]
r := t0 + m*977 m := t9 >> fieldMSBBits
t0 = r & fieldBaseMask t9 = t9 & fieldMSBMask
r = (r >> fieldBase) + t1 + m*64 t0 := f.n[0] + m*977
t1 = r & fieldBaseMask t1 := (t0 >> fieldBase) + f.n[1] + (m << 6)
r = (r >> fieldBase) + t2 t0 = t0 & fieldBaseMask
t2 = r & fieldBaseMask t2 := (t1 >> fieldBase) + f.n[2]
r = (r >> fieldBase) + t3 t1 = t1 & fieldBaseMask
t3 = r & fieldBaseMask t3 := (t2 >> fieldBase) + f.n[3]
r = (r >> fieldBase) + t4 t2 = t2 & fieldBaseMask
t4 = r & fieldBaseMask t4 := (t3 >> fieldBase) + f.n[4]
r = (r >> fieldBase) + t5 t3 = t3 & fieldBaseMask
t5 = r & fieldBaseMask t5 := (t4 >> fieldBase) + f.n[5]
r = (r >> fieldBase) + t6 t4 = t4 & fieldBaseMask
t6 = r & fieldBaseMask t6 := (t5 >> fieldBase) + f.n[6]
r = (r >> fieldBase) + t7 t5 = t5 & fieldBaseMask
t7 = r & fieldBaseMask t7 := (t6 >> fieldBase) + f.n[7]
r = (r >> fieldBase) + t8 t6 = t6 & fieldBaseMask
t8 = r & fieldBaseMask t8 := (t7 >> fieldBase) + f.n[8]
r = (r >> fieldBase) + t9 t7 = t7 & fieldBaseMask
t9 = r & fieldMSBMask t9 = (t8 >> fieldBase) + t9
t8 = t8 & fieldBaseMask
// At this point, the result will be in the range 0 <= result <= // At this point, the magnitude is guaranteed to be one, however, the
// prime + (2^64 - c). Therefore, one more subtraction of the prime // value could still be greater than the prime if there was either a
// might be needed if the current result is greater than or equal to the // carry through to bit 256 (bit 22 of the higher order word) or the
// prime. The following does the final reduction in constant time. // value is greater than or equal to the field characteristic. The
// Note that the if/else here intentionally does the bitwise OR with // following determines if either of these conditions are true and does
// zero even though it won't change the value to ensure constant time // the final reduction in constant time.
// between the branches. //
var mask int32 // Note that the if/else statements here intentionally do the bitwise
lowBits := uint64(t1)<<fieldBase | uint64(t0) // operators even when it won't change the value to ensure constant time
if lowBits < primeLowBits { // between the branches. Also note that 'm' will be zero when neither
mask |= -1 // of the aforementioned conditions are true and the value will not be
// changed when 'm' is zero.
m = 1
if t9 == fieldMSBMask {
m &= 1
} else { } else {
mask |= 0 m &= 0
} }
if t2 < fieldBaseMask { if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
mask |= -1 m &= 1
} else { } else {
mask |= 0 m &= 0
} }
if t3 < fieldBaseMask { if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask {
mask |= -1 m &= 1
} else { } else {
mask |= 0 m &= 0
} }
if t4 < fieldBaseMask { if t9>>fieldMSBBits != 0 {
mask |= -1 m |= 1
} else { } else {
mask |= 0 m |= 0
} }
if t5 < fieldBaseMask { t0 = t0 + m*977
mask |= -1 t1 = (t0 >> fieldBase) + t1 + (m << 6)
} else { t0 = t0 & fieldBaseMask
mask |= 0 t2 = (t1 >> fieldBase) + t2
} t1 = t1 & fieldBaseMask
if t6 < fieldBaseMask { t3 = (t2 >> fieldBase) + t3
mask |= -1 t2 = t2 & fieldBaseMask
} else { t4 = (t3 >> fieldBase) + t4
mask |= 0 t3 = t3 & fieldBaseMask
} t5 = (t4 >> fieldBase) + t5
if t7 < fieldBaseMask { t4 = t4 & fieldBaseMask
mask |= -1 t6 = (t5 >> fieldBase) + t6
} else { t5 = t5 & fieldBaseMask
mask |= 0 t7 = (t6 >> fieldBase) + t7
} t6 = t6 & fieldBaseMask
if t8 < fieldBaseMask { t8 = (t7 >> fieldBase) + t8
mask |= -1 t7 = t7 & fieldBaseMask
} else { t9 = (t8 >> fieldBase) + t9
mask |= 0 t8 = t8 & fieldBaseMask
} t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256.
if t9 < fieldMSBMask {
mask |= -1
} else {
mask |= 0
}
lowBits -= ^uint64(mask) & primeLowBits
t0 = uint32(lowBits & fieldBaseMask)
t1 = uint32((lowBits >> fieldBase) & fieldBaseMask)
t2 = t2 & uint32(mask)
t3 = t3 & uint32(mask)
t4 = t4 & uint32(mask)
t5 = t5 & uint32(mask)
t6 = t6 & uint32(mask)
t7 = t7 & uint32(mask)
t8 = t8 & uint32(mask)
t9 = t9 & uint32(mask)
// Finally, set the normalized and reduced words. // Finally, set the normalized and reduced words.
f.n[0] = t0 f.n[0] = t0

View File

@ -247,17 +247,75 @@ func TestNormalize(t *testing.T) {
[10]uint32{0xffffffff, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0x3fffc0}, [10]uint32{0xffffffff, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0x3fffc0},
[10]uint32{0x000003d0, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000}, [10]uint32{0x000003d0, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000},
}, },
// Prime with field representation such that the initial
// reduction does not result in a carry to bit 256.
//
// 2^256 - 4294968273 (secp256k1 prime)
{
[10]uint32{0x03fffc2f, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to its first
// word and does not result in a carry to bit 256.
//
// 2^256 - 4294968272 (secp256k1 prime + 1)
{
[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to its second
// word and does not result in a carry to bit 256.
//
// 2^256 - 4227859409 (secp256k1 prime + 0x4000000)
{
[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to a carry to
// bit 256, but would not be without the carry. These values
// come from the fact that P is 2^256 - 4294968273 and 977 is
// the low order word in the internal field representation.
//
// 2^256 * 5 - ((4294968273 - 977) * 4 + 1)
{
[10]uint32{0x03ffffff, 0x03fffeff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x0013fffff},
[10]uint32{0x00001314, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to both a
// carry to bit 256 and the first word.
{
[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to both a
// carry to bit 256 and the second word.
//
{
[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x3ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000000, 0x00000000, 0x00000001},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to a carry to
// bit 256 and the first and second words.
//
{
[10]uint32{0x03fffc30, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
},
} }
t.Logf("Running %d tests", len(tests)) t.Logf("Running %d tests", len(tests))
for i, test := range tests { for i, test := range tests {
f := new(fieldVal) f := new(fieldVal)
for rawIntIdx := 0; rawIntIdx < len(test.raw); rawIntIdx++ { f.n = test.raw
f.n[rawIntIdx] = test.raw[rawIntIdx]
}
f.Normalize() f.Normalize()
if !reflect.DeepEqual(f.n, test.normalized) { if !reflect.DeepEqual(f.n, test.normalized) {
t.Errorf("fieldVal.Set #%d wrong normalized result\n"+ t.Errorf("fieldVal.Normalize #%d wrong result\n"+
"got: %x\nwant: %x", i, f.n, test.normalized) "got: %x\nwant: %x", i, f.n, test.normalized)
continue continue
} }