btcec: Optimize and correct normalize.

This modifies the normalize function of the internal field value to
both optimize it and address an issue where the reduction could
lead to an incorrect result for a small range of values.  It also adds
tests to ensure the behavior is correct.

The following benchmark shows the relative speedups as a result of the
optimization on my system.  In particular, the changes result in
approximately a 14% speedup in Normalize, which ultimately translates to
a 2% speedup in signature verifies.

benchmark                        old ns/op     new ns/op     delta
--------------------------------------------------------------------
BenchmarkAddJacobian             1364          1289          -5.50%
BenchmarkAddJacobianNotZOne      3150          3091          -1.87%
BenchmarkScalarBaseMult          134117        132816        -0.97%
BenchmarkScalarBaseMultLarge     135067        132966        -1.56%
BenchmarkScalarMult              411218        402217        -2.19%
BenchmarkSigVerify               671585        657833        -2.05%
BenchmarkFieldNormalize          36.0          31.0          -13.89%
This commit is contained in:
Dave Collins 2017-06-07 04:31:11 -05:00
parent 711e7dbb2e
commit 1238b7e55a
No known key found for this signature in database
GPG Key ID: B8904D9D9C93D1F2
2 changed files with 143 additions and 124 deletions

View File

@ -100,10 +100,6 @@ const (
// fieldPrimeWordOne is word one of the secp256k1 prime in the
// internal field representation. It is used during negation.
fieldPrimeWordOne = 0x3ffffbf
// primeLowBits is the lower 2*fieldBase bits of the secp256k1 prime in
// its standard normalized form. It is used during modular reduction.
primeLowBits = 0xffffefffffc2f
)
// fieldVal implements optimized fixed-precision arithmetic over the
@ -250,39 +246,15 @@ func (f *fieldVal) SetHex(hexString string) *fieldVal {
// performs fast modular reduction over the secp256k1 prime by making use of the
// special form of the prime.
func (f *fieldVal) Normalize() *fieldVal {
// The field representation leaves 6 bits of overflow in each
// word so intermediate calculations can be performed without needing
// to propagate the carry to each higher word during the calculations.
// In order to normalize, first we need to "compact" the full 256-bit
// value to the right and treat the additional 64 leftmost bits as
// the magnitude.
m := f.n[0]
t0 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[1]
t1 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[2]
t2 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[3]
t3 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[4]
t4 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[5]
t5 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[6]
t6 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[7]
t7 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[8]
t8 := m & fieldBaseMask
m = (m >> fieldBase) + f.n[9]
t9 := m & fieldMSBMask
m = m >> fieldMSBBits
// At this point, if the magnitude is greater than 0, the overall value
// is greater than the max possible 256-bit value. In particular, it is
// "how many times larger" than the max value it is. Since this field
// is doing arithmetic modulo the secp256k1 prime, we need to perform
// modular reduction over the prime.
// The field representation leaves 6 bits of overflow in each word so
// intermediate calculations can be performed without needing to
// propagate the carry to each higher word during the calculations. In
// order to normalize, we need to "compact" the full 256-bit value to
// the right while propagating any carries through to the high order
// word.
//
// Since this field is doing arithmetic modulo the secp256k1 prime, we
// also need to perform modular reduction over the prime.
//
// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
// when the modulus is of the special form m = b^t - c, highly efficient
@ -298,98 +270,87 @@ func (f *fieldVal) Normalize() *fieldVal {
//
// The algorithm presented in the referenced section typically repeats
// until the quotient is zero. However, due to our field representation
// we already know at least how many times we would need to repeat as
// it's the value currently in m. Thus we can simply multiply the
// magnitude by the field representation of the prime and do a single
// iteration. Notice that nothing will be changed when the magnitude is
// zero, so we could skip this in that case, however always running
// regardless allows it to run in constant time.
r := t0 + m*977
t0 = r & fieldBaseMask
r = (r >> fieldBase) + t1 + m*64
t1 = r & fieldBaseMask
r = (r >> fieldBase) + t2
t2 = r & fieldBaseMask
r = (r >> fieldBase) + t3
t3 = r & fieldBaseMask
r = (r >> fieldBase) + t4
t4 = r & fieldBaseMask
r = (r >> fieldBase) + t5
t5 = r & fieldBaseMask
r = (r >> fieldBase) + t6
t6 = r & fieldBaseMask
r = (r >> fieldBase) + t7
t7 = r & fieldBaseMask
r = (r >> fieldBase) + t8
t8 = r & fieldBaseMask
r = (r >> fieldBase) + t9
t9 = r & fieldMSBMask
// we already know to within one reduction how many times we would need
// to repeat as it's the uppermost bits of the high order word. Thus we
// can simply multiply the magnitude by the field representation of the
// prime and do a single iteration. After this step there might be an
// additional carry to bit 256 (bit 22 of the high order word).
t9 := f.n[9]
m := t9 >> fieldMSBBits
t9 = t9 & fieldMSBMask
t0 := f.n[0] + m*977
t1 := (t0 >> fieldBase) + f.n[1] + (m << 6)
t0 = t0 & fieldBaseMask
t2 := (t1 >> fieldBase) + f.n[2]
t1 = t1 & fieldBaseMask
t3 := (t2 >> fieldBase) + f.n[3]
t2 = t2 & fieldBaseMask
t4 := (t3 >> fieldBase) + f.n[4]
t3 = t3 & fieldBaseMask
t5 := (t4 >> fieldBase) + f.n[5]
t4 = t4 & fieldBaseMask
t6 := (t5 >> fieldBase) + f.n[6]
t5 = t5 & fieldBaseMask
t7 := (t6 >> fieldBase) + f.n[7]
t6 = t6 & fieldBaseMask
t8 := (t7 >> fieldBase) + f.n[8]
t7 = t7 & fieldBaseMask
t9 = (t8 >> fieldBase) + t9
t8 = t8 & fieldBaseMask
// At this point, the result will be in the range 0 <= result <=
// prime + (2^64 - c). Therefore, one more subtraction of the prime
// might be needed if the current result is greater than or equal to the
// prime. The following does the final reduction in constant time.
// Note that the if/else here intentionally does the bitwise OR with
// zero even though it won't change the value to ensure constant time
// between the branches.
var mask int32
lowBits := uint64(t1)<<fieldBase | uint64(t0)
if lowBits < primeLowBits {
mask |= -1
// At this point, the magnitude is guaranteed to be one, however, the
// value could still be greater than the prime if there was either a
// carry through to bit 256 (bit 22 of the higher order word) or the
// value is greater than or equal to the field characteristic. The
// following determines if either of these conditions are true and does
// the final reduction in constant time.
//
// Note that the if/else statements here intentionally perform the
// bitwise operations even when they won't change the value to ensure
// constant time between the branches. Also note that 'm' will be zero
// when neither of the aforementioned conditions is true and the value
// will not be changed when 'm' is zero.
m = 1
if t9 == fieldMSBMask {
m &= 1
} else {
mask |= 0
m &= 0
}
if t2 < fieldBaseMask {
mask |= -1
if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
m &= 1
} else {
mask |= 0
m &= 0
}
if t3 < fieldBaseMask {
mask |= -1
if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask {
m &= 1
} else {
mask |= 0
m &= 0
}
if t4 < fieldBaseMask {
mask |= -1
if t9>>fieldMSBBits != 0 {
m |= 1
} else {
mask |= 0
m |= 0
}
if t5 < fieldBaseMask {
mask |= -1
} else {
mask |= 0
}
if t6 < fieldBaseMask {
mask |= -1
} else {
mask |= 0
}
if t7 < fieldBaseMask {
mask |= -1
} else {
mask |= 0
}
if t8 < fieldBaseMask {
mask |= -1
} else {
mask |= 0
}
if t9 < fieldMSBMask {
mask |= -1
} else {
mask |= 0
}
lowBits -= ^uint64(mask) & primeLowBits
t0 = uint32(lowBits & fieldBaseMask)
t1 = uint32((lowBits >> fieldBase) & fieldBaseMask)
t2 = t2 & uint32(mask)
t3 = t3 & uint32(mask)
t4 = t4 & uint32(mask)
t5 = t5 & uint32(mask)
t6 = t6 & uint32(mask)
t7 = t7 & uint32(mask)
t8 = t8 & uint32(mask)
t9 = t9 & uint32(mask)
t0 = t0 + m*977
t1 = (t0 >> fieldBase) + t1 + (m << 6)
t0 = t0 & fieldBaseMask
t2 = (t1 >> fieldBase) + t2
t1 = t1 & fieldBaseMask
t3 = (t2 >> fieldBase) + t3
t2 = t2 & fieldBaseMask
t4 = (t3 >> fieldBase) + t4
t3 = t3 & fieldBaseMask
t5 = (t4 >> fieldBase) + t5
t4 = t4 & fieldBaseMask
t6 = (t5 >> fieldBase) + t6
t5 = t5 & fieldBaseMask
t7 = (t6 >> fieldBase) + t7
t6 = t6 & fieldBaseMask
t8 = (t7 >> fieldBase) + t8
t7 = t7 & fieldBaseMask
t9 = (t8 >> fieldBase) + t9
t8 = t8 & fieldBaseMask
t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256.
// Finally, set the normalized and reduced words.
f.n[0] = t0

View File

@ -247,17 +247,75 @@ func TestNormalize(t *testing.T) {
[10]uint32{0xffffffff, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0x3fffc0},
[10]uint32{0x000003d0, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000},
},
// Prime with field representation such that the initial
// reduction does not result in a carry to bit 256.
//
// 2^256 - 4294968273 (secp256k1 prime)
{
[10]uint32{0x03fffc2f, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to its first
// word and does not result in a carry to bit 256.
//
// 2^256 - 4294968272 (secp256k1 prime + 1)
{
[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to its second
// word and does not result in a carry to bit 256.
//
// 2^256 - 4227859409 (secp256k1 prime + 0x4000000)
{
[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to a carry to
// bit 256, but would not be without the carry. These values
// come from the fact that P is 2^256 - 4294968273 and 977 is
// the low order word in the internal field representation.
//
// 2^256 * 5 - ((4294968273 - 977) * 4 + 1)
{
[10]uint32{0x03ffffff, 0x03fffeff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x0013fffff},
[10]uint32{0x00001314, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000000},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to both a
// carry to bit 256 and the first word.
{
[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to both a
// carry to bit 256 and the second word.
//
{
[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x3ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000000, 0x00000000, 0x00000001},
},
// Value larger than P that reduces to a value which is still
// larger than P when it has a magnitude of 1 due to a carry to
// bit 256 and the first and second words.
//
{
[10]uint32{0x03fffc30, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
[10]uint32{0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
},
}
t.Logf("Running %d tests", len(tests))
for i, test := range tests {
f := new(fieldVal)
for rawIntIdx := 0; rawIntIdx < len(test.raw); rawIntIdx++ {
f.n[rawIntIdx] = test.raw[rawIntIdx]
}
f.n = test.raw
f.Normalize()
if !reflect.DeepEqual(f.n, test.normalized) {
t.Errorf("fieldVal.Set #%d wrong normalized result\n"+
t.Errorf("fieldVal.Normalize #%d wrong result\n"+
"got: %x\nwant: %x", i, f.n, test.normalized)
continue
}