btcec: Optimize and correct normalize.

This modifies the normalize function of the internal field value to both optimize it and address an issue where the reduction could lead to an incorrect result for a small range of values. It also adds tests to ensure the behavior is correct.

The following benchmark shows the relative speedups as a result of the optimization on my system. In particular, the changes result in approximately a 14% speedup in Normalize, which ultimately translates to a 2% speedup in signature verification.

benchmark                        old ns/op     new ns/op     delta
--------------------------------------------------------------------
BenchmarkAddJacobian             1364          1289          -5.50%
BenchmarkAddJacobianNotZOne      3150          3091          -1.87%
BenchmarkScalarBaseMult          134117        132816        -0.97%
BenchmarkScalarBaseMultLarge     135067        132966        -1.56%
BenchmarkScalarMult              411218        402217        -2.19%
BenchmarkSigVerify               671585        657833        -2.05%
BenchmarkFieldNormalize          36.0          31.0          -13.89%
2025-01-19 05:33:36 +01:00 · 2017-06-07 04:31:11 -05:00 · 2017-06-07 04:31:11 -05:00 · 1238b7e55a
commit 1238b7e55a
parent 711e7dbb2e
2 changed files with 143 additions and 124 deletions
--- a/btcec/field.go
+++ b/btcec/field.go
@ -100,10 +100,6 @@ const (
 	// fieldPrimeWordOne is word one of the secp256k1 prime in the
 	// internal field representation.  It is used during negation.
 	fieldPrimeWordOne = 0x3ffffbf
 	// primeLowBits is the lower 2*fieldBase bits of the secp256k1 prime in
 	// its standard normalized form.  It is used during modular reduction.
 	primeLowBits = 0xffffefffffc2f
 )
 // fieldVal implements optimized fixed-precision arithmetic over the
@ -250,39 +246,15 @@ func (f *fieldVal) SetHex(hexString string) *fieldVal {
 // performs fast modular reduction over the secp256k1 prime by making use of the
 // special form of the prime.
 func (f *fieldVal) Normalize() *fieldVal {
-	// The field representation leaves 6 bits of overflow in each
+	// The field representation leaves 6 bits of overflow in each word so
-	// word so intermediate calculations can be performed without needing
+	// intermediate calculations can be performed without needing to
-	// to propagate the carry to each higher word during the calculations.
+	// propagate the carry to each higher word during the calculations.  In
-	// In order to normalize, first we need to "compact" the full 256-bit
+	// order to normalize, we need to "compact" the full 256-bit value to
-	// value to the right and treat the additional 64 leftmost bits as
+	// the right while propagating any carries through to the high order
-	// the magnitude.
+	// word.
-	m := f.n[0]
+	//
-	t0 := m & fieldBaseMask
+	// Since this field is doing arithmetic modulo the secp256k1 prime, we
-	m = (m >> fieldBase) + f.n[1]
+	// also need to perform modular reduction over the prime.
 	t1 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[2]
 	t2 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[3]
 	t3 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[4]
 	t4 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[5]
 	t5 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[6]
 	t6 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[7]
 	t7 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[8]
 	t8 := m & fieldBaseMask
 	m = (m >> fieldBase) + f.n[9]
 	t9 := m & fieldMSBMask
 	m = m >> fieldMSBBits
 	// At this point, if the magnitude is greater than 0, the overall value
 	// is greater than the max possible 256-bit value.  In particular, it is
 	// "how many times larger" than the max value it is.  Since this field
 	// is doing arithmetic modulo the secp256k1 prime, we need to perform
 	// modular reduction over the prime.
 	//
 	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
 	// when the modulus is of the special form m = b^t - c, highly efficient
@ -298,98 +270,87 @@ func (f *fieldVal) Normalize() *fieldVal {
 	//
 	// The algorithm presented in the referenced section typically repeats
 	// until the quotient is zero.  However, due to our field representation
-	// we already know at least how many times we would need to repeat as
+	// we already know to within one reduction how many times we would need
-	// it's the value currently in m.  Thus we can simply multiply the
+	// to repeat as it's the uppermost bits of the high order word.  Thus we
-	// magnitude by the field representation of the prime and do a single
+	// can simply multiply the magnitude by the field representation of the
-	// iteration.  Notice that nothing will be changed when the magnitude is
+	// prime and do a single iteration.  After this step there might be an
-	// zero, so we could skip this in that case, however always running
+	// additional carry to bit 256 (bit 22 of the high order word).
-	// regardless allows it to run in constant time.
+	t9 := f.n[9]
-	r := t0 + m*977
+	m := t9 >> fieldMSBBits
-	t0 = r & fieldBaseMask
+	t9 = t9 & fieldMSBMask
-	r = (r >> fieldBase) + t1 + m*64
+	t0 := f.n[0] + m*977
-	t1 = r & fieldBaseMask
+	t1 := (t0 >> fieldBase) + f.n[1] + (m << 6)
-	r = (r >> fieldBase) + t2
+	t0 = t0 & fieldBaseMask
-	t2 = r & fieldBaseMask
+	t2 := (t1 >> fieldBase) + f.n[2]
-	r = (r >> fieldBase) + t3
+	t1 = t1 & fieldBaseMask
-	t3 = r & fieldBaseMask
+	t3 := (t2 >> fieldBase) + f.n[3]
-	r = (r >> fieldBase) + t4
+	t2 = t2 & fieldBaseMask
-	t4 = r & fieldBaseMask
+	t4 := (t3 >> fieldBase) + f.n[4]
-	r = (r >> fieldBase) + t5
+	t3 = t3 & fieldBaseMask
-	t5 = r & fieldBaseMask
+	t5 := (t4 >> fieldBase) + f.n[5]
-	r = (r >> fieldBase) + t6
+	t4 = t4 & fieldBaseMask
-	t6 = r & fieldBaseMask
+	t6 := (t5 >> fieldBase) + f.n[6]
-	r = (r >> fieldBase) + t7
+	t5 = t5 & fieldBaseMask
-	t7 = r & fieldBaseMask
+	t7 := (t6 >> fieldBase) + f.n[7]
-	r = (r >> fieldBase) + t8
+	t6 = t6 & fieldBaseMask
-	t8 = r & fieldBaseMask
+	t8 := (t7 >> fieldBase) + f.n[8]
-	r = (r >> fieldBase) + t9
+	t7 = t7 & fieldBaseMask
-	t9 = r & fieldMSBMask
+	t9 = (t8 >> fieldBase) + t9
 	t8 = t8 & fieldBaseMask
-	// At this point, the result will be in the range 0 <= result <=
+	// At this point, the magnitude is guaranteed to be one, however, the
-	// prime + (2^64 - c).  Therefore, one more subtraction of the prime
+	// value could still be greater than the prime if there was either a
-	// might be needed if the current result is greater than or equal to the
+	// carry through to bit 256 (bit 22 of the higher order word) or the
-	// prime.  The following does the final reduction in constant time.
+	// value is greater than or equal to the field characteristic.  The
-	// Note that the if/else here intentionally does the bitwise OR with
+	// following determines if either of these conditions is true and does
-	// zero even though it won't change the value to ensure constant time
+	// the final reduction in constant time.
-	// between the branches.
+	//
-	var mask int32
+	// Note that the if/else statements here intentionally do the bitwise
-	lowBits := uint64(t1)<<fieldBase | uint64(t0)
+	// operators even when it won't change the value to ensure constant time
-	if lowBits < primeLowBits {
+	// between the branches.  Also note that 'm' will be zero when neither
-		mask |= -1
+	// of the aforementioned conditions are true and the value will not be
 	// changed when 'm' is zero.
 	m = 1
 	if t9 == fieldMSBMask {
 		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t2 < fieldBaseMask {
+	if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
-		mask |= -1
+		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t3 < fieldBaseMask {
+	if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask {
-		mask |= -1
+		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t4 < fieldBaseMask {
+	if t9>>fieldMSBBits != 0 {
-		mask |= -1
+		m |= 1
 	} else {
-		mask |= 0
+		m |= 0
 	}
-	if t5 < fieldBaseMask {
+	t0 = t0 + m*977
-		mask |= -1
+	t1 = (t0 >> fieldBase) + t1 + (m << 6)
-	} else {
+	t0 = t0 & fieldBaseMask
-		mask |= 0
+	t2 = (t1 >> fieldBase) + t2
-	}
+	t1 = t1 & fieldBaseMask
-	if t6 < fieldBaseMask {
+	t3 = (t2 >> fieldBase) + t3
-		mask |= -1
+	t2 = t2 & fieldBaseMask
-	} else {
+	t4 = (t3 >> fieldBase) + t4
-		mask |= 0
+	t3 = t3 & fieldBaseMask
-	}
+	t5 = (t4 >> fieldBase) + t5
-	if t7 < fieldBaseMask {
+	t4 = t4 & fieldBaseMask
-		mask |= -1
+	t6 = (t5 >> fieldBase) + t6
-	} else {
+	t5 = t5 & fieldBaseMask
-		mask |= 0
+	t7 = (t6 >> fieldBase) + t7
-	}
+	t6 = t6 & fieldBaseMask
-	if t8 < fieldBaseMask {
+	t8 = (t7 >> fieldBase) + t8
-		mask |= -1
+	t7 = t7 & fieldBaseMask
-	} else {
+	t9 = (t8 >> fieldBase) + t9
-		mask |= 0
+	t8 = t8 & fieldBaseMask
-	}
+	t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256.
 	if t9 < fieldMSBMask {
 		mask |= -1
 	} else {
 		mask |= 0
 	}
 	lowBits -= ^uint64(mask) & primeLowBits
 	t0 = uint32(lowBits & fieldBaseMask)
 	t1 = uint32((lowBits >> fieldBase) & fieldBaseMask)
 	t2 = t2 & uint32(mask)
 	t3 = t3 & uint32(mask)
 	t4 = t4 & uint32(mask)
 	t5 = t5 & uint32(mask)
 	t6 = t6 & uint32(mask)
 	t7 = t7 & uint32(mask)
 	t8 = t8 & uint32(mask)
 	t9 = t9 & uint32(mask)
 	// Finally, set the normalized and reduced words.
 	f.n[0] = t0
--- a/btcec/field_test.go
+++ b/btcec/field_test.go
@ -247,17 +247,75 @@ func TestNormalize(t *testing.T) {
 			[10]uint32{0xffffffff, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0x3fffc0},
 			[10]uint32{0x000003d0, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000},
 		},
 		// Prime with field representation such that the initial
 		// reduction does not result in a carry to bit 256.
 		//
 		// 2^256 - 4294968273 (secp256k1 prime)
 		{
 			[10]uint32{0x03fffc2f, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
 			[10]uint32{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to its first
 		// word and does not result in a carry to bit 256.
 		//
 		// 2^256 - 4294968272 (secp256k1 prime + 1)
 		{
 			[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
 			[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to its second
 		// word and does not result in a carry to bit 256.
 		//
 		// 2^256 - 4227859409 (secp256k1 prime + 0x4000000)
 		{
 			[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
 			[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to a carry to
 		// bit 256, but would not be without the carry.  These values
 		// come from the fact that P is 2^256 - 4294968273 and 977 is
 		// the low order word in the internal field representation.
 		//
 		// 2^256 * 5 - ((4294968273 - (977+1)) * 4)
 		{
 			[10]uint32{0x03ffffff, 0x03fffeff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x0013fffff},
 			[10]uint32{0x00001314, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000000},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to both a
 		// carry to bit 256 and the first word.
 		{
 			[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
 			[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to both a
 		// carry to bit 256 and the second word.
 		//
 		{
 			[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x3ffffff, 0x07ffffff, 0x003fffff},
 			[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000000, 0x00000000, 0x00000001},
 		},
 		// Prime larger than P that reduces to a value which is still
 		// larger than P when it has a magnitude of 1 due to a carry to
 		// bit 256 and the first and second words.
 		//
 		{
 			[10]uint32{0x03fffc30, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
 			[10]uint32{0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
 		},
 	}
 	t.Logf("Running %d tests", len(tests))
 	for i, test := range tests {
 		f := new(fieldVal)
-		for rawIntIdx := 0; rawIntIdx < len(test.raw); rawIntIdx++ {
+		f.n = test.raw
 			f.n[rawIntIdx] = test.raw[rawIntIdx]
 		}
 		f.Normalize()
 		if !reflect.DeepEqual(f.n, test.normalized) {
-			t.Errorf("fieldVal.Set #%d wrong normalized result\n"+
+			t.Errorf("fieldVal.Normalize #%d wrong result\n"+
 				"got: %x\nwant: %x", i, f.n, test.normalized)
 			continue
 		}