btcec: Optimize and correct normalize.

This modifies the normalize function of the internal field value to both
optimize it and address an issue where the reduction could lead to an
incorrect result with a small range of values.  It also adds tests to
ensure the behavior is correct.

The following benchmark shows the relative speedups as a result of the
optimization on my system.  In particular, the changes result in
approximately a 14% speedup in Normalize, which ultimately translates to
a 2% speedup in signature verifies.

benchmark                        old ns/op    new ns/op    delta
--------------------------------------------------------------------
BenchmarkAddJacobian             1364         1289         -5.50%
BenchmarkAddJacobianNotZOne      3150         3091         -1.87%
BenchmarkScalarBaseMult          134117       132816       -0.97%
BenchmarkScalarBaseMultLarge     135067       132966       -1.56%
BenchmarkScalarMult              411218       402217       -2.19%
BenchmarkSigVerify               671585       657833       -2.05%
BenchmarkFieldNormalize          36.0         31.0         -13.89%
2025-01-19 05:33:36 +01:00 · 2017-06-07 04:31:11 -05:00 · 2017-06-07 04:31:11 -05:00 · 1238b7e55a
commit 1238b7e55a
parent 711e7dbb2e
2 changed files with 143 additions and 124 deletions
--- a/btcec/field.go
+++ b/btcec/field.go
@@ -100,10 +100,6 @@ const (
 	// fieldPrimeWordOne is word one of the secp256k1 prime in the
 	// internal field representation.  It is used during negation.
 	fieldPrimeWordOne = 0x3ffffbf
-
-	// primeLowBits is the lower 2*fieldBase bits of the secp256k1 prime in
-	// its standard normalized form.  It is used during modular reduction.
-	primeLowBits = 0xffffefffffc2f
 )

 // fieldVal implements optimized fixed-precision arithmetic over the
@@ -250,39 +246,15 @@ func (f *fieldVal) SetHex(hexString string) *fieldVal {
 // performs fast modular reduction over the secp256k1 prime by making use of the
 // special form of the prime.
 func (f *fieldVal) Normalize() *fieldVal {
-	// The field representation leaves 6 bits of overflow in each
-	// word so intermediate calculations can be performed without needing
-	// to propagate the carry to each higher word during the calculations.
-	// In order to normalize, first we need to "compact" the full 256-bit
-	// value to the right and treat the additional 64 leftmost bits as
-	// the magnitude.
-	m := f.n[0]
-	t0 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[1]
-	t1 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[2]
-	t2 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[3]
-	t3 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[4]
-	t4 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[5]
-	t5 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[6]
-	t6 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[7]
-	t7 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[8]
-	t8 := m & fieldBaseMask
-	m = (m >> fieldBase) + f.n[9]
-	t9 := m & fieldMSBMask
-	m = m >> fieldMSBBits
-
-	// At this point, if the magnitude is greater than 0, the overall value
-	// is greater than the max possible 256-bit value.  In particular, it is
-	// "how many times larger" than the max value it is.  Since this field
-	// is doing arithmetic modulo the secp256k1 prime, we need to perform
-	// modular reduction over the prime.
+	// The field representation leaves 6 bits of overflow in each word so
+	// intermediate calculations can be performed without needing to
+	// propagate the carry to each higher word during the calculations.  In
+	// order to normalize, we need to "compact" the full 256-bit value to
+	// the right while propagating any carries through to the high order
+	// word.
+	//
+	// Since this field is doing arithmetic modulo the secp256k1 prime, we
+	// also need to perform modular reduction over the prime.
 	//
 	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
 	// when the modulus is of the special form m = b^t - c, highly efficient
@@ -298,98 +270,87 @@ func (f *fieldVal) Normalize() *fieldVal {
 	//
 	// The algorithm presented in the referenced section typically repeats
 	// until the quotient is zero.  However, due to our field representation
-	// we already know at least how many times we would need to repeat as
-	// it's the value currently in m.  Thus we can simply multiply the
-	// magnitude by the field representation of the prime and do a single
-	// iteration.  Notice that nothing will be changed when the magnitude is
-	// zero, so we could skip this in that case, however always running
-	// regardless allows it to run in constant time.
-	r := t0 + m*977
-	t0 = r & fieldBaseMask
-	r = (r >> fieldBase) + t1 + m*64
-	t1 = r & fieldBaseMask
-	r = (r >> fieldBase) + t2
-	t2 = r & fieldBaseMask
-	r = (r >> fieldBase) + t3
-	t3 = r & fieldBaseMask
-	r = (r >> fieldBase) + t4
-	t4 = r & fieldBaseMask
-	r = (r >> fieldBase) + t5
-	t5 = r & fieldBaseMask
-	r = (r >> fieldBase) + t6
-	t6 = r & fieldBaseMask
-	r = (r >> fieldBase) + t7
-	t7 = r & fieldBaseMask
-	r = (r >> fieldBase) + t8
-	t8 = r & fieldBaseMask
-	r = (r >> fieldBase) + t9
-	t9 = r & fieldMSBMask
+	// we already know to within one reduction how many times we would need
+	// to repeat as it's the uppermost bits of the high order word.  Thus we
+	// can simply multiply the magnitude by the field representation of the
+	// prime and do a single iteration.  After this step there might be an
+	// additional carry to bit 256 (bit 22 of the high order word).
+	t9 := f.n[9]
+	m := t9 >> fieldMSBBits
+	t9 = t9 & fieldMSBMask
+	t0 := f.n[0] + m*977
+	t1 := (t0 >> fieldBase) + f.n[1] + (m << 6)
+	t0 = t0 & fieldBaseMask
+	t2 := (t1 >> fieldBase) + f.n[2]
+	t1 = t1 & fieldBaseMask
+	t3 := (t2 >> fieldBase) + f.n[3]
+	t2 = t2 & fieldBaseMask
+	t4 := (t3 >> fieldBase) + f.n[4]
+	t3 = t3 & fieldBaseMask
+	t5 := (t4 >> fieldBase) + f.n[5]
+	t4 = t4 & fieldBaseMask
+	t6 := (t5 >> fieldBase) + f.n[6]
+	t5 = t5 & fieldBaseMask
+	t7 := (t6 >> fieldBase) + f.n[7]
+	t6 = t6 & fieldBaseMask
+	t8 := (t7 >> fieldBase) + f.n[8]
+	t7 = t7 & fieldBaseMask
+	t9 = (t8 >> fieldBase) + t9
+	t8 = t8 & fieldBaseMask

-	// At this point, the result will be in the range 0 <= result <=
-	// prime + (2^64 - c).  Therefore, one more subtraction of the prime
-	// might be needed if the current result is greater than or equal to the
-	// prime.  The following does the final reduction in constant time.
-	// Note that the if/else here intentionally does the bitwise OR with
-	// zero even though it won't change the value to ensure constant time
-	// between the branches.
-	var mask int32
-	lowBits := uint64(t1)<<fieldBase | uint64(t0)
-	if lowBits < primeLowBits {
-		mask |= -1
+	// At this point, the magnitude is guaranteed to be one, however, the
+	// value could still be greater than the prime if there was either a
+	// carry through to bit 256 (bit 22 of the higher order word) or the
+	// value is greater than or equal to the field characteristic.  The
+	// following determines if either of these conditions are true and does
+	// the final reduction in constant time.
+	//
+	// Note that the if/else statements here intentionally do the bitwise
+	// operators even when it won't change the value to ensure constant time
+	// between the branches.  Also note that 'm' will be zero when neither
+	// of the aforementioned conditions are true and the value will not be
+	// changed when 'm' is zero.
+	m = 1
+	if t9 == fieldMSBMask {
+		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t2 < fieldBaseMask {
-		mask |= -1
+	if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
+		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t3 < fieldBaseMask {
-		mask |= -1
+	if ((t0+977)>>fieldBase + t1 + 64) > fieldBaseMask {
+		m &= 1
 	} else {
-		mask |= 0
+		m &= 0
 	}
-	if t4 < fieldBaseMask {
-		mask |= -1
+	if t9>>fieldMSBBits != 0 {
+		m |= 1
 	} else {
-		mask |= 0
+		m |= 0
 	}
-	if t5 < fieldBaseMask {
-		mask |= -1
-	} else {
-		mask |= 0
-	}
-	if t6 < fieldBaseMask {
-		mask |= -1
-	} else {
-		mask |= 0
-	}
-	if t7 < fieldBaseMask {
-		mask |= -1
-	} else {
-		mask |= 0
-	}
-	if t8 < fieldBaseMask {
-		mask |= -1
-	} else {
-		mask |= 0
-	}
-	if t9 < fieldMSBMask {
-		mask |= -1
-	} else {
-		mask |= 0
-	}
-	lowBits -= ^uint64(mask) & primeLowBits
-	t0 = uint32(lowBits & fieldBaseMask)
-	t1 = uint32((lowBits >> fieldBase) & fieldBaseMask)
-	t2 = t2 & uint32(mask)
-	t3 = t3 & uint32(mask)
-	t4 = t4 & uint32(mask)
-	t5 = t5 & uint32(mask)
-	t6 = t6 & uint32(mask)
-	t7 = t7 & uint32(mask)
-	t8 = t8 & uint32(mask)
-	t9 = t9 & uint32(mask)
+	t0 = t0 + m*977
+	t1 = (t0 >> fieldBase) + t1 + (m << 6)
+	t0 = t0 & fieldBaseMask
+	t2 = (t1 >> fieldBase) + t2
+	t1 = t1 & fieldBaseMask
+	t3 = (t2 >> fieldBase) + t3
+	t2 = t2 & fieldBaseMask
+	t4 = (t3 >> fieldBase) + t4
+	t3 = t3 & fieldBaseMask
+	t5 = (t4 >> fieldBase) + t5
+	t4 = t4 & fieldBaseMask
+	t6 = (t5 >> fieldBase) + t6
+	t5 = t5 & fieldBaseMask
+	t7 = (t6 >> fieldBase) + t7
+	t6 = t6 & fieldBaseMask
+	t8 = (t7 >> fieldBase) + t8
+	t7 = t7 & fieldBaseMask
+	t9 = (t8 >> fieldBase) + t9
+	t8 = t8 & fieldBaseMask
+	t9 = t9 & fieldMSBMask // Remove potential multiple of 2^256.

 	// Finally, set the normalized and reduced words.
 	f.n[0] = t0
--- a/btcec/field_test.go
+++ b/btcec/field_test.go
@@ -247,17 +247,75 @@ func TestNormalize(t *testing.T) {
 			[10]uint32{0xffffffff, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0xffffffc0, 0x3fffc0},
 			[10]uint32{0x000003d0, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000},
 		},
+		// Prime with field representation such that the initial
+		// reduction does not result in a carry to bit 256.
+		//
+		// 2^256 - 4294968273 (secp256k1 prime)
+		{
+			[10]uint32{0x03fffc2f, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
+			[10]uint32{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to its first
+		// word and does not result in a carry to bit 256.
+		//
+		// 2^256 - 4294968272 (secp256k1 prime + 1)
+		{
+			[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
+			[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to its second
+		// word and does not result in a carry to bit 256.
+		//
+		// 2^256 - 4227859409 (secp256k1 prime + 0x4000000)
+		{
+			[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x003fffff},
+			[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to a carry to
+		// bit 256, but would not be without the carry.  These values
+		// come from the fact that P is 2^256 - 4294968273 and 977 is
+		// the low order word in the internal field representation.
+		//
+		// 2^256 * 5 - 17179869185 (i.e. 5 * 2^256 - 2^34 - 1)
+		{
+			[10]uint32{0x03ffffff, 0x03fffeff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x0013fffff},
+			[10]uint32{0x00001314, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000000},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to both a
+		// carry to bit 256 and the first word.
+		{
+			[10]uint32{0x03fffc30, 0x03ffffbf, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
+			[10]uint32{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to both a
+		// carry to bit 256 and the second word.
+		//
+		{
+			[10]uint32{0x03fffc2f, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x3ffffff, 0x07ffffff, 0x003fffff},
+			[10]uint32{0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000000, 0x00000000, 0x00000001},
+		},
+		// Value larger than P that reduces to a value which is still
+		// larger than P when it has a magnitude of 1 due to a carry to
+		// bit 256 and the first and second words.
+		//
+		{
+			[10]uint32{0x03fffc30, 0x03ffffc0, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x07ffffff, 0x003fffff},
+			[10]uint32{0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001},
+		},
 	}

 	t.Logf("Running %d tests", len(tests))
 	for i, test := range tests {
 		f := new(fieldVal)
-		for rawIntIdx := 0; rawIntIdx < len(test.raw); rawIntIdx++ {
-			f.n[rawIntIdx] = test.raw[rawIntIdx]
-		}
+		f.n = test.raw
 		f.Normalize()
 		if !reflect.DeepEqual(f.n, test.normalized) {
-			t.Errorf("fieldVal.Set #%d wrong normalized result\n"+
+			t.Errorf("fieldVal.Normalize #%d wrong result\n"+
 				"got: %x\nwant: %x", i, f.n, test.normalized)
 			continue
 		}