mirror of
https://gitlab.torproject.org/tpo/core/tor.git
synced 2025-02-24 22:58:50 +01:00
Merge remote-tracking branch 'tor-github/pr/318'
This commit is contained in:
commit
8294c40c96
5 changed files with 146 additions and 0 deletions
3
changes/bug27428
Normal file
3
changes/bug27428
Normal file
|
@ -0,0 +1,3 @@
|
|||
o Minor bugfixes (torrc):
|
||||
- Tor now validates that the ContactInfo config option is valid UTF-8
|
||||
when parsing torrc. Fixes bug 27428; bugfix on 0.0.8pre1.
|
|
@ -3389,6 +3389,9 @@ options_validate(or_options_t *old_options, or_options_t *options,
|
|||
log_notice(LD_CONFIG, "Your ContactInfo config option is not set. "
|
||||
"Please consider setting it, so we can contact you if your server is "
|
||||
"misconfigured or something else goes wrong.");
|
||||
const char *ContactInfo = options->ContactInfo;
|
||||
if (ContactInfo && !string_is_utf8(ContactInfo, strlen(ContactInfo)))
|
||||
REJECT("ContactInfo config option must be UTF-8.");
|
||||
|
||||
/* Special case on first boot if no Log options are given. */
|
||||
if (!options->Logs && !options->RunAsDaemon && !from_setconf) {
|
||||
|
|
|
@ -451,3 +451,93 @@ string_is_C_identifier(const char *string)
|
|||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/** A byte whose <b>x</b> most significant bits are set and whose remaining
 * bits are clear. <b>x</b> is expected to be in the range 0..8. */
#define TOP_BITS(x) ((uint8_t)~(UINT8_MAX >> (x)))
/** A byte whose <b>x</b> least significant bits are set and whose remaining
 * bits are clear. <b>x</b> is expected to be in the range 0..8. */
#define LOW_BITS(x) ((uint8_t)(UINT8_MAX >> (8 - (x))))
|
||||
|
||||
/** Treat <b>b</b> as the first byte of a UTF-8 encoded character and return
 * the length in bytes (1 to 4) of that character, counting <b>b</b> itself.
 * Return 0 if <b>b</b> cannot begin a character.
 */
static uint8_t
bytes_in_char(uint8_t b)
{
  if (!(b & TOP_BITS(1)))
    return 1; // 0xxxxxxx: a single-byte character, i.e. ASCII.

  // The leader of a <b>width</b>-byte character has its top <b>width</b>
  // bits set and the next bit clear: 110xxxxx, 1110xxxx, 11110xxx.
  for (uint8_t width = 2; width <= 4; width++) {
    if ((b & TOP_BITS(width + 1)) == TOP_BITS(width))
      return width;
  }

  // What remains is 10xxxxxx (a continuation byte) or 11111xxx; neither
  // can start a character.
  return 0;
}
|
||||
|
||||
/** Return true iff <b>b</b> has the form 10xxxxxx, which is the bit pattern
 * of a UTF-8 continuation byte. */
static bool
is_continuation_byte(uint8_t b)
{
  // Keep only the top two bits and require them to be exactly "10".
  return (b & TOP_BITS(2)) == TOP_BITS(1);
}
|
||||
|
||||
/** Return true iff the <b>len</b> bytes starting at <b>c</b> encode exactly
 * one valid UTF-8 character.
 *
 * The caller has already derived <b>len</b> from the leading byte c[0]; this
 * function checks the continuation bytes, decodes the codepoint, and rejects
 * overlong encodings, UTF-16 surrogates and values above U+10FFFF.
 */
static bool
validate_char(const uint8_t *c, uint8_t len)
{
  // A one-byte character was fully checked when its length was determined.
  if (len == 1)
    return true;

  // The leading byte contributes its low (7 - len) bits to the codepoint.
  uint32_t cp = c[0] & LOW_BITS(7 - len);

  // Every following byte must be 10xxxxxx and contributes six more bits.
  for (uint8_t idx = 1; idx < len; idx++) {
    const uint8_t byte = c[idx];
    if (!is_continuation_byte(byte))
      return false;
    cp = (cp << 6) | (uint32_t)(byte & LOW_BITS(6));
  }

  // Smallest codepoint that really needs <b>len</b> bytes; anything below it
  // is an overlong encoding that would have fit in a shorter sequence.
  uint32_t min_cp = 0;
  switch (len) {
    case 2:
      min_cp = 0x80;
      break;
    case 3:
      min_cp = 0x800;
      break;
    case 4:
      min_cp = 0x10000;
      break;
    default:
      break;
  }
  if (cp < min_cp)
    return false;

  // U+D800..U+DFFF is reserved for UTF-16 surrogate pairs.
  if (cp >= 0xd800 && cp <= 0xdfff)
    return false;

  // Finally, the codepoint must not exceed the Unicode maximum.
  return cp <= 0x10ffff;
}
|
||||
|
||||
/** Return true iff the first <b>len</b> bytes of <b>str</b> form a valid
 * UTF-8 string.
 *
 * Exactly <b>len</b> bytes are examined: NUL bytes inside that range are
 * validated like any other ASCII byte, and a multi-byte character that would
 * extend past <b>len</b> makes the string invalid.
 *
 * A NULL <b>str</b> is accepted only when <b>len</b> is 0; NULL with a
 * nonzero length is rejected instead of being dereferenced.
 */
int
string_is_utf8(const char *str, size_t len)
{
  // Never read through a NULL pointer: it can only stand for "no bytes".
  if (!str)
    return len == 0;

  for (size_t i = 0; i < len;) {
    // Convert explicitly: plain char may be signed, the helpers want bytes.
    uint8_t num_bytes = bytes_in_char((uint8_t)str[i]);
    if (num_bytes == 0) // Invalid leading byte found.
      return false;

    // The whole character must lie inside the buffer. Comparing against the
    // bytes that remain (i < len here) cannot wrap around, unlike i + n.
    if (num_bytes > len - i)
      return false;

    // Validate the continuation bytes in this multi-byte character,
    // and advance to the next character in the string.
    if (!validate_char((const uint8_t *)&str[i], num_bytes))
      return false;
    i += num_bytes;
  }
  return true;
}
|
||||
|
|
|
@ -52,4 +52,6 @@ const char *find_str_at_start_of_line(const char *haystack,
|
|||
|
||||
int string_is_C_identifier(const char *string);
|
||||
|
||||
int string_is_utf8(const char *str, size_t len);
|
||||
|
||||
#endif /* !defined(TOR_UTIL_STRING_H) */
|
||||
|
|
|
@ -4012,6 +4012,53 @@ test_util_string_is_C_identifier(void *ptr)
|
|||
;
|
||||
}
|
||||
|
||||
static void
|
||||
test_util_string_is_utf8(void *ptr)
|
||||
{
|
||||
(void)ptr;
|
||||
|
||||
tt_int_op(1, OP_EQ, string_is_utf8(NULL, 0));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("", 1));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\uFEFF", 3));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\uFFFE", 3));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("ascii\x7f\n", 7));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("Risqu\u00e9=1", 9));
|
||||
|
||||
// Validate exactly 'len' bytes.
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\0\x80", 2));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("Risqu\u00e9=1", 6));
|
||||
|
||||
// Reject sequences with missing bytes.
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\x80", 1));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xc2", 1));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xc2 ", 2));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xe1\x80", 2));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xe1\x80 ", 3));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xf1\x80\x80", 3));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xf1\x80\x80 ", 4));
|
||||
|
||||
// Reject encodings that are overly long.
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xc1\xbf", 2));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xc2\x80", 2));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xe0\x9f\xbf", 3));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xe0\xa0\x80", 3));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xf0\x8f\xbf\xbf", 4));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x80\x80", 4));
|
||||
|
||||
// Reject UTF-16 surrogate halves.
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xed\x9f\xbf", 3));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x80", 3));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
|
||||
|
||||
// The maximum legal codepoint, 10FFFF.
|
||||
tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
|
||||
tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
|
||||
|
||||
done:
|
||||
;
|
||||
}
|
||||
|
||||
static void
|
||||
test_util_asprintf(void *ptr)
|
||||
{
|
||||
|
@ -6409,6 +6456,7 @@ struct testcase_t util_tests[] = {
|
|||
UTIL_TEST(clamp_double_to_int64, 0),
|
||||
UTIL_TEST(find_str_at_start_of_line, 0),
|
||||
UTIL_TEST(string_is_C_identifier, 0),
|
||||
UTIL_TEST(string_is_utf8, 0),
|
||||
UTIL_TEST(asprintf, 0),
|
||||
UTIL_TEST(listdir, 0),
|
||||
UTIL_TEST(parent_dir, 0),
|
||||
|
|
Loading…
Add table
Reference in a new issue