mirror of
https://github.com/ElementsProject/lightning.git
synced 2024-11-19 01:43:36 +01:00
ccan: add UTF-8 module for checking alias fields.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
fecef618bc
commit
2639b1e9a9
8
Makefile
8
Makefile
@ -87,7 +87,8 @@ CCAN_OBJS := \
|
||||
ccan-tal-str.o \
|
||||
ccan-tal.o \
|
||||
ccan-time.o \
|
||||
ccan-timer.o
|
||||
ccan-timer.o \
|
||||
ccan-utf8.o
|
||||
|
||||
CCAN_HEADERS := \
|
||||
$(CCANDIR)/config.h \
|
||||
@ -153,7 +154,8 @@ CCAN_HEADERS := \
|
||||
$(CCANDIR)/ccan/tcon/tcon.h \
|
||||
$(CCANDIR)/ccan/time/time.h \
|
||||
$(CCANDIR)/ccan/timer/timer.h \
|
||||
$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h
|
||||
$(CCANDIR)/ccan/typesafe_cb/typesafe_cb.h \
|
||||
$(CCANDIR)/ccan/utf8/utf8.h
|
||||
|
||||
ALL_GEN_HEADERS += gen_version.h
|
||||
|
||||
@ -577,3 +579,5 @@ ccan-rbuf.o: $(CCANDIR)/ccan/rbuf/rbuf.c
|
||||
$(CC) $(CFLAGS) -c -o $@ $<
|
||||
ccan-str-base32.o: $(CCANDIR)/ccan/str/base32/base32.c
|
||||
$(CC) $(CFLAGS) -c -o $@ $<
|
||||
ccan-utf8.o: $(CCANDIR)/ccan/utf8/utf8.c
|
||||
$(CC) $(CFLAGS) -c -o $@ $<
|
||||
|
@ -1,3 +1,3 @@
|
||||
CCAN imported from http://ccodearchive.net.
|
||||
|
||||
CCAN version: init-2432-gd830ca0e
|
||||
CCAN version: init-2434-gac8694de
|
||||
|
@ -70,7 +70,6 @@ int main(int argc, char *argv[])
|
||||
if (strcmp(argv[1], "depends") == 0) {
|
||||
printf("ccan/array_size\n");
|
||||
printf("ccan/ilog\n");
|
||||
printf("ccan/likely\n");
|
||||
printf("ccan/list\n");
|
||||
printf("ccan/time\n");
|
||||
return 0;
|
||||
|
@ -2,7 +2,6 @@
|
||||
#include <ccan/timer/timer.h>
|
||||
#include <ccan/array_size/array_size.h>
|
||||
#include <ccan/ilog/ilog.h>
|
||||
#include <ccan/likely/likely.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
1
ccan/ccan/utf8/LICENSE
Symbolic link
1
ccan/ccan/utf8/LICENSE
Symbolic link
@ -0,0 +1 @@
|
||||
../../licenses/BSD-MIT
|
48
ccan/ccan/utf8/_info
Normal file
48
ccan/ccan/utf8/_info
Normal file
@ -0,0 +1,48 @@
|
||||
#include "config.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
* utf8 - Simple routines to encode/decode valid UTF-8.
|
||||
*
|
||||
* This code contains routines to encode and decode UTF-8 characters.
|
||||
* Table and test code stolen entirely from:
|
||||
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
|
||||
* <https://github.com/chansen/c-utf8-valid>
|
||||
*
|
||||
* Example:
|
||||
* int main(int argc, char *argv[])
|
||||
* {
|
||||
* size_t i;
|
||||
* struct utf8_state utf8_state = UTF8_STATE_INIT;
|
||||
* bool decoded = true;
|
||||
*
|
||||
* for (i = 0; i < strlen(argv[1]); i++) {
|
||||
* decoded = utf8_decode(&utf8_state, argv[1][i]);
|
||||
* if (decoded) {
|
||||
* if (errno != 0)
|
||||
* err(1, "Invalid UTF8 char %zu-%zu",
|
||||
* i - utf8_state.used_len, i);
|
||||
* printf("Character %u\n", utf8_state.c);
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* if (!decoded)
|
||||
* errx(1, "Incomplete UTF8");
|
||||
* return 0;
|
||||
* }
|
||||
*
|
||||
* License: BSD-MIT
|
||||
*/
|
||||
/*
 * Module metadata entry point.  The only query answered is "depends";
 * nothing is printed for it (no dependencies are listed) and the exit
 * status is 0.  Any other invocation exits with status 1.
 */
int main(int argc, char *argv[])
{
	/* Exactly one argument is required, and it must be "depends". */
	if (argc == 2 && strcmp(argv[1], "depends") == 0)
		return 0;

	return 1;
}
|
266
ccan/ccan/utf8/test/run-decode.c
Normal file
266
ccan/ccan/utf8/test/run-decode.c
Normal file
@ -0,0 +1,266 @@
|
||||
#include <ccan/utf8/utf8.h>
|
||||
/* Include the C files directly. */
|
||||
#include <ccan/utf8/utf8.c>
|
||||
#include <ccan/tap/tap.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */
|
||||
|
||||
/*
|
||||
* UTF-8
|
||||
*
|
||||
* U+0000..U+007F 00..7F
|
||||
* n C0..C1 80..BF
|
||||
* U+0080..U+07FF C2..DF 80..BF
|
||||
* n E0 80..9F 80..BF
|
||||
* U+0800..U+D7FF E0..ED A0..9F 80..BF
|
||||
* U+D800..U+DFFF s ED A0..BF 80..BF
|
||||
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
* n F0 80..8F 80..BF 80..BF
|
||||
* U+0800..U+FFFF F0 80..8F A0..BF 80..BF
|
||||
* U+10000..U+10FFFF F0..F4 90..8F 80..BF 80..BF
|
||||
*
|
||||
* U-110000..U-1FFFFF x F4..F7 90..BF 80..BF 80..BF
|
||||
* xn F8 80..87 80..BF 80..BF 80..BF
|
||||
* U-200000..U-3FFFFFF x F8..FB 88..BF 80..BF 80..BF 80..BF
|
||||
* xn FC 80..83 80..BF 80..BF 80..BF 80..BF
|
||||
* U-4000000..U-7FFFFFFF x FC..FD 84..BF 80..BF 80..BF 80..BF 80..BF
|
||||
*
|
||||
* Legend:
|
||||
* n = Non-shortest form
|
||||
* s = Surrogates
|
||||
* x = Codepoints outside Unicode codespace
|
||||
*/
|
||||
|
||||
/*
|
||||
* Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
|
||||
* to the given sequence length [1, 6]. This routine can be used to
|
||||
* produce well-formed and ill-formed UTF-8.
|
||||
*
|
||||
* To encode a Unicode scalar value to a well-formed representation:
|
||||
*
|
||||
* [U+0000, U+007F] should be encoded to a sequence length of 1
|
||||
* [U+0080, U+07FF] should be encoded to a sequence length of 2
|
||||
* [U+0800, U+D7FF] should be encoded to a sequence length of 3
|
||||
* [U+E000, U+FFFF] should be encoded to a sequence length of 3
|
||||
* [U+10000, U+10FFFF] should be encoded to a sequence length of 4
|
||||
*
|
||||
* To encode a Unicode scalar value to non-shortest form representation:
|
||||
*
|
||||
* [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
|
||||
* [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
|
||||
* [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
|
||||
*
|
||||
* To encode an ordinal outside of Unicode codespace:
|
||||
*
|
||||
* [110000, 1FFFFF] can be encoded to a sequence length of 4
|
||||
* [200000, 3FFFFFF] can be encoded to a sequence length of 5
|
||||
* [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
|
||||
*/
|
||||
|
||||
/*
 * Write @ord to @dst as a @len-byte sequence using the UTF-8 bit layout.
 * No validity checks are made, so overlong forms, surrogates and
 * out-of-range ordinals can be produced on purpose.
 *
 * @len must be in [1, 6] and @ord must fit in the payload bits available
 * to a @len-byte sequence; both are asserted.  Exactly @len bytes are
 * written and no NUL terminator is appended.
 *
 * Returns @dst.
 */
static char *
encode_ord(uint32_t ord, size_t len, char *dst) {
	/* Tag bits of the leading byte, indexed by (len - 1). */
	static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
	/*
	 * Exclusive upper bound for ord, indexed by (len - 1).  The constants
	 * are unsigned: 1 << 31 on a 32-bit int shifts into the sign bit,
	 * which is undefined behaviour.
	 */
	static const uint32_t kMax[6] = {
		UINT32_C(1) << 7,  UINT32_C(1) << 11, UINT32_C(1) << 16,
		UINT32_C(1) << 21, UINT32_C(1) << 26, UINT32_C(1) << 31
	};
	size_t i;

	assert(len >= 1);
	assert(len <= 6);
	assert(ord < kMax[len - 1]);

	/* Continuation bytes are filled from the tail, six bits at a time. */
	for (i = len - 1; i > 0; i--) {
		dst[i] = (char)((ord & 0x3F) | 0x80);
		ord >>= 6;
	}
	/* Whatever is left of ord goes into the leading byte. */
	dst[0] = (char)(ord | kMask[len - 1]);
	return dst;
}
|
||||
|
||||
static int utf8_check(const char *src, size_t len)
|
||||
{
|
||||
bool decoded = false;
|
||||
struct utf8_state utf8_state = UTF8_STATE_INIT;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
decoded = utf8_decode(&utf8_state, src[i]);
|
||||
if (decoded) {
|
||||
if (errno != 0)
|
||||
return errno;
|
||||
}
|
||||
}
|
||||
if (!decoded)
|
||||
return EMLINK;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Run utf8_check() over @len bytes of @src and record one TAP result:
 * the test passes when the returned code equals @exp_err.  @line is the
 * caller's source line and only appears in the test description.
 */
static void
test_utf8(const char *src, size_t len, int exp_err, unsigned line) {
	int actual;

	assert(len <= 255);

	actual = utf8_check(src, len);
	ok(actual == exp_err, "Got result %i, expected %i at line %u",
	   actual, exp_err, line);
}
|
||||
|
||||
#define TEST_UTF8(src, len, exp) \
|
||||
test_utf8(src, len, exp, __LINE__)
|
||||
|
||||
|
||||
/*
 * Well-formed encodings must decode with errno 0; a sequence cut short
 * before its final byte must be reported as EMLINK by utf8_check().
 * Truncated prefixes are only tested once per distinct prefix, i.e. when
 * the dropped low bits of the ordinal are all zero.
 */
static void
test_unicode_scalar_value(void) {
	uint32_t ord;
	char src[4];

	/*
	 * One-byte forms [U+0000, U+007F].  U+0000 is the exception: it is
	 * expected to be rejected with ERANGE.
	 */
	for (ord = 0x0000; ord < 0x0080; ord++) {
		encode_ord(ord, 1, src);
		TEST_UTF8(src, 1, ord == 0 ? ERANGE : 0);
	}

	/* Two-byte forms [U+0080, U+07FF]. */
	for (ord = 0x0080; ord < 0x0800; ord++) {
		encode_ord(ord, 2, src);
		TEST_UTF8(src, 2, 0);
	}

	/*
	 * Three-byte forms [U+0800, U+D7FF].
	 *
	 * NOTE: the walk stops at the first surrogate (U+D800), so
	 * [U+E000, U+FFFF] is not exercised by this loop.  The total passed
	 * to plan_tests() in main() is sized for exactly this coverage.
	 */
	for (ord = 0x0800; ord < 0xD800; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, 0);
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}

	/* Four-byte forms [U+10000, U+10FFFF]. */
	for (ord = 0x10000; ord < 0x110000; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, 0);
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 3, EMLINK);
		if ((ord & 0xFFF) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}
}
|
||||
|
||||
/*
 * Overlong ("non-shortest form") encodings must be reported as EFBIG,
 * and their truncated prefixes as EMLINK.  Every range starts at U+0001:
 * ordinal 0 is not part of these loops.
 */
static void
test_non_shortest_form(void) {
	uint32_t ord;
	char src[4];

	/* [U+0001, U+007F] stretched to two bytes. */
	for (ord = 0x0001; ord < 0x0080; ord++) {
		encode_ord(ord, 2, src);
		TEST_UTF8(src, 2, EFBIG);
	}

	/* [U+0001, U+07FF] stretched to three bytes. */
	for (ord = 0x0001; ord < 0x0800; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, EFBIG);
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}

	/* [U+0001, U+FFFF] stretched to four bytes. */
	for (ord = 0x0001; ord < 0x10000; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, EFBIG);
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 3, EMLINK);
		if ((ord & 0xFFF) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}
}
|
||||
|
||||
/*
 * Four-byte sequences for ordinals above U+10FFFF, i.e.
 * [0x110000, 0x1FFFFF], must be reported as ERANGE; their truncated
 * prefixes as EMLINK.
 */
static void
test_non_unicode(void) {
	uint32_t ord;
	char src[4];

	for (ord = 0x110000; ord < 0x200000; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, ERANGE);
		/* One truncation test per distinct 3-byte / 2-byte prefix. */
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 3, EMLINK);
		if ((ord & 0xFFF) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}
}
|
||||
|
||||
/*
 * Three-byte encodings of the surrogate range [U+D800, U+DFFF] must be
 * reported as ERANGE; their two-byte truncations as EMLINK.
 */
static void
test_surrogates(void) {
	uint32_t ord;
	char src[4];

	for (ord = 0xD800; ord < 0xE000; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, ERANGE);
		/* One truncation test per distinct 2-byte prefix. */
		if ((ord & 0x3F) == 0)
			TEST_UTF8(src, 2, EMLINK);
	}
}
|
||||
|
||||
/*
 * A misplaced continuation byte [\x80, \xBF] at the start of input must
 * be reported as EINVAL.
 */
static void
test_continuations(void) {
	unsigned int byte;
	char src[4];

	for (byte = 0x80; byte < 0xC0; byte++) {
		src[0] = (char)byte;
		TEST_UTF8(src, 1, EINVAL);
	}
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
plan_tests(2190906);
|
||||
test_unicode_scalar_value();
|
||||
test_surrogates();
|
||||
test_non_shortest_form();
|
||||
test_non_unicode();
|
||||
test_continuations();
|
||||
|
||||
return exit_status();
|
||||
}
|
42
ccan/ccan/utf8/test/run-encode-decode.c
Normal file
42
ccan/ccan/utf8/test/run-encode-decode.c
Normal file
@ -0,0 +1,42 @@
|
||||
#include <ccan/utf8/utf8.h>
|
||||
/* Include the C files directly. */
|
||||
#include <ccan/utf8/utf8.c>
|
||||
#include <ccan/tap/tap.h>
|
||||
#include <assert.h>
|
||||
|
||||
static bool utf8_check(const char *src, size_t len)
|
||||
{
|
||||
bool decoded = false;
|
||||
struct utf8_state utf8_state = UTF8_STATE_INIT;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
decoded = utf8_decode(&utf8_state, src[i]);
|
||||
if (decoded) {
|
||||
if (errno != 0)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!decoded)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int i;
|
||||
char dest[UTF8_MAX_LEN];
|
||||
|
||||
plan_tests(0x10FFFF - (0xDFFF - 0xD7FF + 2));
|
||||
|
||||
for (i = 1; i < 0x10FFFF; i++) {
|
||||
int len;
|
||||
if (i >= 0xD7FF && i <= 0xDFFF)
|
||||
continue;
|
||||
len = utf8_encode(i, dest);
|
||||
assert(len != 0);
|
||||
ok1(utf8_check(dest, len));
|
||||
}
|
||||
|
||||
return exit_status();
|
||||
}
|
30
ccan/ccan/utf8/test/run-encode.c
Normal file
30
ccan/ccan/utf8/test/run-encode.c
Normal file
@ -0,0 +1,30 @@
|
||||
#include <ccan/utf8/utf8.h>
|
||||
/* Include the C files directly. */
|
||||
#include <ccan/utf8/utf8.c>
|
||||
#include <ccan/tap/tap.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int i;
|
||||
char dest[UTF8_MAX_LEN];
|
||||
|
||||
plan_tests(1 + 0x10FFFF + 1);
|
||||
|
||||
for (i = 0; i < 1; i++)
|
||||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
|
||||
for (; i <= 0x7F; i++)
|
||||
ok1(utf8_encode(i, dest) == 1);
|
||||
for (; i <= 0x7FF; i++)
|
||||
ok1(utf8_encode(i, dest) == 2);
|
||||
for (; i <= 0xD7FF; i++)
|
||||
ok1(utf8_encode(i, dest) == 3);
|
||||
for (; i <= 0xDFFF; i++)
|
||||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
|
||||
for (; i <= 0xFFFF; i++)
|
||||
ok1(utf8_encode(i, dest) == 3);
|
||||
for (; i <= 0x10FFFF; i++)
|
||||
ok1(utf8_encode(i, dest) == 4);
|
||||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
|
||||
|
||||
return exit_status();
|
||||
}
|
176
ccan/ccan/utf8/utf8.c
Normal file
176
ccan/ccan/utf8/utf8.c
Normal file
@ -0,0 +1,176 @@
|
||||
/* MIT (BSD) license - see LICENSE file for details */
|
||||
#include <ccan/utf8/utf8.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* I loved this table, so I stole it: */
|
||||
/*
|
||||
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
|
||||
* <https://github.com/chansen/c-utf8-valid>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
* UTF-8 Encoding Form
|
||||
*
|
||||
* U+0000..U+007F 0xxxxxxx <= 7 bits
|
||||
* U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits
|
||||
* U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits
|
||||
* U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
|
||||
*
|
||||
*
|
||||
* U+0000..U+007F 00..7F
|
||||
* N C0..C1 80..BF 1100000x 10xxxxxx
|
||||
* U+0080..U+07FF C2..DF 80..BF
|
||||
* N E0 80..9F 80..BF 11100000 100xxxxx
|
||||
* U+0800..U+0FFF E0 A0..BF 80..BF
|
||||
* U+1000..U+CFFF E1..EC 80..BF 80..BF
|
||||
* U+D000..U+D7FF ED 80..9F 80..BF
|
||||
* S ED A0..BF 80..BF 11101101 101xxxxx
|
||||
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
||||
* N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
|
||||
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
||||
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
||||
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
|
||||
*
|
||||
* Legend:
|
||||
* N = Non-shortest form
|
||||
* S = Surrogates
|
||||
*/
|
||||
bool utf8_decode(struct utf8_state *utf8_state, char c)
|
||||
{
|
||||
if (utf8_state->used_len == utf8_state->total_len) {
|
||||
utf8_state->used_len = 1;
|
||||
/* First character in sequence. */
|
||||
if (((unsigned char)c & 0x80) == 0) {
|
||||
/* ASCII, easy. */
|
||||
utf8_state->total_len = 1;
|
||||
utf8_state->c = c;
|
||||
goto finished_decoding;
|
||||
} else if (((unsigned char)c & 0xE0) == 0xC0) {
|
||||
utf8_state->total_len = 2;
|
||||
utf8_state->c = ((unsigned char)c & 0x1F);
|
||||
return false;
|
||||
} else if (((unsigned char)c & 0xF0) == 0xE0) {
|
||||
utf8_state->total_len = 3;
|
||||
utf8_state->c = ((unsigned char)c & 0x0F);
|
||||
return false;
|
||||
} else if (((unsigned char)c & 0xF8) == 0xF0) {
|
||||
utf8_state->total_len = 4;
|
||||
utf8_state->c = ((unsigned char)c & 0x07);
|
||||
return false;
|
||||
}
|
||||
goto bad_encoding;
|
||||
}
|
||||
|
||||
if (((unsigned char)c & 0xC0) != 0x80)
|
||||
goto bad_encoding;
|
||||
|
||||
utf8_state->c <<= 6;
|
||||
utf8_state->c |= ((unsigned char)c & 0x3F);
|
||||
|
||||
utf8_state->used_len++;
|
||||
if (utf8_state->used_len == utf8_state->total_len)
|
||||
goto finished_decoding;
|
||||
return false;
|
||||
|
||||
finished_decoding:
|
||||
if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
|
||||
errno = ERANGE;
|
||||
/* The UTF-16 "surrogate range": illegal in UTF-8 */
|
||||
else if (utf8_state->total_len == 3
|
||||
&& (utf8_state->c & 0xFFFFF800) == 0x0000D800)
|
||||
errno = ERANGE;
|
||||
else {
|
||||
int min_bits;
|
||||
switch (utf8_state->total_len) {
|
||||
case 1:
|
||||
min_bits = 0;
|
||||
break;
|
||||
case 2:
|
||||
min_bits = 7;
|
||||
break;
|
||||
case 3:
|
||||
min_bits = 11;
|
||||
break;
|
||||
case 4:
|
||||
min_bits = 16;
|
||||
break;
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
if ((utf8_state->c >> min_bits) == 0)
|
||||
errno = EFBIG;
|
||||
else
|
||||
errno = 0;
|
||||
}
|
||||
return true;
|
||||
|
||||
bad_encoding:
|
||||
utf8_state->total_len = utf8_state->used_len;
|
||||
errno = EINVAL;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Encode @point as UTF-8 into @dest.
 *
 * Returns the number of bytes written (1 to 4).  For zero, a surrogate
 * [0xD800, 0xDFFF] or a value above 0x10FFFF it returns 0 with errno set
 * to ERANGE and leaves @dest untouched.  No NUL terminator is written.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
{
	/* Reject everything that is not an encodable code point up front. */
	if (point == 0
	    || (point >= 0xD800 && point <= 0xDFFF)
	    || point > 0x10FFFF) {
		errno = ERANGE;
		return 0;
	}

	if (point < 0x80) {
		/* 0xxxxxxx */
		dest[0] = point;
		return 1;
	}

	if (point < 0x800) {
		/* 110xxxxx 10xxxxxx */
		dest[0] = 0xC0 | (point >> 6);
		dest[1] = 0x80 | (point & 0x3F);
		return 2;
	}

	if (point < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		dest[0] = 0xE0 | (point >> 12);
		dest[1] = 0x80 | ((point >> 6) & 0x3F);
		dest[2] = 0x80 | (point & 0x3F);
		return 3;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	dest[0] = 0xF0 | (point >> 18);
	dest[1] = 0x80 | ((point >> 12) & 0x3F);
	dest[2] = 0x80 | ((point >> 6) & 0x3F);
	dest[3] = 0x80 | (point & 0x3F);
	return 4;
}
|
54
ccan/ccan/utf8/utf8.h
Normal file
54
ccan/ccan/utf8/utf8.h
Normal file
@ -0,0 +1,54 @@
|
||||
/* MIT (BSD) license - see LICENSE file for details */
|
||||
#ifndef CCAN_UTF8_H
|
||||
#define CCAN_UTF8_H
|
||||
#include <inttypes.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Unicode is limited to 21 bits. */
|
||||
#define UTF8_MAX_LEN 4
|
||||
|
||||
/* Incremental decoder state, carried between calls to utf8_decode(). */
struct utf8_state {
	/* Number of bytes the sequence in progress is expected to span. */
	uint16_t total_len;
	/* Number of bytes of that sequence consumed so far. */
	uint16_t used_len;
	/* Code point accumulated so far; complete once decoding finishes. */
	uint32_t c;
};
|
||||
|
||||
#define UTF8_STATE_INIT { 0, 0, 0 }
|
||||
|
||||
static inline void utf8_state_init(struct utf8_state *utf8_state)
|
||||
{
|
||||
memset(utf8_state, 0, sizeof(*utf8_state));
|
||||
}
|
||||
|
||||
/**
|
||||
* utf8_decode - continue UTF8 decoding with this character.
|
||||
* @utf8_state - initialized UTF8 state.
|
||||
* @c - the character.
|
||||
*
|
||||
* Returns false if it needs another character to give results.
|
||||
* Otherwise returns true, @utf8_state can be reused without initialization,
|
||||
* and sets errno:
|
||||
* 0: success
|
||||
* EINVAL: bad encoding.
|
||||
* EFBIG: not a minimal encoding.
|
||||
* ERANGE: encoding of invalid character.
|
||||
*
|
||||
* You can extract the character from @utf8_state->c; @utf8_state->used_len
|
||||
* indicates how many characters have been consumed.
|
||||
*/
|
||||
bool utf8_decode(struct utf8_state *utf8_state, char c);
|
||||
|
||||
/**
|
||||
* utf8_encode - encode a point into UTF8.
|
||||
* @point - Unicode code point to encode.
|
||||
* @dest - buffer to fill.
|
||||
*
|
||||
* Returns 0 if point was invalid, otherwise bytes of dest used.
|
||||
* Sets errno to ERANGE if point was invalid.
|
||||
*/
|
||||
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);
|
||||
#endif /* CCAN_UTF8_H */
|
Loading…
Reference in New Issue
Block a user