From ca2e0fcd89599819b0a808aff5a8125df26c36d5 Mon Sep 17 00:00:00 2001 From: default Date: Tue, 30 May 2023 19:49:30 +0200 Subject: [PATCH] Backport from xs. --- xs_unicode.h | 99 +++++++++++++++++++++++++++++++++++++++------------- xs_version.h | 2 +- 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/xs_unicode.h b/xs_unicode.h index 6f78d58..2f081ad 100644 --- a/xs_unicode.h +++ b/xs_unicode.h @@ -5,42 +5,91 @@ #define _XS_UNICODE_H xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint); + char *xs_utf8_dec(const char *str, unsigned int *cpoint); #ifdef XS_IMPLEMENTATION -/** utf-8 **/ + +char *_xs_utf8_enc(char buf[4], unsigned int cpoint) +/* encodes an Unicode codepoint to utf-8 into buf and returns the new position */ +{ + unsigned char *p = (unsigned char *)buf; + + if (cpoint < 0x80) /* 1 byte char */ + *p++ = cpoint & 0xff; + else { + if (cpoint < 0x800) /* 2 byte char */ + *p++ = 0xc0 | (cpoint >> 6); + else { + if (cpoint < 0x10000) /* 3 byte char */ + *p++ = 0xe0 | (cpoint >> 12); + else { /* 4 byte char */ + *p++ = 0xf0 | (cpoint >> 18); + *p++ = 0x80 | ((cpoint >> 12) & 0x3f); + } + + *p++ = 0x80 | ((cpoint >> 6) & 0x3f); + } + + *p++ = 0x80 | (cpoint & 0x3f); + } + + return (char *)p; +} + xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint) -/* encodes an Unicode codepoint to utf8 */ +/* encodes an Unicode codepoint to utf-8 into str */ { - unsigned char tmp[4]; - int n = 0; + char tmp[4], *p; - if (cpoint < 0x80) - tmp[n++] = cpoint & 0xff; - else - if (cpoint < 0x800) { - tmp[n++] = 0xc0 | (cpoint >> 6); - tmp[n++] = 0x80 | (cpoint & 0x3f); - } - else - if (cpoint < 0x10000) { - tmp[n++] = 0xe0 | (cpoint >> 12); - tmp[n++] = 0x80 | ((cpoint >> 6) & 0x3f); - tmp[n++] = 0x80 | (cpoint & 0x3f); - } - else - if (cpoint < 0x200000) { - tmp[n++] = 0xf0 | (cpoint >> 18); - tmp[n++] = 0x80 | ((cpoint >> 12) & 0x3f); - tmp[n++] = 0x80 | ((cpoint >> 6) & 0x3f); - tmp[n++] = 0x80 | (cpoint & 0x3f); - } + p = _xs_utf8_enc(tmp, cpoint); - return xs_append_m(str, (char *)tmp, n); + return xs_append_m(str, tmp, p - tmp); } + +char *xs_utf8_dec(const char *str, unsigned int *cpoint) +/* decodes an utf-8 char inside str into cpoint and returns the next position */ +{ + unsigned char *p = (unsigned char *)str; + int c = *p++; + int cb = 0; + + if ((c & 0x80) == 0) { /* 1 byte char */ + *cpoint = c; + } + else + if ((c & 0xe0) == 0xc0) { /* 2 byte char */ + *cpoint = (c & 0x1f) << 6; + cb = 1; + } + else + if ((c & 0xf0) == 0xe0) { /* 3 byte char */ + *cpoint = (c & 0x0f) << 12; + cb = 2; + } + else + if ((c & 0xf8) == 0xf0) { /* 4 byte char */ + *cpoint = (c & 0x07) << 18; + cb = 3; + } + + /* process the continuation bytes */ + while (cb--) { + if ((*p & 0xc0) == 0x80) + *cpoint |= (*p++ & 0x3f) << (cb * 6); + else { + *cpoint = 0xfffd; + break; + } + } + + return (char *)p; +} + + #endif /* XS_IMPLEMENTATION */ #endif /* _XS_UNICODE_H */ diff --git a/xs_version.h b/xs_version.h index 099bc71..7a793d1 100644 --- a/xs_version.h +++ b/xs_version.h @@ -1 +1 @@ -/* 1948fa3c5f0df994170cd38b9144b99734b071e6 */ +/* 3588cbb7859917f1c5965254f8a53c3349c773ea */