From 8586e44de92c827d6a19a7700121c8b21d3687b1 Mon Sep 17 00:00:00 2001 From: default Date: Fri, 23 Aug 2024 17:22:10 +0200 Subject: [PATCH] Some optimizations. --- xs_json.h | 7 +++- xs_unicode.h | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++- xs_version.h | 2 +- 3 files changed, 120 insertions(+), 5 deletions(-) diff --git a/xs_json.h b/xs_json.h index a4112b0..3a91de9 100644 --- a/xs_json.h +++ b/xs_json.h @@ -208,6 +208,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) { int c; xs_val *v = NULL; + int offset; *t = JS_ERROR; @@ -236,6 +237,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) *t = JS_STRING; v = xs_str_new(NULL); + offset = 0; while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) { if (c == '\\') { @@ -274,11 +276,12 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t) break; } - v = xs_utf8_cat(v, cp); + v = xs_utf8_insert(v, cp, &offset); } else { char cc = c; - v = xs_append_m(v, &cc, 1); + v = xs_insert_m(v, offset, &cc, 1); + offset++; } } diff --git a/xs_unicode.h b/xs_unicode.h index 2e9a754..a5a1dcb 100644 --- a/xs_unicode.h +++ b/xs_unicode.h @@ -9,6 +9,7 @@ unsigned int xs_utf8_dec(const char **str); int xs_unicode_width(unsigned int cpoint); int xs_is_surrogate(unsigned int cpoint); + int xs_is_diacritic(unsigned int cpoint); unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); unsigned int xs_surrogate_enc(unsigned int cpoint); unsigned int *_xs_unicode_upper_search(unsigned int cpoint); @@ -22,7 +23,12 @@ int xs_unicode_is_alpha(unsigned int cpoint); #ifdef _XS_H + xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset); xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); + xs_str *xs_utf8_to_upper(const char *str); + xs_str *xs_utf8_to_lower(const char *str); + xs_str *xs_utf8_to_nfd(const char *str); + xs_str *xs_utf8_to_nfc(const char *str); #endif #ifdef XS_IMPLEMENTATION @@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint) } +int xs_is_diacritic(unsigned int cpoint) +{ + return cpoint >= 0x300 && cpoint <= 0x36f; +} + + /** surrogate pairs **/ int xs_is_surrogate(unsigned int cpoint) @@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint) #ifdef _XS_H -xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) +xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset) /* encodes an Unicode codepoint to utf-8 into str */ { char tmp[4]; int c = xs_utf8_enc(tmp, cpoint); - return xs_append_m(str, tmp, c); + str = xs_insert_m(str, *offset, tmp, c); + + *offset += c; + + return str; +} + + +xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) +/* encodes an Unicode codepoint to utf-8 into str */ +{ + int offset = strlen(str); + + return xs_utf8_insert(str, cpoint, &offset); } #endif /* _XS_H */ @@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint) unsigned int xs_unicode_to_lower(unsigned int cpoint) /* returns the cpoint to lowercase */ { + if (cpoint < 0x80) + return tolower(cpoint); + unsigned int *p = _xs_unicode_upper_search(cpoint); return p == NULL ? cpoint : p[1]; @@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint) unsigned int xs_unicode_to_upper(unsigned int cpoint) /* returns the cpoint to uppercase */ { + if (cpoint < 0x80) + return toupper(cpoint); + unsigned int *p = _xs_unicode_lower_search(cpoint); return p == NULL ? cpoint : p[0]; @@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint) } +#ifdef _XS_H + +xs_str *xs_utf8_to_upper(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + cpoint = xs_unicode_to_upper(cpoint); + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_lower(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + cpoint = xs_unicode_to_lower(cpoint); + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_nfd(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + unsigned int base; + unsigned int diac; + + if (xs_unicode_nfd(cpoint, &base, &diac)) { + s = xs_utf8_insert(s, base, &offset); + s = xs_utf8_insert(s, diac, &offset); + } + else + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_nfc(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + unsigned int base = 0; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + if (xs_is_diacritic(cpoint)) { + if (xs_unicode_nfc(base, cpoint, &base)) + continue; + } + + if (base) + s = xs_utf8_insert(s, base, &offset); + + base = cpoint; + } + + if (base) + s = xs_utf8_insert(s, base, &offset); + + return s; +} + +#endif /* _XS_H */ + #endif /* _XS_UNICODE_TBL_H */ #endif /* XS_IMPLEMENTATION */ diff --git a/xs_version.h b/xs_version.h index 4318c7e..ce88558 100644 --- a/xs_version.h +++ b/xs_version.h @@ -1 +1 @@ -/* c6eca9593f9b3d6791cba600e5950f682fdb36cf 2024-08-12T16:08:37+02:00 */ +/* cc9ebd36ae640e4701277327fbba9996143076f6 2024-08-23T17:17:08+02:00 */