Some optimizations.

This commit is contained in:
default 2024-08-23 17:22:10 +02:00
parent d2daba7b9c
commit 8586e44de9
3 changed files with 120 additions and 5 deletions

View File

@ -208,6 +208,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
{ {
int c; int c;
xs_val *v = NULL; xs_val *v = NULL;
int offset;
*t = JS_ERROR; *t = JS_ERROR;
@ -236,6 +237,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
*t = JS_STRING; *t = JS_STRING;
v = xs_str_new(NULL); v = xs_str_new(NULL);
offset = 0;
while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) { while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) {
if (c == '\\') { if (c == '\\') {
@ -274,11 +276,12 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
break; break;
} }
v = xs_utf8_cat(v, cp); v = xs_utf8_insert(v, cp, &offset);
} }
else { else {
char cc = c; char cc = c;
v = xs_append_m(v, &cc, 1); v = xs_insert_m(v, offset, &cc, 1);
offset++;
} }
} }

View File

@ -9,6 +9,7 @@
unsigned int xs_utf8_dec(const char **str); unsigned int xs_utf8_dec(const char **str);
int xs_unicode_width(unsigned int cpoint); int xs_unicode_width(unsigned int cpoint);
int xs_is_surrogate(unsigned int cpoint); int xs_is_surrogate(unsigned int cpoint);
int xs_is_diacritic(unsigned int cpoint);
unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
unsigned int xs_surrogate_enc(unsigned int cpoint); unsigned int xs_surrogate_enc(unsigned int cpoint);
unsigned int *_xs_unicode_upper_search(unsigned int cpoint); unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
@ -22,7 +23,12 @@
int xs_unicode_is_alpha(unsigned int cpoint); int xs_unicode_is_alpha(unsigned int cpoint);
#ifdef _XS_H #ifdef _XS_H
xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
xs_str *xs_utf8_to_upper(const char *str);
xs_str *xs_utf8_to_lower(const char *str);
xs_str *xs_utf8_to_nfd(const char *str);
xs_str *xs_utf8_to_nfc(const char *str);
#endif #endif
#ifdef XS_IMPLEMENTATION #ifdef XS_IMPLEMENTATION
@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint)
} }
int xs_is_diacritic(unsigned int cpoint)
/* returns 1 if cpoint lies in the range 0x300-0x36f (the Unicode
   Combining Diacritical Marks block), 0 otherwise */
{
    /* below the first codepoint of the range */
    if (cpoint < 0x300)
        return 0;

    /* above the last codepoint of the range */
    if (cpoint > 0x36f)
        return 0;

    return 1;
}
/** surrogate pairs **/ /** surrogate pairs **/
int xs_is_surrogate(unsigned int cpoint) int xs_is_surrogate(unsigned int cpoint)
@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint)
#ifdef _XS_H #ifdef _XS_H
xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
/* encodes an Unicode codepoint to utf-8 into str */ /* encodes an Unicode codepoint to utf-8 into str */
{ {
char tmp[4]; char tmp[4];
int c = xs_utf8_enc(tmp, cpoint); int c = xs_utf8_enc(tmp, cpoint);
return xs_append_m(str, tmp, c); str = xs_insert_m(str, *offset, tmp, c);
*offset += c;
return str;
}
xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
/* encodes a Unicode codepoint to utf-8 and appends it to the end of str;
   returns the string pointer handed back by xs_utf8_insert() */
{
/* the insertion point is the current length of str as measured by strlen(),
   i.e. the new bytes are placed right after the existing content */
int offset = strlen(str);
return xs_utf8_insert(str, cpoint, &offset);
} }
#endif /* _XS_H */ #endif /* _XS_H */
@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
unsigned int xs_unicode_to_lower(unsigned int cpoint) unsigned int xs_unicode_to_lower(unsigned int cpoint)
/* returns the cpoint to lowercase */ /* returns the cpoint to lowercase */
{ {
if (cpoint < 0x80)
return tolower(cpoint);
unsigned int *p = _xs_unicode_upper_search(cpoint); unsigned int *p = _xs_unicode_upper_search(cpoint);
return p == NULL ? cpoint : p[1]; return p == NULL ? cpoint : p[1];
@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint)
unsigned int xs_unicode_to_upper(unsigned int cpoint) unsigned int xs_unicode_to_upper(unsigned int cpoint)
/* returns the cpoint to uppercase */ /* returns the cpoint to uppercase */
{ {
if (cpoint < 0x80)
return toupper(cpoint);
unsigned int *p = _xs_unicode_lower_search(cpoint); unsigned int *p = _xs_unicode_lower_search(cpoint);
return p == NULL ? cpoint : p[0]; return p == NULL ? cpoint : p[0];
@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint)
} }
#ifdef _XS_H
xs_str *xs_utf8_to_upper(const char *str)
/* returns a new string with every codepoint of the utf-8 string str
   passed through xs_unicode_to_upper() */
{
    int pos = 0;
    xs_str *out = xs_str_new(NULL);

    for (;;) {
        unsigned int cp = xs_utf8_dec(&str);

        /* a zero codepoint from the decoder ends the loop */
        if (cp == 0)
            break;

        /* pos is advanced by xs_utf8_insert(), so each codepoint
           lands right after the previous one */
        out = xs_utf8_insert(out, xs_unicode_to_upper(cp), &pos);
    }

    return out;
}
xs_str *xs_utf8_to_lower(const char *str)
/* returns a new string with every codepoint of the utf-8 string str
   passed through xs_unicode_to_lower() */
{
    xs_str *out = xs_str_new(NULL);
    int pos = 0;
    unsigned int cp = xs_utf8_dec(&str);

    /* keep decoding until the decoder yields a zero codepoint */
    while (cp != 0) {
        unsigned int lower = xs_unicode_to_lower(cp);

        /* pos is advanced by xs_utf8_insert() past the bytes just written */
        out = xs_utf8_insert(out, lower, &pos);

        cp = xs_utf8_dec(&str);
    }

    return out;
}
xs_str *xs_utf8_to_nfd(const char *str)
/* returns a new string in which every codepoint of str that
   xs_unicode_nfd() reports as decomposable is replaced by the base
   codepoint followed by the diacritic it filled in; all other
   codepoints are copied unchanged */
{
    xs_str *out = xs_str_new(NULL);
    int pos = 0;
    unsigned int cp;

    while ((cp = xs_utf8_dec(&str)) != 0) {
        unsigned int base;
        unsigned int diac;

        if (!xs_unicode_nfd(cp, &base, &diac)) {
            /* no decomposition available: copy the codepoint as is */
            out = xs_utf8_insert(out, cp, &pos);
            continue;
        }

        /* decomposed: write the base first, then the diacritic */
        out = xs_utf8_insert(out, base, &pos);
        out = xs_utf8_insert(out, diac, &pos);
    }

    return out;
}
xs_str *xs_utf8_to_nfc(const char *str)
/* returns a new string in which a codepoint followed by diacritics
   (see xs_is_diacritic()) is merged with them whenever
   xs_unicode_nfc() succeeds in combining the pair */
{
    xs_str *out = xs_str_new(NULL);
    unsigned int pending = 0;   /* codepoint held back, not yet written */
    unsigned int cp;
    int pos = 0;

    while ((cp = xs_utf8_dec(&str)) != 0) {
        /* a diacritic that combines with the pending codepoint
           replaces it and nothing is written yet */
        if (xs_is_diacritic(cp) && xs_unicode_nfc(pending, cp, &pending))
            continue;

        /* otherwise flush what was held back (if anything) and
           hold back the current codepoint instead */
        if (pending != 0)
            out = xs_utf8_insert(out, pending, &pos);

        pending = cp;
    }

    /* flush the last held-back codepoint */
    if (pending != 0)
        out = xs_utf8_insert(out, pending, &pos);

    return out;
}
#endif /* _XS_H */
#endif /* _XS_UNICODE_TBL_H */ #endif /* _XS_UNICODE_TBL_H */
#endif /* XS_IMPLEMENTATION */ #endif /* XS_IMPLEMENTATION */

View File

@ -1 +1 @@
/* c6eca9593f9b3d6791cba600e5950f682fdb36cf 2024-08-12T16:08:37+02:00 */ /* cc9ebd36ae640e4701277327fbba9996143076f6 2024-08-23T17:17:08+02:00 */