mirror of
https://github.com/postgres/postgres.git
synced 2026-01-13 02:07:39 +00:00
ICU: use UTF8-optimized case conversion API
Initializes a UCaseMap object once for use across calls, and uses UTF8-optimized APIs. Author: Andreas Karlsson <andreas@proxel.se> Reviewed-by: zengman <zengman@halodbtech.com> Discussion: https://postgr.es/m/5a010b27-8ed9-4739-86fe-1562b07ba564@proxel.se
This commit is contained in:
@ -52,6 +52,7 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
|
||||
#ifdef USE_ICU
|
||||
|
||||
extern UCollator *pg_ucol_open(const char *loc_str);
|
||||
static UCaseMap *pg_ucasemap_open(const char *loc_str);
|
||||
|
||||
static size_t strlower_icu(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
@ -61,6 +62,14 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t strfold_icu(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static int strncoll_icu(const char *arg1, ssize_t len1,
|
||||
@ -111,9 +120,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
|
||||
const UChar *buff_uchar, int32_t len_uchar);
|
||||
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
|
||||
UErrorCode *status);
|
||||
static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
|
||||
UChar **buff_dest, UChar *buff_source,
|
||||
int32_t len_source);
|
||||
static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
|
||||
size_t destsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
@ -122,6 +131,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode);
|
||||
static int32_t foldcase_options(const char *locale);
|
||||
|
||||
/*
|
||||
* XXX: many of the functions below rely on casts directly from pg_wchar to
|
||||
@ -245,6 +255,28 @@ static const struct ctype_methods ctype_methods_icu = {
|
||||
.wc_tolower = tolower_icu,
|
||||
};
|
||||
|
||||
static const struct ctype_methods ctype_methods_icu_utf8 = {
|
||||
.strlower = strlower_icu_utf8,
|
||||
.strtitle = strtitle_icu_utf8,
|
||||
.strupper = strupper_icu_utf8,
|
||||
.strfold = strfold_icu_utf8,
|
||||
/* uses plain ASCII semantics for historical reasons */
|
||||
.downcase_ident = NULL,
|
||||
.wc_isdigit = wc_isdigit_icu,
|
||||
.wc_isalpha = wc_isalpha_icu,
|
||||
.wc_isalnum = wc_isalnum_icu,
|
||||
.wc_isupper = wc_isupper_icu,
|
||||
.wc_islower = wc_islower_icu,
|
||||
.wc_isgraph = wc_isgraph_icu,
|
||||
.wc_isprint = wc_isprint_icu,
|
||||
.wc_ispunct = wc_ispunct_icu,
|
||||
.wc_isspace = wc_isspace_icu,
|
||||
.wc_isxdigit = wc_isxdigit_icu,
|
||||
.wc_iscased = wc_iscased_icu,
|
||||
.wc_toupper = toupper_icu,
|
||||
.wc_tolower = tolower_icu,
|
||||
};
|
||||
|
||||
/*
|
||||
* ICU still depends on libc for compatibility with certain historical
|
||||
* behavior for single-byte encodings. See downcase_ident_icu().
|
||||
@ -347,10 +379,16 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
|
||||
result->collate_is_c = false;
|
||||
result->ctype_is_c = false;
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
result->icu.ucasemap = pg_ucasemap_open(iculocstr);
|
||||
result->collate = &collate_methods_icu_utf8;
|
||||
result->ctype = &ctype_methods_icu_utf8;
|
||||
}
|
||||
else
|
||||
{
|
||||
result->collate = &collate_methods_icu;
|
||||
result->ctype = &ctype_methods_icu;
|
||||
result->ctype = &ctype_methods_icu;
|
||||
}
|
||||
|
||||
return result;
|
||||
#else
|
||||
@ -366,19 +404,15 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
|
||||
#ifdef USE_ICU
|
||||
|
||||
/*
|
||||
* Wrapper around ucol_open() to handle API differences for older ICU
|
||||
* versions.
|
||||
* Check locale string and fix it if necessary. Returns a new palloc'd string.
|
||||
*
|
||||
* Ensure that no path leaks a UCollator.
|
||||
* In ICU versions 54 and earlier, "und" is not a recognized spelling of the
|
||||
* root locale. If the first component of the locale is "und", replace with
|
||||
* "root" before opening.
|
||||
*/
|
||||
UCollator *
|
||||
pg_ucol_open(const char *loc_str)
|
||||
static char *
|
||||
fix_icu_locale_str(const char *loc_str)
|
||||
{
|
||||
UCollator *collator;
|
||||
UErrorCode status;
|
||||
const char *orig_str = loc_str;
|
||||
char *fixed_str = NULL;
|
||||
|
||||
/*
|
||||
* Must never open default collator, because it depends on the environment
|
||||
* and may change at any time. Should not happen, but check here to catch
|
||||
@ -391,16 +425,11 @@ pg_ucol_open(const char *loc_str)
|
||||
if (loc_str == NULL)
|
||||
elog(ERROR, "opening default collator is not supported");
|
||||
|
||||
/*
|
||||
* In ICU versions 54 and earlier, "und" is not a recognized spelling of
|
||||
* the root locale. If the first component of the locale is "und", replace
|
||||
* with "root" before opening.
|
||||
*/
|
||||
if (U_ICU_VERSION_MAJOR_NUM < 55)
|
||||
{
|
||||
char lang[ULOC_LANG_CAPACITY];
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
|
||||
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
|
||||
{
|
||||
@ -413,28 +442,47 @@ pg_ucol_open(const char *loc_str)
|
||||
if (strcmp(lang, "und") == 0)
|
||||
{
|
||||
const char *remainder = loc_str + strlen("und");
|
||||
char *fixed_str;
|
||||
|
||||
fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
|
||||
strcpy(fixed_str, "root");
|
||||
strcat(fixed_str, remainder);
|
||||
|
||||
loc_str = fixed_str;
|
||||
return fixed_str;
|
||||
}
|
||||
}
|
||||
|
||||
return pstrdup(loc_str);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wrapper around ucol_open() to handle API differences for older ICU
|
||||
* versions.
|
||||
*
|
||||
* Ensure that no path leaks a UCollator.
|
||||
*/
|
||||
UCollator *
|
||||
pg_ucol_open(const char *loc_str)
|
||||
{
|
||||
UCollator *collator;
|
||||
UErrorCode status;
|
||||
char *fixed_str;
|
||||
|
||||
fixed_str = fix_icu_locale_str(loc_str);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
collator = ucol_open(loc_str, &status);
|
||||
collator = ucol_open(fixed_str, &status);
|
||||
if (U_FAILURE(status))
|
||||
ereport(ERROR,
|
||||
/* use original string for error report */
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("could not open collator for locale \"%s\": %s",
|
||||
orig_str, u_errorName(status))));
|
||||
loc_str, u_errorName(status))));
|
||||
|
||||
if (U_ICU_VERSION_MAJOR_NUM < 54)
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
icu_set_collation_attributes(collator, loc_str, &status);
|
||||
icu_set_collation_attributes(collator, fixed_str, &status);
|
||||
|
||||
/*
|
||||
* Pretend the error came from ucol_open(), for consistent error
|
||||
@ -446,16 +494,43 @@ pg_ucol_open(const char *loc_str)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("could not open collator for locale \"%s\": %s",
|
||||
orig_str, u_errorName(status))));
|
||||
loc_str, u_errorName(status))));
|
||||
}
|
||||
}
|
||||
|
||||
if (fixed_str != NULL)
|
||||
pfree(fixed_str);
|
||||
pfree(fixed_str);
|
||||
|
||||
return collator;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wrapper around ucasemap_open() to handle API differences for older ICU
|
||||
* versions.
|
||||
*
|
||||
* Additionally makes sure we get the right options for case folding.
|
||||
*/
|
||||
static UCaseMap *
|
||||
pg_ucasemap_open(const char *loc_str)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCaseMap *casemap;
|
||||
char *fixed_str;
|
||||
|
||||
fixed_str = fix_icu_locale_str(loc_str);
|
||||
|
||||
casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status);
|
||||
if (U_FAILURE(status))
|
||||
/* use original string for error report */
|
||||
ereport(ERROR,
|
||||
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("could not open casemap for locale \"%s\": %s",
|
||||
loc_str, u_errorName(status)));
|
||||
|
||||
pfree(fixed_str);
|
||||
|
||||
return casemap;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a UCollator with the given locale string and rules.
|
||||
*
|
||||
@ -528,80 +603,84 @@ static size_t
|
||||
strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = icu_convert_case(u_strToLower, locale,
|
||||
&buff_conv, buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
|
||||
return result_len;
|
||||
return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
|
||||
}
|
||||
|
||||
static size_t
|
||||
strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
|
||||
&buff_conv, buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
|
||||
return result_len;
|
||||
return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
|
||||
}
|
||||
|
||||
static size_t
|
||||
strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = icu_convert_case(u_strToUpper, locale,
|
||||
&buff_conv, buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
|
||||
return result_len;
|
||||
return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
|
||||
}
|
||||
|
||||
static size_t
|
||||
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
|
||||
}
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = icu_convert_case(u_strFoldCase_default, locale,
|
||||
&buff_conv, buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
static size_t
|
||||
strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t needed;
|
||||
|
||||
return result_len;
|
||||
needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
|
||||
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
|
||||
ereport(ERROR,
|
||||
errmsg("case conversion failed: %s", u_errorName(status)));
|
||||
return needed;
|
||||
}
|
||||
|
||||
static size_t
|
||||
strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t needed;
|
||||
|
||||
needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
|
||||
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
|
||||
ereport(ERROR,
|
||||
errmsg("case conversion failed: %s", u_errorName(status)));
|
||||
return needed;
|
||||
}
|
||||
|
||||
static size_t
|
||||
strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t needed;
|
||||
|
||||
needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
|
||||
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
|
||||
ereport(ERROR,
|
||||
errmsg("case conversion failed: %s", u_errorName(status)));
|
||||
return needed;
|
||||
}
|
||||
|
||||
static size_t
|
||||
strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t needed;
|
||||
|
||||
needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
|
||||
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
|
||||
ereport(ERROR,
|
||||
errmsg("case conversion failed: %s", u_errorName(status)));
|
||||
return needed;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -829,8 +908,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
|
||||
}
|
||||
|
||||
static int32_t
|
||||
icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
|
||||
UChar **buff_dest, UChar *buff_source, int32_t len_source)
|
||||
convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
|
||||
UChar **buff_dest, UChar *buff_source, int32_t len_source)
|
||||
{
|
||||
UErrorCode status;
|
||||
int32_t len_dest;
|
||||
@ -855,6 +934,26 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
|
||||
return len_dest;
|
||||
}
|
||||
|
||||
static int32_t
|
||||
icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
|
||||
const char *src, ssize_t srclen, pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = convert_case_uchar(func, locale, &buff_conv,
|
||||
buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
|
||||
return result_len;
|
||||
}
|
||||
|
||||
static int32_t
|
||||
u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
@ -870,18 +969,25 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode)
|
||||
{
|
||||
return u_strFoldCase(dest, destCapacity, src, srcLength,
|
||||
foldcase_options(locale), pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the correct u_strFoldCase() options for the given locale.
|
||||
*
|
||||
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
|
||||
* folding does not accept a locale. Instead it just supports a single option
|
||||
* relevant to Turkic languages 'az' and 'tr'; check for those languages.
|
||||
*/
|
||||
static int32_t
|
||||
foldcase_options(const char *locale)
|
||||
{
|
||||
uint32 options = U_FOLD_CASE_DEFAULT;
|
||||
char lang[3];
|
||||
UErrorCode status;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
/*
|
||||
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
|
||||
* folding does not accept a locale. Instead it just supports a single
|
||||
* option relevant to Turkic languages 'az' and 'tr'; check for those
|
||||
* languages to enable the option.
|
||||
*/
|
||||
status = U_ZERO_ERROR;
|
||||
uloc_getLanguage(locale, lang, 3, &status);
|
||||
if (U_SUCCESS(status))
|
||||
{
|
||||
@ -893,8 +999,7 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
|
||||
options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
|
||||
}
|
||||
|
||||
return u_strFoldCase(dest, destCapacity, src, srcLength,
|
||||
options, pErrorCode);
|
||||
return options;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
#define BUFSZ 256
|
||||
|
||||
#ifdef USE_ICU
|
||||
static UCaseMap * casemap = NULL;
|
||||
static UCaseMap *casemap = NULL;
|
||||
#endif
|
||||
|
||||
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#undef U_SHOW_CPLUSPLUS_HEADER_API
|
||||
#define U_SHOW_CPLUSPLUS_HEADER_API 0
|
||||
#include <unicode/ucol.h>
|
||||
#include <unicode/ucasemap.h>
|
||||
#endif
|
||||
|
||||
/* use for libc locale names */
|
||||
@ -168,6 +169,7 @@ struct pg_locale_struct
|
||||
const char *locale;
|
||||
UCollator *ucol;
|
||||
locale_t lt;
|
||||
UCaseMap *ucasemap;
|
||||
} icu;
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -3190,6 +3190,7 @@ TypeName
|
||||
TzAbbrevCache
|
||||
U32
|
||||
U8
|
||||
UCaseMap
|
||||
UChar
|
||||
UCharIterator
|
||||
UColAttributeValue
|
||||
|
||||
Reference in New Issue
Block a user