Make regex "max_chr" depend on encoding, not provider.
author Jeff Davis <[email protected]>
Mon, 1 Dec 2025 19:06:17 +0000 (11:06 -0800)
committer Jeff Davis <[email protected]>
Mon, 1 Dec 2025 19:06:17 +0000 (11:06 -0800)
The regex mechanism scans through the first "max_chr" character values
to cache character property ranges (isalpha, etc.). For single-byte
encodings, there's no sense in scanning beyond UCHAR_MAX; but for
UTF-8 it makes sense to cache higher code point values (though not all
of them; only up to MAX_SIMPLE_CHR).

Prior to 5a38104b36, the logic about how many character values to scan
was based on the pg_regex_strategy, which was dependent on the
provider. Commit 5a38104b36 preserved that logic exactly, allowing
different providers to define the "max_chr".

Now, change it to depend only on the encoding and on whether
ctype_is_c is true. For this specific calculation, distinguishing
between providers creates more complexity than it's worth.

Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d[email protected]
Reviewed-by: Chao Li <[email protected]>
src/backend/regex/regc_pg_locale.c
src/backend/utils/adt/pg_locale_libc.c
src/include/utils/pg_locale.h

index 4698f110a0c21a98e293e7aa367396af4a53e5b8..bb0e3f1d139201ca06953e9140b6febc4a7a1ec7 100644 (file)
@@ -320,16 +320,18 @@ regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
                max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
        }
+       else if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+       }
        else
        {
-               if (pg_regex_locale->ctype->max_chr != 0 &&
-                       pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
-               {
-                       max_chr = pg_regex_locale->ctype->max_chr;
-                       pcc->cv.cclasscode = -1;
-               }
-               else
-                       max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#if MAX_SIMPLE_CHR >= UCHAR_MAX
+               max_chr = (pg_wchar) UCHAR_MAX;
+               pcc->cv.cclasscode = -1;
+#else
+               max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
        }
 
        /*
index e2beee44335a712175dac600a602ba38b5c0f180..6ad3f93b54364158fc5728af58b67b26ae72d3a8 100644 (file)
@@ -342,7 +342,6 @@ static const struct ctype_methods ctype_methods_libc_sb = {
        .char_tolower = char_tolower_libc,
        .wc_toupper = toupper_libc_sb,
        .wc_tolower = tolower_libc_sb,
-       .max_chr = UCHAR_MAX,
 };
 
 /*
@@ -369,7 +368,6 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
        .char_tolower = char_tolower_libc,
        .wc_toupper = toupper_libc_sb,
        .wc_tolower = tolower_libc_sb,
-       .max_chr = UCHAR_MAX,
 };
 
 static const struct ctype_methods ctype_methods_libc_utf8 = {
index 54193a17a90fed4c3df591f7266c29ed0a9e6895..42e21e7fb8aedc47b86a5c785c62d43d30a6d336 100644 (file)
@@ -134,12 +134,6 @@ struct ctype_methods
         * pg_strlower().
         */
        char            (*char_tolower) (unsigned char ch, pg_locale_t locale);
-
-       /*
-        * For regex and pattern matching efficiency, the maximum char value
-        * supported by the above methods. If zero, limit is set by regex code.
-        */
-       pg_wchar        max_chr;
 };
 
 /*