/* Snapshot of commit 06aaf91d, created 2023-08-30. */
/*
 * Encoding names and routines for working with them. Everything
 * in this file is shared between FE and BE.
 *
 * src/backend/utils/mb/encnames.c
 */
#ifdef FRONTEND
#include "postgres_fe.h"
#ifndef Assert
#define Assert(condition)
#endif
#else
#include "postgres.h"
#include "knl/knl_variable.h"
#include "utils/builtins.h"
#endif

#include <ctype.h>
#include <unistd.h>

#include "mb/pg_wchar.h"

/* ----------
 * All encoding names, sorted:		 *** A L P H A B E T I C ***
 *
 * pg_char_to_encname_struct() binary-searches this table using strcmp(),
 * so entries must stay in ascending strcmp() order of their name.
 *
 * All names must be without irrelevant chars: search routines use
 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
 * are always converted to 'iso88591'. All must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
 *
 * The final {NULL, 0} entry is an end marker; it is not counted in
 * pg_encname_tbl_sz and is never visited by the binary search.
 *
 * Karel Zak, Aug 2001
 * ----------
 */
pg_encname pg_encname_tbl[] = {
    {"abc", PG_WIN1258},             /* alias for WIN1258 */
    {"alt", PG_WIN866},              /* IBM866 */
    {"big5", PG_BIG5},               /* Big5; Chinese for Taiwan multibyte set */
    {"binary", PG_SQL_ASCII},        /* Binary; alias for sqlascii */
    {"euccn", PG_EUC_CN},            /* EUC-CN; Extended Unix Code for simplified Chinese */
    /* EUC-JIS-2004; Extended UNIX Code fixed
     * Width for Japanese, standard JIS X 0213 */
    {"eucjis2004", PG_EUC_JIS_2004},
    {"eucjp", PG_EUC_JP},            /* EUC-JP; Extended UNIX Code fixed Width for Japanese, standard OSF */
    {"euckr", PG_EUC_KR},            /* EUC-KR; Extended Unix Code for Korean, KS X 1001 standard */
    {"euctw", PG_EUC_TW},            /* EUC-TW; Extended Unix Code for traditional Chinese */
    {"gb18030", PG_GB18030},         /* GB18030;GB18030 */
    {"gb180302022", PG_GB18030_2022},  /* GB18030-2022;version 2022 for GB18030 */
    {"gbk", PG_GBK},                 /* GBK; Chinese Windows CodePage 936 simplified Chinese */
    {"iso88591", PG_LATIN1},         /* ISO-8859-1; RFC1345,KXS2 */
    {"iso885910", PG_LATIN6},        /* ISO-8859-10; RFC1345,KXS2 */
    {"iso885913", PG_LATIN7},        /* ISO-8859-13; RFC1345,KXS2 */
    {"iso885914", PG_LATIN8},        /* ISO-8859-14; RFC1345,KXS2 */
    {"iso885915", PG_LATIN9},        /* ISO-8859-15; RFC1345,KXS2 */
    {"iso885916", PG_LATIN10},       /* ISO-8859-16; RFC1345,KXS2 */
    {"iso88592", PG_LATIN2},         /* ISO-8859-2; RFC1345,KXS2 */
    {"iso88593", PG_LATIN3},         /* ISO-8859-3; RFC1345,KXS2 */
    {"iso88594", PG_LATIN4},         /* ISO-8859-4; RFC1345,KXS2 */
    {"iso88595", PG_ISO_8859_5},     /* ISO-8859-5; RFC1345,KXS2 */
    {"iso88596", PG_ISO_8859_6},     /* ISO-8859-6; RFC1345,KXS2 */
    {"iso88597", PG_ISO_8859_7},     /* ISO-8859-7; RFC1345,KXS2 */
    {"iso88598", PG_ISO_8859_8},     /* ISO-8859-8; RFC1345,KXS2 */
    {"iso88599", PG_LATIN5},         /* ISO-8859-9; RFC1345,KXS2 */
    {"johab", PG_JOHAB},             /* JOHAB; Korean (Hangul) */
    {"koi8", PG_KOI8R},              /* _dirty_ alias for KOI8-R (backward compatibility) */
    {"koi8r", PG_KOI8R},             /* KOI8-R; RFC1489 */
    {"koi8u", PG_KOI8U},             /* KOI8-U; RFC2319 */
    {"latin1", PG_LATIN1},           /* alias for ISO-8859-1 */
    {"latin10", PG_LATIN10},         /* alias for ISO-8859-16 */
    {"latin2", PG_LATIN2},           /* alias for ISO-8859-2 */
    {"latin3", PG_LATIN3},           /* alias for ISO-8859-3 */
    {"latin4", PG_LATIN4},           /* alias for ISO-8859-4 */
    {"latin5", PG_LATIN5},           /* alias for ISO-8859-9 */
    {"latin6", PG_LATIN6},           /* alias for ISO-8859-10 */
    {"latin7", PG_LATIN7},           /* alias for ISO-8859-13 */
    {"latin8", PG_LATIN8},           /* alias for ISO-8859-14 */
    {"latin9", PG_LATIN9},           /* alias for ISO-8859-15 */
    {"mskanji", PG_SJIS},            /* alias for Shift_JIS */
    {"muleinternal", PG_MULE_INTERNAL}, /* MULE_INTERNAL; Mule internal code */
    {"shiftjis", PG_SJIS}, /* Shift_JIS; JIS X 0202-1991 */

    {"shiftjis2004", PG_SHIFT_JIS_2004}, /* SHIFT-JIS-2004; Shift JIS for Japanese, standard JIS X 0213 */
    {"sjis", PG_SJIS},                   /* alias for Shift_JIS */
    {"sqlascii", PG_SQL_ASCII},          /* SQL_ASCII */
    {"tcvn", PG_WIN1258},        /* alias for WIN1258 */
    {"tcvn5712", PG_WIN1258},    /* alias for WIN1258 */
    {"uhc", PG_UHC},             /* UHC; Korean Windows CodePage 949 */
    {"unicode", PG_UTF8},        /* alias for UTF8 */
    {"utf8", PG_UTF8},           /* alias for UTF8 */
    {"utf8mb4", PG_UTF8},        /* alias for UTF8 */
    {"vscii", PG_WIN1258},       /* alias for WIN1258 */
    {"win", PG_WIN1251},         /* _dirty_ alias for windows-1251 (backward compatibility) */
    {"win1250", PG_WIN1250},     /* alias for Windows-1250 */
    {"win1251", PG_WIN1251},     /* alias for Windows-1251 */
    {"win1252", PG_WIN1252},     /* alias for Windows-1252 */
    {"win1253", PG_WIN1253},     /* alias for Windows-1253 */
    {"win1254", PG_WIN1254},     /* alias for Windows-1254 */
    {"win1255", PG_WIN1255},     /* alias for Windows-1255 */
    {"win1256", PG_WIN1256},     /* alias for Windows-1256 */
    {"win1257", PG_WIN1257},     /* alias for Windows-1257 */
    {"win1258", PG_WIN1258},     /* alias for Windows-1258 */
    {"win866", PG_WIN866},       /* IBM866 */
    {"win874", PG_WIN874},       /* alias for Windows-874 */
    {"win932", PG_SJIS},         /* alias for Shift_JIS */
    {"win936", PG_GBK},          /* alias for GBK */
    {"win949", PG_UHC},          /* alias for UHC */
    {"win950", PG_BIG5},         /* alias for BIG5 */
    {"windows1250", PG_WIN1250}, /* Windows-1250; Microsoft */
    {"windows1251", PG_WIN1251}, /* Windows-1251; Microsoft */
    {"windows1252", PG_WIN1252}, /* Windows-1252; Microsoft */
    {"windows1253", PG_WIN1253}, /* Windows-1253; Microsoft */
    {"windows1254", PG_WIN1254}, /* Windows-1254; Microsoft */
    {"windows1255", PG_WIN1255}, /* Windows-1255; Microsoft */
    {"windows1256", PG_WIN1256}, /* Windows-1256; Microsoft */
    {"windows1257", PG_WIN1257}, /* Windows-1257; Microsoft */
    {"windows1258", PG_WIN1258}, /* Windows-1258; Microsoft */
    {"windows866", PG_WIN866},   /* IBM866 */
    {"windows874", PG_WIN874},   /* Windows-874; Microsoft */
    {"windows932", PG_SJIS},     /* alias for Shift_JIS */
    {"windows936", PG_GBK},      /* alias for GBK */
    {"windows949", PG_UHC},      /* alias for UHC */
    {"windows950", PG_BIG5},     /* alias for BIG5 */
    {NULL, (pg_enc)0}            /* last */
};

/* Number of searchable entries in pg_encname_tbl; the trailing NULL entry is excluded. */
unsigned int pg_encname_tbl_sz = (sizeof pg_encname_tbl / sizeof *pg_encname_tbl) - 1;

/*
 * Character sets supported in B format, stored in the order that
 * FAST_GET_CHARSET_BY_COLL relies on to find a charset quickly from a
 * collation oid.
 *
 * NOTE(review): FAST_GET_CHARSET_BY_COLL is not defined in this file, so the
 * oid ranges quoted on the entries below cannot be checked here. The GBK
 * range spans 256 oids, while the GB18030 range as written (1768 - 2024)
 * spans 257 -- confirm both against the macro definition.
 */
pg_enc pg_enc_coll_map_b[] = {
    PG_SQL_ASCII,     /* SQL/ASCII */
    PG_GBK,           /* GBK (Windows-936), collation oid range(1280 - 1535) */
    PG_UTF8,          /* Unicode UTF8 */
    PG_GB18030        /* GB18030, collation oid range(1768 - 2024) */
};

/* ----------
 * These are "official" encoding names.
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h):
 * pg_encoding_to_char() indexes this table directly by encoding id.
 *
 * DEF_ENC2NAME(name, codepage) expands to one table initializer whose name
 * string is the stringified `name` and whose encoding id is PG_<name>.
 * The codepage argument is only emitted in WIN32 builds; on other platforms
 * it is dropped from the initializer.
 * ----------
 */
#ifndef WIN32
#define DEF_ENC2NAME(name, codepage) \
    {                                \
#name, PG_##name             \
    }
#else
#define DEF_ENC2NAME(name, codepage) \
    {                                \
#name, PG_##name, codepage   \
    }
#endif
/* One entry per encoding; a codepage of 0 is used where no value is given. */
pg_enc2name pg_enc2name_tbl[] = {DEF_ENC2NAME(SQL_ASCII, 0),
    DEF_ENC2NAME(EUC_JP, 20932),
    DEF_ENC2NAME(EUC_CN, 20936),
    DEF_ENC2NAME(EUC_KR, 51949),
    DEF_ENC2NAME(EUC_TW, 0),
    DEF_ENC2NAME(EUC_JIS_2004, 20932),
    DEF_ENC2NAME(GBK, 936),
    DEF_ENC2NAME(UTF8, 65001),
    DEF_ENC2NAME(MULE_INTERNAL, 0),
    DEF_ENC2NAME(LATIN1, 28591),
    DEF_ENC2NAME(LATIN2, 28592),
    DEF_ENC2NAME(LATIN3, 28593),
    DEF_ENC2NAME(LATIN4, 28594),
    DEF_ENC2NAME(LATIN5, 28599),
    DEF_ENC2NAME(LATIN6, 0),
    DEF_ENC2NAME(LATIN7, 0),
    DEF_ENC2NAME(LATIN8, 0),
    DEF_ENC2NAME(LATIN9, 28605),
    DEF_ENC2NAME(LATIN10, 0),
    DEF_ENC2NAME(WIN1256, 1256),
    DEF_ENC2NAME(WIN1258, 1258),
    DEF_ENC2NAME(WIN866, 866),
    DEF_ENC2NAME(WIN874, 874),
    DEF_ENC2NAME(KOI8R, 20866),
    DEF_ENC2NAME(WIN1251, 1251),
    DEF_ENC2NAME(WIN1252, 1252),
    DEF_ENC2NAME(ISO_8859_5, 28595),
    DEF_ENC2NAME(ISO_8859_6, 28596),
    DEF_ENC2NAME(ISO_8859_7, 28597),
    DEF_ENC2NAME(ISO_8859_8, 28598),
    DEF_ENC2NAME(WIN1250, 1250),
    DEF_ENC2NAME(WIN1253, 1253),
    DEF_ENC2NAME(WIN1254, 1254),
    DEF_ENC2NAME(WIN1255, 1255),
    DEF_ENC2NAME(WIN1257, 1257),
    DEF_ENC2NAME(KOI8U, 21866),
    DEF_ENC2NAME(GB18030, 54936),
    DEF_ENC2NAME(GB18030_2022, 54936),
    DEF_ENC2NAME(SJIS, 932),
    DEF_ENC2NAME(BIG5, 950),
    DEF_ENC2NAME(UHC, 0),
    DEF_ENC2NAME(JOHAB, 0),
    DEF_ENC2NAME(SHIFT_JIS_2004, 932)};

/* ----------
 * These are encoding names for gettext.
 *
 * Each entry pairs an encoding id with the name string to use for it.
 * The last entry has a NULL name and marks the end of the list
 * (presumably what consumers scan for; none of them are in this file).
 * Note that PG_EUC_JIS_2004 shares the "EUC-JP" name with PG_EUC_JP.
 * ----------
 */
pg_enc2gettext pg_enc2gettext_tbl[] = {{PG_UTF8, "UTF-8"},
    {PG_LATIN1, "LATIN1"},
    {PG_LATIN2, "LATIN2"},
    {PG_LATIN3, "LATIN3"},
    {PG_LATIN4, "LATIN4"},
    {PG_ISO_8859_5, "ISO-8859-5"},
    {PG_ISO_8859_6, "ISO_8859-6"},
    {PG_ISO_8859_7, "ISO-8859-7"},
    {PG_ISO_8859_8, "ISO-8859-8"},
    {PG_LATIN5, "LATIN5"},
    {PG_LATIN6, "LATIN6"},
    {PG_LATIN7, "LATIN7"},
    {PG_LATIN8, "LATIN8"},
    {PG_LATIN9, "LATIN-9"},
    {PG_LATIN10, "LATIN10"},
    {PG_KOI8R, "KOI8-R"},
    {PG_KOI8U, "KOI8-U"},
    {PG_WIN1250, "CP1250"},
    {PG_WIN1251, "CP1251"},
    {PG_WIN1252, "CP1252"},
    {PG_WIN1253, "CP1253"},
    {PG_WIN1254, "CP1254"},
    {PG_WIN1255, "CP1255"},
    {PG_WIN1256, "CP1256"},
    {PG_WIN1257, "CP1257"},
    {PG_WIN1258, "CP1258"},
    {PG_WIN866, "CP866"},
    {PG_WIN874, "CP874"},
    {PG_EUC_CN, "EUC-CN"},
    {PG_EUC_JP, "EUC-JP"},
    {PG_EUC_KR, "EUC-KR"},
    {PG_EUC_TW, "EUC-TW"},
    {PG_EUC_JIS_2004, "EUC-JP"},
    {(pg_enc)0, NULL}}; /* end marker */

/* ----------
 * Encoding checks: each returns the encoding id, or -1 on error
 * ----------
 */

/*
 * Resolve an encoding name and accept it only if the resulting id passes
 * PG_VALID_FE_ENCODING. Returns the encoding id, or -1 when the name is
 * unknown or the id is rejected.
 */
int pg_valid_client_encoding(const char* name)
{
    int enc = pg_char_to_encoding(name);

    /* The validity macro is only consulted for ids that were actually found. */
    if (enc < 0 || !PG_VALID_FE_ENCODING(enc)) {
        return -1;
    }
    return enc;
}

/*
 * Resolve an encoding name and accept it only if the resulting id passes
 * PG_VALID_BE_ENCODING. Returns the encoding id, or -1 when the name is
 * unknown or the id is rejected.
 */
int pg_valid_server_encoding(const char* name)
{
    int enc = pg_char_to_encoding(name);

    return (enc >= 0 && PG_VALID_BE_ENCODING(enc)) ? enc : -1;
}

/*
 * Report whether an encoding id passes PG_VALID_BE_ENCODING. The macro's
 * result is returned converted to int.
 */
int pg_valid_server_encoding_id(int encoding)
{
    int is_valid = (int)PG_VALID_BE_ENCODING(encoding);

    return is_valid;
}

/* ----------
 * Normalize an encoding name for table lookup.
 *
 * Copies only the isalnum() characters of key into newkey, folding ASCII
 * 'A'-'Z' to lower case, and NUL-terminates the result. The caller must
 * supply a newkey buffer at least strlen(key) + 1 bytes long; no bounds
 * are checked here. Returns newkey.
 * ----------
 */
static char* clean_encoding_name(const char* key, char* newkey)
{
    char* out = newkey;

    while (*key != '\0') {
        char ch = *key++;

        /* Punctuation, whitespace and other separators are dropped. */
        if (!isalnum((unsigned char)ch)) {
            continue;
        }
        *out++ = (ch >= 'A' && ch <= 'Z') ? (char)(ch + 'a' - 'A') : ch;
    }
    *out = '\0';
    return newkey;
}

/* ----------
 * Search encoding by encoding name
 *
 * Normalizes name with clean_encoding_name() and binary-searches
 * pg_encname_tbl for it. Returns the matching table entry, or NULL when
 * name is NULL, empty, or not found.
 *
 * A name of NAMEDATALEN or more bytes is rejected before normalization:
 * frontend builds print a message to stderr and return NULL; backend builds
 * raise ERROR through ereport(), or return NULL when ENABLE_UT is defined.
 * ----------
 */
pg_encname* pg_char_to_encname_struct(const char* name)
{
    char cleaned[NAMEDATALEN];
    const char* needle = NULL;
    int lo = 0;
    int hi = (int)pg_encname_tbl_sz - 1;

    if (name == NULL || *name == '\0') {
        return NULL;
    }
    if (strlen(name) >= NAMEDATALEN) {
#ifdef FRONTEND
        fprintf(stderr, "encoding name too long\n");
        return NULL;
#else
#ifndef ENABLE_UT
        ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("encoding name too long")));
#else
        return NULL;
#endif
#endif
    }
    needle = clean_encoding_name(name, cleaned);

    while (lo <= hi) {
        int mid = lo + ((hi - lo) >> 1);
        pg_encname* entry = &pg_encname_tbl[mid];
        /* Cheap first-character comparison; strcmp() only runs when it ties. */
        int cmp = needle[0] - entry->name[0];

        if (cmp == 0) {
            cmp = strcmp(needle, entry->name);
        }
        if (cmp == 0) {
            return entry;
        }
        if (cmp < 0) {
            hi = mid - 1;
        } else {
            lo = mid + 1;
        }
    }
    return NULL;
}

/*
 * Map an encoding name to its encoding id.
 * Returns -1 when name is NULL or no table entry matches it.
 */
int pg_char_to_encoding(const char* name)
{
    if (name != NULL) {
        pg_encname* entry = pg_char_to_encname_struct(name);

        if (entry != NULL) {
            return entry->encoding;
        }
    }
    return -1;
}

#ifndef FRONTEND
/*
 * Function-call wrapper around pg_char_to_encoding(): takes a Name argument
 * and returns the encoding id as an int32 (-1 if the name is not recognized).
 */
Datum PG_char_to_encoding(PG_FUNCTION_ARGS)
{
    Name encoding_name = PG_GETARG_NAME(0);
    int encoding_id = pg_char_to_encoding(NameStr(*encoding_name));

    PG_RETURN_INT32(encoding_id);
}
#endif

/*
 * Map an encoding id to its official name from pg_enc2name_tbl.
 * Returns an empty string when the id fails PG_VALID_ENCODING.
 */
const char* pg_encoding_to_char(int encoding)
{
    pg_enc2name* entry = NULL;

    if (!PG_VALID_ENCODING(encoding)) {
        return "";
    }

    /* The table is indexed by encoding id; the Assert checks it is in step. */
    entry = &pg_enc2name_tbl[encoding];
    Assert(encoding == entry->encoding);
    return entry->name;
}

#ifndef FRONTEND
/*
 * Function-call wrapper around pg_encoding_to_char(): takes an int32
 * encoding id and returns its name, converted through namein.
 */
Datum PG_encoding_to_char(PG_FUNCTION_ARGS)
{
    const char* encoding_name = pg_encoding_to_char(PG_GETARG_INT32(0));
    Datum name_cstring = CStringGetDatum(encoding_name);

    return DirectFunctionCall1(namein, name_cstring);
}

#endif