/* -------------------------------------------------------------------------
 *
 * tsvector_parser.c
 *    Parser for tsvector
 *
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/adt/tsvector_parser.c
 *
 * -------------------------------------------------------------------------
 */
#include "postgres.h"
#include "knl/knl_variable.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
/*
 * Private state of tsvector parser. Note that tsquery also uses this code to
 * parse its input, hence the boolean flags. The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 */
struct TSVectorParseStateData {
    char *prsbuf;    /* current read position in the input string */
    char *bufstart;  /* start of the input string; quoted in error messages */
    char *word;      /* buffer in which the current token is assembled */
    int len;         /* allocated size of 'word', in bytes */
    int eml;         /* pg_database_encoding_max_length() captured at init */
    bool oprisdelim; /* treat ! & | ( ) as delimiters in addition to whitespace? */
    bool is_tsquery; /* say "tsquery" rather than "tsvector" in syntax errors? */
};
/*
 * Initializes parser for the input string. If oprisdelim is set, the
 * following characters are treated as delimiters in addition to whitespace:
 * ! | & ( )
 */
TSVectorParseState init_tsvector_parser(char* input, bool oprisdelim, bool is_tsquery)
{
TSVectorParseState state;
state = (TSVectorParseState)palloc(sizeof(struct TSVectorParseStateData));
state->prsbuf = input;
state->bufstart = input;
state->len = 32;
state->word = (char*)palloc(state->len);
state->eml = pg_database_encoding_max_length();
state->oprisdelim = oprisdelim;
state->is_tsquery = is_tsquery;
return state;
}
/*
 * Reinitializes parser to parse 'input', instead of previous input.
 */
void reset_tsvector_parser(TSVectorParseState state, char* input)
{
    /*
     * Only the read cursor moves.  bufstart is not touched, so syntax errors
     * raised afterwards still quote the string that was handed to
     * init_tsvector_parser; the token buffer is kept as well.
     */
    state->prsbuf = input;
}
/*
 * Shuts down a tsvector parser.
 */
void close_tsvector_parser(TSVectorParseState state)
{
    /* Release the token buffer first, while the state that points at it is still valid. */
    pfree_ext(state->word);

    /* Then the state block itself.  The input string is not freed here. */
    pfree_ext(state);
}
/*
 * Ensure the token buffer has room for one more character at curpos.
 *
 * Expects 'state' and 'curpos' in the enclosing scope.  When fewer than
 * state->eml + 1 bytes remain after curpos, the buffer is doubled and curpos
 * is re-pointed into the (possibly moved) buffer at the same offset.
 */
#define RESIZEPRSBUF                                                        \
    do {                                                                    \
        int used_bytes = curpos - state->word;                              \
        if (state->len <= used_bytes + state->eml) {                        \
            state->len = state->len * 2;                                    \
            state->word = (char*)repalloc(state->word, state->len);         \
            curpos = state->word + used_bytes;                              \
        }                                                                   \
    } while (0)
/*
 * True when x points at a one-byte character that is one of: ! & | ( )
 * Note that x is evaluated several times.
 */
#define ISOPERATOR(x)                                            \
    (pg_mblen(x) == 1 &&                                         \
        (*(x) == '!' ||                                          \
         *(x) == '&' ||                                          \
         *(x) == '|' ||                                          \
         *(x) == '(' ||                                          \
         *(x) == ')'))
/*
 * Hand the token that has just been completed back to the caller of
 * gettoken_tsvector and return true from it.
 *
 * Expects these names in the enclosing scope: state, curpos, pos, npos and
 * the output parameters strval, lenval, pos_ptr, poslen, endptr.
 *
 * Every output parameter may be NULL.  The position array is passed to the
 * caller when pos_ptr is supplied and freed otherwise.  poslen is checked
 * separately, so supplying pos_ptr without poslen does not dereference a
 * NULL pointer.
 */
#define RETURN_TOKEN                                  \
    do {                                              \
        if (pos_ptr != NULL) {                        \
            *pos_ptr = pos;                           \
            if (poslen != NULL) {                     \
                *poslen = npos;                       \
            }                                         \
        } else if (pos != NULL) {                     \
            pfree_ext(pos);                           \
        }                                             \
                                                      \
        if (strval != NULL) {                         \
            *strval = state->word;                    \
        }                                             \
        if (lenval != NULL) {                         \
            *lenval = curpos - state->word;           \
        }                                             \
        if (endptr != NULL) {                         \
            *endptr = state->prsbuf;                  \
        }                                             \
        return true;                                  \
    } while (0)
/* State codes of the gettoken_tsvector state machine. */
enum {
    WAITWORD = 1,      /* skipping whitespace, looking for the start of a token */
    WAITENDWORD = 2,   /* inside an unquoted token */
    WAITNEXTCHAR = 3,  /* a backslash was seen; the next character is taken literally */
    WAITENDCMPLX = 4,  /* inside a single-quoted token */
    WAITPOSINFO = 5,   /* quoted token finished; ':' may introduce positions */
    INPOSINFO = 6,     /* expecting the first digit of a position */
    WAITPOSDELIM = 7,  /* after a position: weight letter, ',' or end of token */
    WAITCHARCMPLX = 8  /* a quote was seen inside a quoted token */
};
/* Shorthand used inside gettoken_tsvector, where 'state' is in scope. */
#define PRSSYNTAXERROR prssyntaxerror(state)

/*
 * Report a syntax error that quotes the string stored in state->bufstart.
 * The message names tsquery or tsvector depending on state->is_tsquery.
 */
static void prssyntaxerror(TSVectorParseState state)
{
    if (state->is_tsquery) {
        ereport(ERROR,
            (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in tsquery: \"%s\"", state->bufstart)));
    } else {
        ereport(ERROR,
            (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
    }
}
/*
 * Get next token from string being parsed. Returns true if successful,
 * false if end of input string is reached. On success, these output
 * parameters are filled in:
 *
 * *strval      pointer to token
 * *lenval      length of *strval
 * *pos_ptr     pointer to a palloc'd array of positions and weights
 *              associated with the token. If the caller is not interested
 *              in the information, NULL can be supplied. Otherwise
 *              the caller is responsible for pfreeing the array.
 * *poslen      number of elements in *pos_ptr
 * *endptr      scan resumption point
 *
 * Pass NULL for unwanted output parameters.
 */
bool gettoken_tsvector(
    TSVectorParseState state, char** strval, int* lenval, WordEntryPos** pos_ptr, int* poslen, char** endptr)
{
    /*
     * Character-at-a-time state machine.  Each loop iteration looks at the
     * character at state->prsbuf and then, unless the branch returned, raised
     * an error or executed 'continue', advances prsbuf by one character.
     *
     * The token text is assembled in state->word, so the pointer stored in
     * *strval refers to that buffer rather than to a fresh copy.
     */
    int oldstate = 0;           /* state to resume once an escaped character is copied */
    char* curpos = state->word; /* write position inside state->word */
    int statecode = WAITWORD;

    /*
     * pos collects the comma-delimited list of positions (and weights) that
     * follows the token.
     */
    WordEntryPos* pos = NULL;
    int npos = 0;    /* entries of pos in use */
    int posalen = 0; /* entries of pos allocated */

    while (1) {
        if (statecode == WAITWORD) {
            /* Skip whitespace until something that can start a token shows up. */
            if (*(state->prsbuf) == '\0') {
                return false;
            } else if (t_iseq(state->prsbuf, '\'')) {
                statecode = WAITENDCMPLX;
            } else if (t_iseq(state->prsbuf, '\\')) {
                statecode = WAITNEXTCHAR;
                oldstate = WAITENDWORD;
            } else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) {
                PRSSYNTAXERROR;
            } else if (!t_isspace(state->prsbuf)) {
                /* curpos is still at the start of the buffer, so no resize check here. */
                COPYCHAR(curpos, state->prsbuf);
                curpos += pg_mblen(state->prsbuf);
                statecode = WAITENDWORD;
            }
        } else if (statecode == WAITNEXTCHAR) {
            /* The character after a backslash is copied verbatim. */
            if (*(state->prsbuf) == '\0') {
                ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR), errmsg("there is no escaped character: \"%s\"", state->bufstart)));
            } else {
                RESIZEPRSBUF;
                COPYCHAR(curpos, state->prsbuf);
                curpos += pg_mblen(state->prsbuf);
                Assert(oldstate != 0);
                statecode = oldstate;
            }
        } else if (statecode == WAITENDWORD) {
            /* Inside an unquoted token. */
            if (t_iseq(state->prsbuf, '\\')) {
                statecode = WAITNEXTCHAR;
                oldstate = WAITENDWORD;
            } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
                       (state->oprisdelim && ISOPERATOR(state->prsbuf))) {
                /* Delimiter reached: the token is complete and has no position list. */
                RESIZEPRSBUF;
                if (curpos == state->word) {
                    PRSSYNTAXERROR;
                }
                *(curpos) = '\0';
                RETURN_TOKEN;
            } else if (t_iseq(state->prsbuf, ':')) {
                if (curpos == state->word) {
                    PRSSYNTAXERROR;
                }
                *(curpos) = '\0';
                if (state->oprisdelim) {
                    /* With oprisdelim set, what follows ':' is left to the caller. */
                    RETURN_TOKEN;
                } else {
                    statecode = INPOSINFO;
                }
            } else {
                RESIZEPRSBUF;
                COPYCHAR(curpos, state->prsbuf);
                curpos += pg_mblen(state->prsbuf);
            }
        } else if (statecode == WAITENDCMPLX) {
            /* Inside a single-quoted token. */
            if (t_iseq(state->prsbuf, '\'')) {
                statecode = WAITCHARCMPLX;
            } else if (t_iseq(state->prsbuf, '\\')) {
                statecode = WAITNEXTCHAR;
                oldstate = WAITENDCMPLX;
            } else if (*(state->prsbuf) == '\0') {
                /* Input ended before the closing quote. */
                PRSSYNTAXERROR;
            } else {
                RESIZEPRSBUF;
                COPYCHAR(curpos, state->prsbuf);
                curpos += pg_mblen(state->prsbuf);
            }
        } else if (statecode == WAITCHARCMPLX) {
            /* A quote was seen inside a quoted token: doubled quote, or the closing one? */
            if (t_iseq(state->prsbuf, '\'')) {
                /* Doubled quote: store one literal quote and stay inside the token. */
                RESIZEPRSBUF;
                COPYCHAR(curpos, state->prsbuf);
                curpos += pg_mblen(state->prsbuf);
                statecode = WAITENDCMPLX;
            } else {
                /* The previous quote closed the token. */
                RESIZEPRSBUF;
                *(curpos) = '\0';
                if (curpos == state->word) {
                    PRSSYNTAXERROR;
                }
                if (state->oprisdelim) {
                    RETURN_TOKEN;
                } else {
                    statecode = WAITPOSINFO;
                }
                /* Re-examine the current character in the new state without advancing. */
                continue;
            }
        } else if (statecode == WAITPOSINFO) {
            /* After a quoted token: ':' introduces positions, anything else ends the token. */
            if (t_iseq(state->prsbuf, ':')) {
                statecode = INPOSINFO;
            } else {
                RETURN_TOKEN;
            }
        } else if (statecode == INPOSINFO) {
            /* Expecting the first digit of a position. */
            if (t_isdigit(state->prsbuf)) {
                long rawpos;

                if (posalen == 0) {
                    posalen = 4;
                    pos = (WordEntryPos*)palloc(sizeof(WordEntryPos) * posalen);
                    npos = 0;
                } else if (npos + 1 >= posalen) {
                    posalen *= 2;
                    pos = (WordEntryPos*)repalloc(pos, sizeof(WordEntryPos) * posalen);
                }
                npos++;

                /*
                 * Parse the number once with strtol().  Unlike atoi(), whose
                 * behaviour is undefined when the value does not fit in an
                 * int, strtol() saturates at LONG_MAX, so an oversized
                 * position is clamped by LIMITPOS like any other large one.
                 */
                rawpos = strtol(state->prsbuf, NULL, 10);
                WEP_SETPOS(pos[npos - 1], (unsigned int)LIMITPOS(rawpos));
                if (WEP_GETPOS(pos[npos - 1]) == 0) {
                    ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                            errmsg("wrong position info in tsvector: \"%s\"", state->bufstart)));
                }
                WEP_SETWEIGHT(pos[npos - 1], 0);
                statecode = WAITPOSDELIM;
            } else {
                PRSSYNTAXERROR;
            }
        } else if (statecode == WAITPOSDELIM) {
            /*
             * After the first digit of a position.  A weight letter is
             * rejected when the stored weight is already non-zero.
             */
            if (t_iseq(state->prsbuf, ',')) {
                statecode = INPOSINFO;
            } else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) {
                if (WEP_GETWEIGHT(pos[npos - 1])) {
                    PRSSYNTAXERROR;
                }
                WEP_SETWEIGHT(pos[npos - 1], 3);
            } else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) {
                if (WEP_GETWEIGHT(pos[npos - 1])) {
                    PRSSYNTAXERROR;
                }
                WEP_SETWEIGHT(pos[npos - 1], 2);
            } else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) {
                if (WEP_GETWEIGHT(pos[npos - 1])) {
                    PRSSYNTAXERROR;
                }
                WEP_SETWEIGHT(pos[npos - 1], 1);
            } else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) {
                if (WEP_GETWEIGHT(pos[npos - 1])) {
                    PRSSYNTAXERROR;
                }
                WEP_SETWEIGHT(pos[npos - 1], 0);
            } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0') {
                RETURN_TOKEN;
            } else if (!t_isdigit(state->prsbuf)) {
                /* The remaining digits of the number just parsed are skipped; anything else is an error. */
                PRSSYNTAXERROR;
            }
        } else {
            ereport(ERROR,
                (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE),
                    errmsg("unrecognized state in gettoken_tsvector: %d", statecode)));
        }

        /* Move on to the next input character. */
        state->prsbuf += pg_mblen(state->prsbuf);
    }

    /* Not reached: the loop above only exits through return or ereport. */
    return false;
}