/*-------------------------------------------------------------------------
 *
 * scanner.c
 *	  lexical scanning for PL/pgPSM
 *
 *
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/pl/plpgpsm/src/scanner.c
 *
 *-------------------------------------------------------------------------
 */
#include "plpgpsm.h"

#include "mb/pg_wchar.h"
#include "parser/scanner.h"

#include "lib/stringinfo.h"

#include "gram.h"			/* must be after parser/scanner.h */

/*
 * Expands one keyword entry into a brace-enclosed initializer
 * (name, token value, category) followed by a comma, so that the keyword
 * tables in this file can list their entries without separators.
 */
#define PG_KEYWORD(a,b,c) {a,b,c},

/*
 * A word about keywords:
 *
 *
 * For the most part, the reserved keywords are those that start a PL/pgPSM
 * statement (and so would conflict with an assignment to a variable of the
 * same name).	We also don't sweat it much about reserving keywords that
 * are reserved in the core grammar.  Try to avoid reserving other words.
 */

/*
 * Lists of keyword (name, token-value, category) entries.
 *
 * !!WARNING!!: These lists must be sorted by ASCII name, because binary
 *		 search is used to locate entries.
 *
 * Be careful not to put the same word in both lists.  Also be sure that
 * gram.y's unreserved_keyword production agrees with the second list.
 */

/*
 * Reserved keywords.
 *
 * This table is handed to scanner_init() by plpgpsm_scanner_init(); the
 * core lexer is expected to return each word's own token code instead of
 * IDENT.  Entries must stay in ASCII order (binary search).
 */
static const ScanKeyword reserved_keywords[] = {
	PG_KEYWORD("begin", BEGIN, RESERVED_KEYWORD)
	PG_KEYWORD("case", CASE, RESERVED_KEYWORD)
	PG_KEYWORD("close", CLOSE, RESERVED_KEYWORD)
	PG_KEYWORD("condition", CONDITION, RESERVED_KEYWORD)
	PG_KEYWORD("cursor", CURSOR, RESERVED_KEYWORD)
	PG_KEYWORD("declare", DECLARE, RESERVED_KEYWORD)
	PG_KEYWORD("default", DEFAULT, RESERVED_KEYWORD)
	PG_KEYWORD("do", DO, RESERVED_KEYWORD)
	PG_KEYWORD("else", ELSE, RESERVED_KEYWORD)
	PG_KEYWORD("elseif", ELSEIF, RESERVED_KEYWORD)
	PG_KEYWORD("end", END, RESERVED_KEYWORD)
	PG_KEYWORD("fetch", FETCH, RESERVED_KEYWORD)
	PG_KEYWORD("for", FOR, RESERVED_KEYWORD)
	PG_KEYWORD("from", FROM, RESERVED_KEYWORD)
	PG_KEYWORD("get", GET, RESERVED_KEYWORD)
	PG_KEYWORD("handler", HANDLER, RESERVED_KEYWORD)
	PG_KEYWORD("if", IF, RESERVED_KEYWORD)
	PG_KEYWORD("into", INTO, RESERVED_KEYWORD)
	PG_KEYWORD("iterate", ITERATE, RESERVED_KEYWORD)
	PG_KEYWORD("leave", LEAVE, RESERVED_KEYWORD)
	PG_KEYWORD("loop", LOOP, RESERVED_KEYWORD)
	PG_KEYWORD("open", OPEN, RESERVED_KEYWORD)
	PG_KEYWORD("repeat", REPEAT, RESERVED_KEYWORD)
	PG_KEYWORD("resignal", RESIGNAL, RESERVED_KEYWORD)
	PG_KEYWORD("return", RETURN, RESERVED_KEYWORD)
	PG_KEYWORD("select", SELECT, RESERVED_KEYWORD)
	PG_KEYWORD("set", SET, RESERVED_KEYWORD)
	PG_KEYWORD("signal", SIGNAL, RESERVED_KEYWORD)
	PG_KEYWORD("then", THEN, RESERVED_KEYWORD)
	PG_KEYWORD("until", UNTIL, RESERVED_KEYWORD)
	PG_KEYWORD("value", VALUE, RESERVED_KEYWORD)
	PG_KEYWORD("when", WHEN, RESERVED_KEYWORD)
	PG_KEYWORD("while", WHILE, RESERVED_KEYWORD)
};

/* Number of entries in reserved_keywords[] */
static const int num_reserved_keywords = lengthof(reserved_keywords);

/*
 * Unreserved keywords.
 *
 * plpgpsm_yylex() looks up each stand-alone, unquoted identifier in this
 * table; on a match it returns the keyword's token code instead of T_WORD.
 * Entries must stay in ASCII order (binary search).
 */
static const ScanKeyword unreserved_keywords[] = {
	PG_KEYWORD("as", AS, UNRESERVED_KEYWORD)
	PG_KEYWORD("atomic", ATOMIC, UNRESERVED_KEYWORD)
	PG_KEYWORD("condition_identifier", CONDITION_IDENTIFIER, UNRESERVED_KEYWORD)
	PG_KEYWORD("continue", CONTINUE, UNRESERVED_KEYWORD)
	PG_KEYWORD("current", CURRENT, UNRESERVED_KEYWORD)
	PG_KEYWORD("detail_text", DETAIL_TEXT, UNRESERVED_KEYWORD)
	PG_KEYWORD("diagnostics", DIAGNOSTICS, UNRESERVED_KEYWORD)
	PG_KEYWORD("exit", EXIT, UNRESERVED_KEYWORD)
	PG_KEYWORD("found", FOUND, UNRESERVED_KEYWORD)
	PG_KEYWORD("hint_text", HINT_TEXT, UNRESERVED_KEYWORD)
	PG_KEYWORD("message_text", MESSAGE_TEXT, UNRESERVED_KEYWORD)
	PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
	PG_KEYWORD("not", NOT, UNRESERVED_KEYWORD)
	PG_KEYWORD("print", PRINT, UNRESERVED_KEYWORD)
	PG_KEYWORD("returned_sqlcode", RETURNED_SQLCODE, UNRESERVED_KEYWORD)
	PG_KEYWORD("returned_sqlstate", RETURNED_SQLSTATE, UNRESERVED_KEYWORD)
	PG_KEYWORD("row_count", ROW_COUNT, UNRESERVED_KEYWORD)
	PG_KEYWORD("scroll", SCROLL, UNRESERVED_KEYWORD)
	PG_KEYWORD("sqlcode", SQLCODE, UNRESERVED_KEYWORD)
	PG_KEYWORD("sqlexception", SQLEXCEPTION, UNRESERVED_KEYWORD)
	PG_KEYWORD("sqlstate", SQLSTATE, UNRESERVED_KEYWORD)
	PG_KEYWORD("sqlwarning", SQLWARNING, UNRESERVED_KEYWORD)
	PG_KEYWORD("stacked", STACKED, UNRESERVED_KEYWORD)
	PG_KEYWORD("undo", UNDO, UNRESERVED_KEYWORD)
};

/* Number of entries in unreserved_keywords[] */
static const int num_unreserved_keywords = lengthof(unreserved_keywords);


/*
 * Auxiliary data about a token (other than the token type).
 *
 * One of these travels with every token code through internal_yylex() and
 * the pushback stack, so that a pushed-back token can be re-read with the
 * same semantic value, location and length it had originally.
 */
typedef struct
{
	YYSTYPE		lval;			/* semantic information */
	YYLTYPE		lloc;			/* offset in scanbuf */
	int			leng;			/* length in bytes */
} TokenAuxData;

/*
 * Scanner working state.  At some point we might wish to fold all this
 * into a YY_EXTRA struct.	For the moment, there is no need for plpgpsm's
 * lexer to be re-entrant, and the notational burden of passing a yyscanner
 * pointer around is great enough to not want to do it without need.
 */

/*
 * The stuff the core lexer needs: the scanner handle returned by
 * scanner_init(), and the extra-data struct whose scanbuf member holds the
 * text that token locations are offsets into.
 */
static core_yyscan_t yyscanner = NULL;
static core_yy_extra_type core_yy;

/*
 * The original input string.  Set by plpgpsm_scanner_init() and reset to
 * NULL by plpgpsm_scanner_finish().
 */
static const char *scanorig;

/* Current token's length (corresponds to plpgpsm_yylval and plpgpsm_yylloc) */
static int	plpgpsm_yyleng;

/*
 * Token pushback stack.  internal_yylex() pops from it before asking the
 * core lexer for more input; push_back_token() raises an error once
 * MAX_PUSHBACKS entries are already stacked.
 */
#define MAX_PUSHBACKS 4

static int	num_pushbacks;
static int	pushback_token[MAX_PUSHBACKS];
static TokenAuxData pushback_auxdata[MAX_PUSHBACKS];

/*
 * State for plpgpsm_location_to_lineno(): start of the most recently
 * located line, the newline that terminates it (NULL when no further
 * newline exists), and that line's number (counting from 1).
 */
static const char *cur_line_start;
static const char *cur_line_end;
static int	cur_line_num;

/* Internal functions */
static int	internal_yylex(TokenAuxData *auxdata);
static void push_back_token(int token, TokenAuxData *auxdata);
static void location_lineno_init(void);

/* Records an identifier's text and whether it was double-quoted */
static void parse_word(char *word1, const char *yytxt, PLword *word);


/*
 * This is the yylex routine called from the PL/pgSQL grammar.
 * It is a wrapper around the core lexer,
 *
 */
int
plpgpsm_yylex(void)
{
	int			tok1;
	TokenAuxData aux1;
	const ScanKeyword *kw;

	tok1 = internal_yylex(&aux1);
	if (tok1 == IDENT)
	{
		int			tok2;
		TokenAuxData aux2;

		tok2 = internal_yylex(&aux2);
		if (tok2 == '.')
		{
			int			tok3;
			TokenAuxData aux3;

			tok3 = internal_yylex(&aux3);
			if (tok3 == IDENT)
			{
				int			tok4;
				TokenAuxData aux4;

				tok4 = internal_yylex(&aux4);
				if (tok4 == '.')
				{
					int			tok5;
					TokenAuxData aux5;

					tok5 = internal_yylex(&aux5);
					if (tok5 == IDENT)
					{
						tok1 = T_CWORD;
					}
					else
					{
						/* not A.B.C, so just process A.B */
						push_back_token(tok5, &aux5);
						push_back_token(tok4, &aux4);
						tok1 = T_CWORD;
					}
				}
				else
				{
					/* not A.B.C, so just process A.B */
					push_back_token(tok4, &aux4);
					tok1 = T_CWORD;
				}
			}
			else
			{
				/* not A.B, so just process A */
				push_back_token(tok3, &aux3);
				push_back_token(tok2, &aux2);
				parse_word(aux1.lval.str, core_yy.scanbuf + aux1.lloc, &aux1.lval.word);

				if (!aux1.lval.word.quoted &&
						 (kw = ScanKeywordLookup(aux1.lval.word.ident,
												 unreserved_keywords,
												 num_unreserved_keywords)))
				{
					aux1.lval.keyword = kw->name;
					tok1 = kw->value;
				}
				else
					tok1 = T_WORD;
			}
		}
		else
		{
			/* not A.B, so just process A */
			push_back_token(tok2, &aux2);
			parse_word(aux1.lval.str, core_yy.scanbuf + aux1.lloc, &aux1.lval.word);

			if (!aux1.lval.word.quoted &&
					 (kw = ScanKeywordLookup(aux1.lval.word.ident,
											 unreserved_keywords,
											 num_unreserved_keywords)))
			{
				aux1.lval.keyword = kw->name;
				tok1 = kw->value;
			}
			else
				tok1 = T_WORD;
		}
	}

	plpgpsm_yylval = aux1.lval;
	plpgpsm_yylloc = aux1.lloc;
	plpgpsm_yyleng = aux1.leng;

	return tok1;
}

/*
 * Fill in a PLword for a single identifier.
 *
 * word1 is the identifier string to store; yytxt points at the token's
 * text as it appears in the scan buffer, and is inspected only to see
 * whether the token begins with a double quote.
 */
static void
parse_word(char *word1, const char *yytxt, PLword *word)
{
	word->quoted = (*yytxt == '"');
	word->ident = word1;
}

/*
 * Internal yylex function.  This wraps the core lexer and adds a token
 * pushback stack on top of it.
 *
 * If any tokens have been pushed back, the most recently pushed one is
 * returned together with its saved auxiliary data.  Otherwise the core
 * lexer is called, writing its semantic value into the core_yystype member
 * of auxdata->lval and its location into auxdata->lloc.
 */
static int
internal_yylex(TokenAuxData *auxdata)
{
	int			token;

	if (num_pushbacks <= 0)
	{
		/* Nothing queued, so fetch a fresh token from the core lexer */
		token = core_yylex(&auxdata->lval.core_yystype,
						   &auxdata->lloc,
						   yyscanner);

		/*
		 * Record the token's length now, while the scan buffer is still
		 * terminated right after it; later scanning changes that.
		 */
		auxdata->leng = strlen(core_yy.scanbuf + auxdata->lloc);

		return token;
	}

	/* Pop the top entry of the pushback stack */
	num_pushbacks--;
	*auxdata = pushback_auxdata[num_pushbacks];
	token = pushback_token[num_pushbacks];

	return token;
}

/*
 * Save a token and its auxiliary data on the pushback stack, so that the
 * next internal_yylex() call returns it again.
 *
 * Raises an error if the stack already holds MAX_PUSHBACKS entries.
 */
static void
push_back_token(int token, TokenAuxData *auxdata)
{
	int			slot = num_pushbacks;

	if (slot >= MAX_PUSHBACKS)
		elog(ERROR, "too many tokens pushed back");

	pushback_auxdata[slot] = *auxdata;
	pushback_token[slot] = token;
	num_pushbacks = slot + 1;
}

/*
 * Push back a single token to be re-read by next plpgpsm_yylex() call.
 *
 * The token is stored together with the current values of plpgpsm_yylval,
 * plpgpsm_yylloc and plpgpsm_yyleng.
 *
 * NOTE: this does not cause yylval or yylloc to "back up".  Also, it
 * is not a good idea to push back a token code other than what you read.
 */
void
plpgpsm_push_back_token(int token)
{
	TokenAuxData current;

	current.leng = plpgpsm_yyleng;
	current.lloc = plpgpsm_yylloc;
	current.lval = plpgpsm_yylval;

	push_back_token(token, &current);
}

/*
 * Append a slice of the original function text to "buf".
 *
 * The slice starts at byte offset startlocation and stops just before
 * endlocation; both are offsets into the string given to
 * plpgpsm_scanner_init().  startlocation must not exceed endlocation.
 */
void
plpgpsm_append_source_text(StringInfo buf,
						   int startlocation, int endlocation)
{
	const char *chunk;
	int			nbytes;

	Assert(startlocation <= endlocation);

	chunk = scanorig + startlocation;
	nbytes = endlocation - startlocation;

	appendBinaryStringInfo(buf, chunk, nbytes);
}

/*
 * Peek two tokens ahead in the input stream without consuming them.
 *
 * The first token is returned in *tok1_p and the second in *tok2_p.  Their
 * locations in the query are stored in *tok1_loc and *tok2_loc; either
 * location pointer may be NULL if the caller does not need it.
 *
 * NB: no variable or unreserved keyword lookup is performed here, they will
 * be returned as IDENT. Reserved keywords are resolved as usual.
 */
void
plpgpsm_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
{
	TokenAuxData first_aux;
	TokenAuxData second_aux;
	int			first_tok;
	int			second_tok;

	first_tok = internal_yylex(&first_aux);
	second_tok = internal_yylex(&second_aux);

	/* Report what we saw ... */
	*tok1_p = first_tok;
	if (tok1_loc != NULL)
		*tok1_loc = first_aux.lloc;

	*tok2_p = second_tok;
	if (tok2_loc != NULL)
		*tok2_loc = second_aux.lloc;

	/* ... and un-read it, second token first so the first pops first */
	push_back_token(second_tok, &second_aux);
	push_back_token(first_tok, &first_aux);
}

/*
 * plpgpsm_scanner_errposition
 *		Attach an error cursor position to the error being reported.
 *
 * "location" is a byte offset into the function source text; a negative
 * value means the location is unknown, in which case nothing is done and
 * 0 is returned.  The same happens when no source string is registered.
 *
 * This is meant to be called from inside an ereport() argument list.
 *
 * Note that this can only be used for messages emitted during initial
 * parsing of a plpgpsm function, since it requires the scanorig string
 * to still be available.
 */
int
plpgpsm_scanner_errposition(int location)
{
	int			charpos;

	/* no-op if location is unknown */
	if (scanorig == NULL || location < 0)
		return 0;

	/* The error machinery wants a 1-based character index, not bytes */
	charpos = pg_mbstrlen_with_len(scanorig, location) + 1;
	(void) internalerrposition(charpos);

	/* Hand over the function body so the position can be shown in it */
	return internalerrquery(scanorig);
}

/*
 * plpgpsm_yyerror
 *		Report a lexer or grammar error.
 *
 * Raises a syntax error whose text is the (translated) message followed by
 * either "at end of input" or "at or near" the offending token.
 *
 * The message's cursor position refers to the current token (the one
 * last returned by plpgpsm_yylex()).
 * This is OK for syntax error messages from the Bison parser, because Bison
 * parsers report error as soon as the first unparsable token is reached.
 * Beware of using yyerror for other purposes, as the cursor position might
 * be misleading!
 */
void
plpgpsm_yyerror(const char *message)
{
	char	   *token_text = core_yy.scanbuf + plpgpsm_yylloc;

	if (*token_text != '\0')
	{
		/*
		 * Lookahead may have made flex put back the character that follows
		 * the token.  Cut the buffer off at the token's end again so only
		 * this one token is quoted in the message; scanbuf is not needed
		 * any more after this point.
		 */
		token_text[plpgpsm_yyleng] = '\0';

		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
		/* translator: first %s is typically the translation of "syntax error" */
				 errmsg("%s at or near \"%s\"", _(message), token_text),
				 plpgpsm_scanner_errposition(plpgpsm_yylloc)));
	}
	else
	{
		/* The current token is the end of the input */
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
		/* translator: %s is typically the translation of "syntax error" */
				 errmsg("%s at end of input", _(message)),
				 plpgpsm_scanner_errposition(plpgpsm_yylloc)));
	}
}

/*
 * Translate a location (byte offset in the function source text) into a
 * line number, counting from 1.  Returns 0 if the location is negative or
 * no source text is registered.
 *
 * Callers usually ask about ever-increasing locations, so the boundaries
 * of the line found last time are remembered and the search resumes from
 * there.  A location that lies before the remembered line forces a rescan
 * from the top.  A location that falls exactly on a newline belongs to the
 * line which that newline terminates.
 */
int
plpgpsm_location_to_lineno(int location)
{
	const char *target;

	/* garbage in, garbage out */
	if (scanorig == NULL || location < 0)
		return 0;

	target = scanorig + location;

	/* going backwards: correct, but not fast */
	if (target < cur_line_start)
		location_lineno_init();

	/* step forward one line at a time until the target is inside it */
	while (cur_line_end != NULL && cur_line_end < target)
	{
		cur_line_num++;
		cur_line_start = cur_line_end + 1;
		cur_line_end = strchr(cur_line_start, '\n');
	}

	return cur_line_num;
}

/*
 * Initialize or reset the state for plpgpsm_location_to_lineno(): make
 * line 1, which begins at the start of scanorig, the current line.
 */
static void
location_lineno_init(void)
{
	cur_line_num = 1;
	cur_line_start = scanorig;

	/* NULL here means the whole text is a single line */
	cur_line_end = strchr(scanorig, '\n');
}

/*
 * Return the most recently computed lineno, i.e. the line number that
 * plpgpsm_location_to_lineno() last advanced to, or 1 if the state has only
 * been (re)initialized since.
 *
 * This does not look at scanorig, and plpgpsm_scanner_finish() does not
 * reset the stored value.
 */
int
plpgpsm_latest_lineno(void)
{
	return cur_line_num;
}


/*
 * Set up the scanner for a new function body.  Must be called before any
 * actual parsing is done.
 *
 * Note: the passed "str" must remain valid until plpgpsm_scanner_finish().
 * Although it is not fed directly to flex, we need the original string
 * to cite in error messages.
 */
void
plpgpsm_scanner_init(const char *str)
{
	/* Start the core scanner, telling it about our reserved keywords */
	yyscanner = scanner_init(str, &core_yy,
							 reserved_keywords, num_reserved_keywords);

	/* Begin with an empty pushback stack */
	num_pushbacks = 0;

	/*
	 * Keep a pointer to the caller's string.  Flex modifies the scanner's
	 * scanbuf on-the-fly but never this original, and token locations
	 * (offsets from string start) are applied to both.
	 */
	scanorig = str;

	/* Line tracking starts at the top of the new string */
	location_lineno_init();
}

/*
 * Called after parsing is done to clean up after plpgpsm_scanner_init()
 */
void
plpgpsm_scanner_finish(void)
{
	/* release storage */
	scanner_finish(yyscanner);
	/* avoid leaving any dangling pointers */
	yyscanner = NULL;
	scanorig = NULL;
}
