|  | /* | 
|  | ** 2001 September 15 | 
|  | ** | 
|  | ** The author disclaims copyright to this source code.  In place of | 
|  | ** a legal notice, here is a blessing: | 
|  | ** | 
|  | **    May you do good and not evil. | 
|  | **    May you find forgiveness for yourself and forgive others. | 
|  | **    May you share freely, never taking more than you give. | 
|  | ** | 
|  | ************************************************************************* | 
|  | ** An tokenizer for SQL | 
|  | ** | 
|  | ** This file contains C code that splits an SQL input string up into | 
|  | ** individual tokens and sends those tokens one-by-one over to the | 
|  | ** parser for analysis. | 
|  | */ | 
|  |  | 
|  | #include <ctype.h> | 
|  | #include <stdarg.h> | 
|  | #include <stdlib.h> | 
|  |  | 
|  | #include "windef.h" | 
|  | #include "winbase.h" | 
|  | #include "wine/debug.h" | 
|  | #include "winnls.h" | 
|  | #include "query.h" | 
|  | #include "sql.tab.h" | 
|  |  | 
|  | WINE_DEFAULT_DEBUG_CHANNEL(msi); | 
|  |  | 
|  | /* | 
|  | ** All the keywords of the SQL language are stored as in a hash | 
|  | ** table composed of instances of the following structure. | 
|  | */ | 
|  | typedef struct Keyword Keyword; | 
|  | struct Keyword { | 
|  | const char *zName;             /* The keyword name */ | 
|  | int tokenType;           /* The token value for this keyword */ | 
|  | }; | 
|  |  | 
|  | /* | 
|  | ** These are the keywords | 
|  | */ | 
|  | static const Keyword aKeywordTable[] = { | 
|  | { "ABORT", TK_ABORT }, | 
|  | { "AFTER", TK_AFTER }, | 
|  | { "ALL", TK_ALL }, | 
|  | { "AND", TK_AND }, | 
|  | { "AS", TK_AS }, | 
|  | { "ASC", TK_ASC }, | 
|  | { "BEFORE", TK_BEFORE }, | 
|  | { "BEGIN", TK_BEGIN }, | 
|  | { "BETWEEN", TK_BETWEEN }, | 
|  | { "BY", TK_BY }, | 
|  | { "CASCADE", TK_CASCADE }, | 
|  | { "CASE", TK_CASE }, | 
|  | { "CHAR", TK_CHAR }, | 
|  | { "CHARACTER", TK_CHAR }, | 
|  | { "CHECK", TK_CHECK }, | 
|  | { "CLUSTER", TK_CLUSTER }, | 
|  | { "COLLATE", TK_COLLATE }, | 
|  | { "COMMIT", TK_COMMIT }, | 
|  | { "CONFLICT", TK_CONFLICT }, | 
|  | { "CONSTRAINT", TK_CONSTRAINT }, | 
|  | { "COPY", TK_COPY }, | 
|  | { "CREATE", TK_CREATE }, | 
|  | { "CROSS", TK_JOIN_KW }, | 
|  | { "DEFAULT", TK_DEFAULT }, | 
|  | { "DEFERRED", TK_DEFERRED }, | 
|  | { "DEFERRABLE", TK_DEFERRABLE }, | 
|  | { "DELETE", TK_DELETE }, | 
|  | { "DELIMITERS", TK_DELIMITERS }, | 
|  | { "DESC", TK_DESC }, | 
|  | { "DISTINCT", TK_DISTINCT }, | 
|  | { "DROP", TK_DROP }, | 
|  | { "END", TK_END }, | 
|  | { "EACH", TK_EACH }, | 
|  | { "ELSE", TK_ELSE }, | 
|  | { "EXCEPT", TK_EXCEPT }, | 
|  | { "EXPLAIN", TK_EXPLAIN }, | 
|  | { "FAIL", TK_FAIL }, | 
|  | { "FOR", TK_FOR }, | 
|  | { "FOREIGN", TK_FOREIGN }, | 
|  | { "FROM", TK_FROM }, | 
|  | { "FULL", TK_JOIN_KW }, | 
|  | { "GLOB", TK_GLOB }, | 
|  | { "GROUP", TK_GROUP }, | 
|  | { "HAVING", TK_HAVING }, | 
|  | { "HOLD", TK_HOLD }, | 
|  | { "IGNORE", TK_IGNORE }, | 
|  | { "IMMEDIATE", TK_IMMEDIATE }, | 
|  | { "IN", TK_IN }, | 
|  | { "INDEX", TK_INDEX }, | 
|  | { "INITIALLY", TK_INITIALLY }, | 
|  | { "INNER", TK_JOIN_KW }, | 
|  | { "INSERT", TK_INSERT }, | 
|  | { "INSTEAD", TK_INSTEAD }, | 
|  | { "INT", TK_INT }, | 
|  | { "INTERSECT", TK_INTERSECT }, | 
|  | { "INTO", TK_INTO }, | 
|  | { "IS", TK_IS }, | 
|  | { "ISNULL", TK_ISNULL }, | 
|  | { "JOIN", TK_JOIN }, | 
|  | { "KEY", TK_KEY }, | 
|  | { "LEFT", TK_JOIN_KW }, | 
|  | { "LIKE", TK_LIKE }, | 
|  | { "LIMIT", TK_LIMIT }, | 
|  | { "LOCALIZABLE", TK_LOCALIZABLE }, | 
|  | { "LONG", TK_LONG }, | 
|  | { "LONGCHAR", TK_LONGCHAR }, | 
|  | { "MATCH", TK_MATCH }, | 
|  | { "NATURAL", TK_JOIN_KW }, | 
|  | { "NOT", TK_NOT }, | 
|  | { "NOTNULL", TK_NOTNULL }, | 
|  | { "NULL", TK_NULL }, | 
|  | { "OBJECT", TK_OBJECT }, | 
|  | { "OF", TK_OF }, | 
|  | { "OFFSET", TK_OFFSET }, | 
|  | { "ON", TK_ON }, | 
|  | { "OR", TK_OR }, | 
|  | { "ORDER", TK_ORDER }, | 
|  | { "OUTER", TK_JOIN_KW }, | 
|  | { "PRAGMA", TK_PRAGMA }, | 
|  | { "PRIMARY", TK_PRIMARY }, | 
|  | { "RAISE", TK_RAISE }, | 
|  | { "REFERENCES", TK_REFERENCES }, | 
|  | { "REPLACE", TK_REPLACE }, | 
|  | { "RESTRICT", TK_RESTRICT }, | 
|  | { "RIGHT", TK_JOIN_KW }, | 
|  | { "ROLLBACK", TK_ROLLBACK }, | 
|  | { "ROW", TK_ROW }, | 
|  | { "SELECT", TK_SELECT }, | 
|  | { "SET", TK_SET }, | 
|  | { "SHORT", TK_SHORT }, | 
|  | { "STATEMENT", TK_STATEMENT }, | 
|  | { "TABLE", TK_TABLE }, | 
|  | { "TEMP", TK_TEMP }, | 
|  | { "TEMPORARY", TK_TEMP }, | 
|  | { "THEN", TK_THEN }, | 
|  | { "TRANSACTION", TK_TRANSACTION }, | 
|  | { "TRIGGER", TK_TRIGGER }, | 
|  | { "UNION", TK_UNION }, | 
|  | { "UNIQUE", TK_UNIQUE }, | 
|  | { "UPDATE", TK_UPDATE }, | 
|  | { "USING", TK_USING }, | 
|  | { "VACUUM", TK_VACUUM }, | 
|  | { "VALUES", TK_VALUES }, | 
|  | { "VIEW", TK_VIEW }, | 
|  | { "WHEN", TK_WHEN }, | 
|  | { "WHERE", TK_WHERE }, | 
|  | }; | 
|  |  | 
|  | #define KEYWORD_COUNT ( sizeof aKeywordTable/sizeof (Keyword) ) | 
|  |  | 
|  | /* | 
|  | ** This function looks up an identifier to determine if it is a | 
|  | ** keyword.  If it is a keyword, the token code of that keyword is | 
|  | ** returned.  If the input is not a keyword, TK_ID is returned. | 
|  | */ | 
|  | int sqliteKeywordCode(const WCHAR *z, int n){ | 
|  | int i, len; | 
|  | char buffer[0x10]; | 
|  |  | 
|  | len = WideCharToMultiByte( CP_ACP, 0, z, n, buffer, sizeof buffer, NULL, NULL ); | 
|  | for(i=0; i<len; i++) | 
|  | buffer[i] = toupper(buffer[i]); | 
|  | for(i=0; i<KEYWORD_COUNT; i++) | 
|  | { | 
|  | if(memcmp(buffer, aKeywordTable[i].zName, len)) | 
|  | continue; | 
|  | if(strlen(aKeywordTable[i].zName) == len ) | 
|  | return aKeywordTable[i].tokenType; | 
|  | } | 
|  | return TK_ID; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | ** If X is a character that can be used in an identifier then | 
|  | ** isIdChar[X] will be 1.  Otherwise isIdChar[X] will be 0. | 
|  | ** | 
|  | ** In this implementation, an identifier can be a string of | 
|  | ** alphabetic characters, digits, and "_" plus any character | 
|  | ** with the high-order bit set.  The latter rule means that | 
|  | ** any sequence of UTF-8 characters or characters taken from | 
|  | ** an extended ISO8859 character set can form an identifier. | 
|  | */ | 
|  | static const char isIdChar[] = { | 
|  | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | 
|  | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x */ | 
|  | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 1x */ | 
|  | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */ | 
|  | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */ | 
|  | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 8x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 9x */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Ax */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Bx */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Cx */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Dx */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Ex */ | 
|  | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* Fx */ | 
|  | }; | 
|  |  | 
|  |  | 
|  | /* | 
|  | ** Return the length of the token that begins at z[0].  Return | 
|  | ** -1 if the token is (or might be) incomplete.  Store the token | 
|  | ** type in *tokenType before returning. | 
|  | */ | 
|  | int sqliteGetToken(const WCHAR *z, int *tokenType){ | 
|  | int i; | 
|  | switch( *z ){ | 
|  | case ' ': case '\t': case '\n': case '\f': case '\r': { | 
|  | for(i=1; isspace(z[i]); i++){} | 
|  | *tokenType = TK_SPACE; | 
|  | return i; | 
|  | } | 
|  | case '-': { | 
|  | if( z[1]==0 ) return -1; | 
|  | if( z[1]=='-' ){ | 
|  | for(i=2; z[i] && z[i]!='\n'; i++){} | 
|  | *tokenType = TK_COMMENT; | 
|  | return i; | 
|  | } | 
|  | *tokenType = TK_MINUS; | 
|  | return 1; | 
|  | } | 
|  | case '(': { | 
|  | if( z[1]=='+' && z[2]==')' ){ | 
|  | *tokenType = TK_ORACLE_OUTER_JOIN; | 
|  | return 3; | 
|  | }else{ | 
|  | *tokenType = TK_LP; | 
|  | return 1; | 
|  | } | 
|  | } | 
|  | case ')': { | 
|  | *tokenType = TK_RP; | 
|  | return 1; | 
|  | } | 
|  | case ';': { | 
|  | *tokenType = TK_SEMI; | 
|  | return 1; | 
|  | } | 
|  | case '+': { | 
|  | *tokenType = TK_PLUS; | 
|  | return 1; | 
|  | } | 
|  | case '*': { | 
|  | *tokenType = TK_STAR; | 
|  | return 1; | 
|  | } | 
|  | case '/': { | 
|  | if( z[1]!='*' || z[2]==0 ){ | 
|  | *tokenType = TK_SLASH; | 
|  | return 1; | 
|  | } | 
|  | for(i=3; z[i] && (z[i]!='/' || z[i-1]!='*'); i++){} | 
|  | if( z[i] ) i++; | 
|  | *tokenType = TK_COMMENT; | 
|  | return i; | 
|  | } | 
|  | case '%': { | 
|  | *tokenType = TK_REM; | 
|  | return 1; | 
|  | } | 
|  | case '=': { | 
|  | *tokenType = TK_EQ; | 
|  | return 1 + (z[1]=='='); | 
|  | } | 
|  | case '<': { | 
|  | if( z[1]=='=' ){ | 
|  | *tokenType = TK_LE; | 
|  | return 2; | 
|  | }else if( z[1]=='>' ){ | 
|  | *tokenType = TK_NE; | 
|  | return 2; | 
|  | }else if( z[1]=='<' ){ | 
|  | *tokenType = TK_LSHIFT; | 
|  | return 2; | 
|  | }else{ | 
|  | *tokenType = TK_LT; | 
|  | return 1; | 
|  | } | 
|  | } | 
|  | case '>': { | 
|  | if( z[1]=='=' ){ | 
|  | *tokenType = TK_GE; | 
|  | return 2; | 
|  | }else if( z[1]=='>' ){ | 
|  | *tokenType = TK_RSHIFT; | 
|  | return 2; | 
|  | }else{ | 
|  | *tokenType = TK_GT; | 
|  | return 1; | 
|  | } | 
|  | } | 
|  | case '!': { | 
|  | if( z[1]!='=' ){ | 
|  | *tokenType = TK_ILLEGAL; | 
|  | return 2; | 
|  | }else{ | 
|  | *tokenType = TK_NE; | 
|  | return 2; | 
|  | } | 
|  | } | 
|  | case '|': { | 
|  | if( z[1]!='|' ){ | 
|  | *tokenType = TK_BITOR; | 
|  | return 1; | 
|  | }else{ | 
|  | *tokenType = TK_CONCAT; | 
|  | return 2; | 
|  | } | 
|  | } | 
|  | case ',': { | 
|  | *tokenType = TK_COMMA; | 
|  | return 1; | 
|  | } | 
|  | case '&': { | 
|  | *tokenType = TK_BITAND; | 
|  | return 1; | 
|  | } | 
|  | case '~': { | 
|  | *tokenType = TK_BITNOT; | 
|  | return 1; | 
|  | } | 
|  | case '`': case '\'': case '"': { | 
|  | int delim = z[0]; | 
|  | for(i=1; z[i]; i++){ | 
|  | if( z[i]==delim ){ | 
|  | if( z[i+1]==delim ){ | 
|  | i++; | 
|  | }else{ | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  | if( z[i] ) i++; | 
|  | *tokenType = TK_STRING; | 
|  | return i; | 
|  | } | 
|  | case '.': { | 
|  | if( !isdigit(z[1]) ){ | 
|  | *tokenType = TK_DOT; | 
|  | return 1; | 
|  | } | 
|  | /* Fall thru into the next case */ | 
|  | } | 
|  | case '0': case '1': case '2': case '3': case '4': | 
|  | case '5': case '6': case '7': case '8': case '9': { | 
|  | *tokenType = TK_INTEGER; | 
|  | for(i=1; isdigit(z[i]); i++){} | 
|  | if( z[i]=='.' ){ | 
|  | i++; | 
|  | while( isdigit(z[i]) ){ i++; } | 
|  | *tokenType = TK_FLOAT; | 
|  | } | 
|  | if( (z[i]=='e' || z[i]=='E') && | 
|  | ( isdigit(z[i+1]) | 
|  | || ((z[i+1]=='+' || z[i+1]=='-') && isdigit(z[i+2])) | 
|  | ) | 
|  | ){ | 
|  | i += 2; | 
|  | while( isdigit(z[i]) ){ i++; } | 
|  | *tokenType = TK_FLOAT; | 
|  | }else if( z[0]=='.' ){ | 
|  | *tokenType = TK_FLOAT; | 
|  | } | 
|  | return i; | 
|  | } | 
|  | case '[': { | 
|  | for(i=1; z[i] && z[i-1]!=']'; i++){} | 
|  | *tokenType = TK_ID; | 
|  | return i; | 
|  | } | 
|  | default: { | 
|  | if( !isIdChar[*z] ){ | 
|  | break; | 
|  | } | 
|  | for(i=1; isIdChar[z[i]]; i++){} | 
|  | *tokenType = sqliteKeywordCode(z, i); | 
|  | return i; | 
|  | } | 
|  | } | 
|  | *tokenType = TK_ILLEGAL; | 
|  | return 1; | 
|  | } |