LCOV - code coverage report
Current view: top level - sql/server - sql_scan.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1046 1111 94.1 %
Date: 2024-10-07 21:21:43 Functions: 26 26 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include <wctype.h>
      15             : #include "sql_mem.h"
      16             : #include "sql_scan.h"
      17             : #include "sql_types.h"
      18             : #include "sql_symbol.h"
      19             : #include "sql_mvc.h"
      20             : #include "sql_parser.tab.h"
      21             : #include "sql_semantic.h"
      22             : #include "sql_parser.h"               /* for sql_error() */
      23             : 
      24             : #include "stream.h"
      25             : #include "mapi_prompt.h"
      26             : #include <unistd.h>
      27             : #include <string.h>
      28             : #include <ctype.h>
      29             : #include "sql_keyword.h"
      30             : 
      31             : /**
      32             :  * Removes all comments before the query. In query comments are kept.
      33             :  */
      34             : char *
      35      399654 : query_cleaned(allocator *sa, const char *query)
      36             : {
      37      399654 :         char *q, *r, *c = NULL;
      38      399654 :         int lines = 0;
      39      399654 :         int quote = 0;          /* inside quotes ('..', "..", {..}) */
      40      399654 :         bool bs = false;                /* seen a backslash in a quoted string */
      41      399654 :         bool incomment1 = false;        /* inside traditional C style comment */
      42      399654 :         bool incomment2 = false;        /* inside comment starting with --  */
      43      399654 :         bool inline_comment = false;
      44             : 
      45      399654 :         r = SA_NEW_ARRAY(sa, char, strlen(query) + 1);
      46      399852 :         if(!r)
      47             :                 return NULL;
      48             : 
      49    67581544 :         for (q = r; *query; query++) {
      50    67181692 :                 if (incomment1) {
      51       16046 :                         if (*query == '/' && query[-1] == '*') {
      52         232 :                                 incomment1 = false;
      53         232 :                                 if (c == r && lines > 0) {
      54         224 :                                         q = r; // reset to beginning
      55         224 :                                         lines = 0;
      56         224 :                                         continue;
      57             :                                 }
      58             :                         }
      59       15822 :                         if (*query == '\n') lines++;
      60       15822 :                         *q++ = *query;
      61    67165646 :                 } else if (incomment2) {
      62      827681 :                         if (*query == '\n') {
      63        2834 :                                 incomment2 = false;
      64        2834 :                                 inline_comment = false;
      65             :                                 /* add newline only if comment doesn't
      66             :                                  * occupy whole line */
      67        2834 :                                 if (q > r && q[-1] != '\n'){
      68         939 :                                         *q++ = '\n';
      69         939 :                                         lines++;
      70             :                                 }
      71      824847 :                         } else if (inline_comment){
      72       20650 :                                 *q++ = *query; // preserve in line query comments
      73             :                         }
      74    66337965 :                 } else if (quote) {
      75    22256606 :                         if (bs) {
      76             :                                 bs = false;
      77    22253326 :                         } else if (*query == '\\') {
      78             :                                 bs = true;
      79    22250037 :                         } else if (*query == quote) {
      80      671304 :                                 quote = 0;
      81             :                         }
      82    22256606 :                         *q++ = *query;
      83    44081359 :                 } else if (*query == '"' || *query == '\'') {
      84      670841 :                         quote = *query;
      85      670841 :                         *q++ = *query;
      86    43410518 :                 } else if (*query == '{') {
      87         508 :                         quote = '}';
      88         508 :                         *q++ = *query;
      89    43410010 :                 } else if (*query == '-' && query[1] == '-') {
      90        2834 :                         if (q > r && q[-1] != '\n') {
      91         939 :                                 inline_comment = true;
      92         939 :                                 *q++ = *query; // preserve in line query comments
      93             :                         }
      94             :                         incomment2 = true;
      95    43407176 :                 } else if (*query == '/' && query[1] == '*') {
      96         232 :                         incomment1 = true;
      97         232 :                         c = q;
      98         232 :                         *q++ = *query;
      99    43406944 :                 } else if (*query == '\n') {
     100             :                         /* collapse newlines */
     101      860705 :                         if (q > r && q[-1] != '\n') {
     102      818874 :                                 *q++ = '\n';
     103      818874 :                                 lines++;
     104             :                         }
     105    42546239 :                 } else if (*query == ' ' || *query == '\t') {
     106             :                         /* collapse white space */
     107     7010946 :                         if (q > r && q[-1] != ' ')
     108     5531839 :                                 *q++ = ' ';
     109             :                 } else {
     110    35535293 :                         *q++ = *query;
     111             :                 }
     112             :         }
     113      399852 :         *q = 0;
     114      399852 :         return r;
     115             : }
     116             : 
     117             : int
     118         330 : scanner_init_keywords(void)
     119             : {
     120         330 :         int failed = 0;
     121             : 
     122         330 :         failed += keywords_insert("false", BOOL_FALSE);
     123         330 :         failed += keywords_insert("true", BOOL_TRUE);
     124         330 :         failed += keywords_insert("bool", sqlBOOL);
     125             : 
     126         330 :         failed += keywords_insert("ALTER", ALTER);
     127         330 :         failed += keywords_insert("ADD", ADD);
     128         330 :         failed += keywords_insert("AND", AND);
     129             : 
     130         330 :         failed += keywords_insert("RANK", RANK);
     131         330 :         failed += keywords_insert("DENSE_RANK", RANK);
     132         330 :         failed += keywords_insert("PERCENT_RANK", RANK);
     133         330 :         failed += keywords_insert("CUME_DIST", RANK);
     134         330 :         failed += keywords_insert("ROW_NUMBER", RANK);
     135         330 :         failed += keywords_insert("NTILE", RANK);
     136         330 :         failed += keywords_insert("LAG", RANK);
     137         330 :         failed += keywords_insert("LEAD", RANK);
     138         330 :         failed += keywords_insert("FETCH", FETCH);
     139         330 :         failed += keywords_insert("FIRST_VALUE", RANK);
     140         330 :         failed += keywords_insert("LAST_VALUE", RANK);
     141         330 :         failed += keywords_insert("NTH_VALUE", RANK);
     142             : 
     143         330 :         failed += keywords_insert("BEST", BEST);
     144         330 :         failed += keywords_insert("EFFORT", EFFORT);
     145             : 
     146         330 :         failed += keywords_insert("AS", AS);
     147         330 :         failed += keywords_insert("ASC", ASC);
     148         330 :         failed += keywords_insert("AUTHORIZATION", AUTHORIZATION);
     149         330 :         failed += keywords_insert("BETWEEN", BETWEEN);
     150         330 :         failed += keywords_insert("SYMMETRIC", SYMMETRIC);
     151         330 :         failed += keywords_insert("ASYMMETRIC", ASYMMETRIC);
     152         330 :         failed += keywords_insert("BY", BY);
     153         330 :         failed += keywords_insert("CAST", CAST);
     154         330 :         failed += keywords_insert("CONVERT", CONVERT);
     155         330 :         failed += keywords_insert("CHARACTER", CHARACTER);
     156         330 :         failed += keywords_insert("CHAR", CHARACTER);
     157         330 :         failed += keywords_insert("VARYING", VARYING);
     158         330 :         failed += keywords_insert("VARCHAR", VARCHAR);
     159         330 :         failed += keywords_insert("BINARY", BINARY);
     160         330 :         failed += keywords_insert("LARGE", LARGE);
     161         330 :         failed += keywords_insert("OBJECT", OBJECT);
     162         330 :         failed += keywords_insert("CLOB", CLOB);
     163         330 :         failed += keywords_insert("BLOB", sqlBLOB);
     164         330 :         failed += keywords_insert("TEXT", sqlTEXT);
     165         330 :         failed += keywords_insert("TINYTEXT", sqlTEXT);
     166         330 :         failed += keywords_insert("STRING", CLOB);    /* ? */
     167         330 :         failed += keywords_insert("CHECK", CHECK);
     168         330 :         failed += keywords_insert("CLIENT", CLIENT);
     169         330 :         failed += keywords_insert("SERVER", SERVER);
     170         330 :         failed += keywords_insert("COMMENT", COMMENT);
     171         330 :         failed += keywords_insert("CONSTRAINT", CONSTRAINT);
     172         330 :         failed += keywords_insert("CREATE", CREATE);
     173         330 :         failed += keywords_insert("CROSS", CROSS);
     174         330 :         failed += keywords_insert("COPY", COPY);
     175         330 :         failed += keywords_insert("RECORDS", RECORDS);
     176         330 :         failed += keywords_insert("DELIMITERS", DELIMITERS);
     177         330 :         failed += keywords_insert("STDIN", STDIN);
     178         330 :         failed += keywords_insert("STDOUT", STDOUT);
     179             : 
     180         330 :         failed += keywords_insert("TINYINT", TINYINT);
     181         330 :         failed += keywords_insert("SMALLINT", SMALLINT);
     182         330 :         failed += keywords_insert("INTEGER", sqlINTEGER);
     183         330 :         failed += keywords_insert("INT", sqlINTEGER);
     184         330 :         failed += keywords_insert("MEDIUMINT", sqlINTEGER);
     185         330 :         failed += keywords_insert("BIGINT", BIGINT);
     186             : #ifdef HAVE_HGE
     187         330 :         failed += keywords_insert("HUGEINT", HUGEINT);
     188             : #endif
     189         330 :         failed += keywords_insert("DEC", sqlDECIMAL);
     190         330 :         failed += keywords_insert("DECIMAL", sqlDECIMAL);
     191         330 :         failed += keywords_insert("NUMERIC", sqlDECIMAL);
     192         330 :         failed += keywords_insert("DECLARE", DECLARE);
     193         330 :         failed += keywords_insert("DEFAULT", DEFAULT);
     194         330 :         failed += keywords_insert("DESC", DESC);
     195         330 :         failed += keywords_insert("DISTINCT", DISTINCT);
     196         330 :         failed += keywords_insert("DOUBLE", sqlDOUBLE);
     197         330 :         failed += keywords_insert("REAL", sqlREAL);
     198         330 :         failed += keywords_insert("DROP", DROP);
     199         330 :         failed += keywords_insert("ESCAPE", ESCAPE);
     200         330 :         failed += keywords_insert("EXISTS", EXISTS);
     201         330 :         failed += keywords_insert("UESCAPE", UESCAPE);
     202         330 :         failed += keywords_insert("EXTRACT", EXTRACT);
     203         330 :         failed += keywords_insert("FLOAT", sqlFLOAT);
     204         330 :         failed += keywords_insert("FOR", FOR);
     205         330 :         failed += keywords_insert("FOREIGN", FOREIGN);
     206         330 :         failed += keywords_insert("FROM", FROM);
     207         330 :         failed += keywords_insert("FWF", FWF);
     208             : 
     209         330 :         failed += keywords_insert("BIG", BIG);
     210         330 :         failed += keywords_insert("LITTLE", LITTLE);
     211         330 :         failed += keywords_insert("NATIVE", NATIVE);
     212         330 :         failed += keywords_insert("ENDIAN", ENDIAN);
     213             : 
     214         330 :         failed += keywords_insert("REFERENCES", REFERENCES);
     215             : 
     216         330 :         failed += keywords_insert("MATCH", MATCH);
     217         330 :         failed += keywords_insert("FULL", FULL);
     218         330 :         failed += keywords_insert("PARTIAL", PARTIAL);
     219         330 :         failed += keywords_insert("SIMPLE", SIMPLE);
     220             : 
     221         330 :         failed += keywords_insert("INSERT", INSERT);
     222         330 :         failed += keywords_insert("UPDATE", UPDATE);
     223         330 :         failed += keywords_insert("DELETE", sqlDELETE);
     224         330 :         failed += keywords_insert("TRUNCATE", TRUNCATE);
     225         330 :         failed += keywords_insert("MATCHED", MATCHED);
     226             : 
     227         330 :         failed += keywords_insert("ACTION", ACTION);
     228         330 :         failed += keywords_insert("CASCADE", CASCADE);
     229         330 :         failed += keywords_insert("RESTRICT", RESTRICT);
     230         330 :         failed += keywords_insert("FIRST", FIRST);
     231         330 :         failed += keywords_insert("GLOBAL", GLOBAL);
     232         330 :         failed += keywords_insert("GROUP", sqlGROUP);
     233         330 :         failed += keywords_insert("GROUPING", GROUPING);
     234         330 :         failed += keywords_insert("ROLLUP", ROLLUP);
     235         330 :         failed += keywords_insert("CUBE", CUBE);
     236         330 :         failed += keywords_insert("HAVING", HAVING);
     237         330 :         failed += keywords_insert("ILIKE", ILIKE);
     238         330 :         failed += keywords_insert("IMPRINTS", IMPRINTS);
     239         330 :         failed += keywords_insert("IN", sqlIN);
     240         330 :         failed += keywords_insert("INNER", INNER);
     241         330 :         failed += keywords_insert("INTO", INTO);
     242         330 :         failed += keywords_insert("IS", IS);
     243         330 :         failed += keywords_insert("JOIN", JOIN);
     244         330 :         failed += keywords_insert("KEY", KEY);
     245         330 :         failed += keywords_insert("LATERAL", LATERAL);
     246         330 :         failed += keywords_insert("LEFT", LEFT);
     247         330 :         failed += keywords_insert("LIKE", LIKE);
     248         330 :         failed += keywords_insert("LIMIT", LIMIT);
     249         330 :         failed += keywords_insert("SAMPLE", SAMPLE);
     250         330 :         failed += keywords_insert("SEED", SEED);
     251         330 :         failed += keywords_insert("LAST", LAST);
     252         330 :         failed += keywords_insert("LOCAL", LOCAL);
     253         330 :         failed += keywords_insert("NATURAL", NATURAL);
     254         330 :         failed += keywords_insert("NOT", NOT);
     255         330 :         failed += keywords_insert("NULL", sqlNULL);
     256         330 :         failed += keywords_insert("NULLS", NULLS);
     257         330 :         failed += keywords_insert("OFFSET", OFFSET);
     258         330 :         failed += keywords_insert("ON", ON);
     259         330 :         failed += keywords_insert("OPTIONS", OPTIONS);
     260         330 :         failed += keywords_insert("OPTION", OPTION);
     261         330 :         failed += keywords_insert("OR", OR);
     262         330 :         failed += keywords_insert("ORDER", ORDER);
     263         330 :         failed += keywords_insert("ORDERED", ORDERED);
     264         330 :         failed += keywords_insert("OUTER", OUTER);
     265         330 :         failed += keywords_insert("OVER", OVER);
     266         330 :         failed += keywords_insert("PARTITION", PARTITION);
     267         330 :         failed += keywords_insert("PATH", PATH);
     268         330 :         failed += keywords_insert("PRECISION", PRECISION);
     269         330 :         failed += keywords_insert("PRIMARY", PRIMARY);
     270             : 
     271         330 :         failed += keywords_insert("USER", USER);
     272         330 :         failed += keywords_insert("RENAME", RENAME);
     273         330 :         failed += keywords_insert("UNENCRYPTED", UNENCRYPTED);
     274         330 :         failed += keywords_insert("ENCRYPTED", ENCRYPTED);
     275         330 :         failed += keywords_insert("PASSWORD", PASSWORD);
     276         330 :         failed += keywords_insert("GRANT", GRANT);
     277         330 :         failed += keywords_insert("REVOKE", REVOKE);
     278         330 :         failed += keywords_insert("ROLE", ROLE);
     279         330 :         failed += keywords_insert("ADMIN", ADMIN);
     280         330 :         failed += keywords_insert("PRIVILEGES", PRIVILEGES);
     281         330 :         failed += keywords_insert("PUBLIC", PUBLIC);
     282         330 :         failed += keywords_insert("CURRENT_USER", CURRENT_USER);
     283         330 :         failed += keywords_insert("CURRENT_ROLE", CURRENT_ROLE);
     284         330 :         failed += keywords_insert("SESSION_USER", SESSION_USER);
     285         330 :         failed += keywords_insert("CURRENT_SCHEMA", CURRENT_SCHEMA);
     286         330 :         failed += keywords_insert("SESSION", sqlSESSION);
     287         330 :         failed += keywords_insert("MAX_MEMORY", MAX_MEMORY);
     288         330 :         failed += keywords_insert("MAX_WORKERS", MAX_WORKERS);
     289         330 :         failed += keywords_insert("OPTIMIZER", OPTIMIZER);
     290             : 
     291         330 :         failed += keywords_insert("RIGHT", RIGHT);
     292         330 :         failed += keywords_insert("SCHEMA", SCHEMA);
     293         330 :         failed += keywords_insert("SELECT", SELECT);
     294         330 :         failed += keywords_insert("SET", SET);
     295         330 :         failed += keywords_insert("SETS", SETS);
     296         330 :         failed += keywords_insert("AUTO_COMMIT", AUTO_COMMIT);
     297             : 
     298         330 :         failed += keywords_insert("ALL", ALL);
     299         330 :         failed += keywords_insert("ANY", ANY);
     300         330 :         failed += keywords_insert("SOME", SOME);
     301         330 :         failed += keywords_insert("EVERY", ANY);
     302             :         /*
     303             :            failed += keywords_insert("SQLCODE", SQLCODE );
     304             :          */
     305         330 :         failed += keywords_insert("COLUMN", COLUMN);
     306         330 :         failed += keywords_insert("TABLE", TABLE);
     307         330 :         failed += keywords_insert("TEMPORARY", TEMPORARY);
     308         330 :         failed += keywords_insert("TEMP", TEMP);
     309         330 :         failed += keywords_insert("REMOTE", REMOTE);
     310         330 :         failed += keywords_insert("MERGE", MERGE);
     311         330 :         failed += keywords_insert("REPLICA", REPLICA);
     312         330 :         failed += keywords_insert("UNLOGGED", UNLOGGED);
     313         330 :         failed += keywords_insert("TO", TO);
     314         330 :         failed += keywords_insert("UNION", UNION);
     315         330 :         failed += keywords_insert("EXCEPT", EXCEPT);
     316         330 :         failed += keywords_insert("INTERSECT", INTERSECT);
     317         330 :         failed += keywords_insert("CORRESPONDING", CORRESPONDING);
     318         330 :         failed += keywords_insert("UNIQUE", UNIQUE);
     319         330 :         failed += keywords_insert("USING", USING);
     320         330 :         failed += keywords_insert("VALUES", VALUES);
     321         330 :         failed += keywords_insert("VIEW", VIEW);
     322         330 :         failed += keywords_insert("WHERE", WHERE);
     323         330 :         failed += keywords_insert("WITH", WITH);
     324         330 :         failed += keywords_insert("WITHOUT", WITHOUT);
     325         330 :         failed += keywords_insert("DATA", DATA);
     326             : 
     327         330 :         failed += keywords_insert("DATE", sqlDATE);
     328         330 :         failed += keywords_insert("TIME", TIME);
     329         330 :         failed += keywords_insert("TIMESTAMP", TIMESTAMP);
     330         330 :         failed += keywords_insert("INTERVAL", INTERVAL);
     331         330 :         failed += keywords_insert("CURRENT_DATE", CURRENT_DATE);
     332         330 :         failed += keywords_insert("CURRENT_TIME", CURRENT_TIME);
     333         330 :         failed += keywords_insert("CURRENT_TIMESTAMP", CURRENT_TIMESTAMP);
     334         330 :         failed += keywords_insert("CURRENT_TIMEZONE", CURRENT_TIMEZONE);
     335         330 :         failed += keywords_insert("NOW", CURRENT_TIMESTAMP);
     336         330 :         failed += keywords_insert("LOCALTIME", LOCALTIME);
     337         330 :         failed += keywords_insert("LOCALTIMESTAMP", LOCALTIMESTAMP);
     338         330 :         failed += keywords_insert("ZONE", ZONE);
     339             : 
     340         330 :         failed += keywords_insert("CENTURY", CENTURY);
     341         330 :         failed += keywords_insert("DECADE", DECADE);
     342         330 :         failed += keywords_insert("YEAR", YEAR);
     343         330 :         failed += keywords_insert("QUARTER", QUARTER);
     344         330 :         failed += keywords_insert("MONTH", MONTH);
     345         330 :         failed += keywords_insert("WEEK", WEEK);
     346         330 :         failed += keywords_insert("DOW", DOW);
     347         330 :         failed += keywords_insert("DOY", DOY);
     348         330 :         failed += keywords_insert("DAY", DAY);
     349         330 :         failed += keywords_insert("HOUR", HOUR);
     350         330 :         failed += keywords_insert("MINUTE", MINUTE);
     351         330 :         failed += keywords_insert("SECOND", SECOND);
     352         330 :         failed += keywords_insert("EPOCH", EPOCH);
     353             : 
     354         330 :         failed += keywords_insert("POSITION", POSITION);
     355         330 :         failed += keywords_insert("SUBSTRING", SUBSTRING);
     356         330 :         failed += keywords_insert("SPLIT_PART", SPLIT_PART);
     357         330 :         failed += keywords_insert("TRIM", TRIM);
     358         330 :         failed += keywords_insert("LEADING", LEADING);
     359         330 :         failed += keywords_insert("TRAILING", TRAILING);
     360         330 :         failed += keywords_insert("BOTH", BOTH);
     361             : 
     362         330 :         failed += keywords_insert("CASE", CASE);
     363         330 :         failed += keywords_insert("WHEN", WHEN);
     364         330 :         failed += keywords_insert("THEN", THEN);
     365         330 :         failed += keywords_insert("ELSE", ELSE);
     366         330 :         failed += keywords_insert("END", END);
     367         330 :         failed += keywords_insert("NULLIF", NULLIF);
     368         330 :         failed += keywords_insert("COALESCE", COALESCE);
     369         330 :         failed += keywords_insert("ELSEIF", ELSEIF);
     370         330 :         failed += keywords_insert("IF", IF);
     371         330 :         failed += keywords_insert("WHILE", WHILE);
     372         330 :         failed += keywords_insert("DO", DO);
     373             : 
     374         330 :         failed += keywords_insert("COMMIT", COMMIT);
     375         330 :         failed += keywords_insert("ROLLBACK", ROLLBACK);
     376         330 :         failed += keywords_insert("SAVEPOINT", SAVEPOINT);
     377         330 :         failed += keywords_insert("RELEASE", RELEASE);
     378         330 :         failed += keywords_insert("WORK", WORK);
     379         330 :         failed += keywords_insert("CHAIN", CHAIN);
     380         330 :         failed += keywords_insert("PRESERVE", PRESERVE);
     381         330 :         failed += keywords_insert("ROWS", ROWS);
     382         330 :         failed += keywords_insert("NO", NO);
     383         330 :         failed += keywords_insert("START", START);
     384         330 :         failed += keywords_insert("TRANSACTION", TRANSACTION);
     385         330 :         failed += keywords_insert("READ", READ);
     386         330 :         failed += keywords_insert("WRITE", WRITE);
     387         330 :         failed += keywords_insert("ONLY", ONLY);
     388         330 :         failed += keywords_insert("ISOLATION", ISOLATION);
     389         330 :         failed += keywords_insert("LEVEL", LEVEL);
     390         330 :         failed += keywords_insert("UNCOMMITTED", UNCOMMITTED);
     391         330 :         failed += keywords_insert("COMMITTED", COMMITTED);
     392         330 :         failed += keywords_insert("REPEATABLE", sqlREPEATABLE);
     393         330 :         failed += keywords_insert("SNAPSHOT", SNAPSHOT);
     394         330 :         failed += keywords_insert("SERIALIZABLE", SERIALIZABLE);
     395         330 :         failed += keywords_insert("DIAGNOSTICS", DIAGNOSTICS);
     396         330 :         failed += keywords_insert("SIZE", sqlSIZE);
     397         330 :         failed += keywords_insert("STORAGE", STORAGE);
     398             : 
     399         330 :         failed += keywords_insert("TYPE", TYPE);
     400         330 :         failed += keywords_insert("PROCEDURE", PROCEDURE);
     401         330 :         failed += keywords_insert("FUNCTION", FUNCTION);
     402         330 :         failed += keywords_insert("LOADER", sqlLOADER);
     403         330 :         failed += keywords_insert("REPLACE", REPLACE);
     404             : 
     405         330 :         failed += keywords_insert("FIELD", FIELD);
     406         330 :         failed += keywords_insert("FILTER", FILTER);
     407         330 :         failed += keywords_insert("AGGREGATE", AGGREGATE);
     408         330 :         failed += keywords_insert("RETURNS", RETURNS);
     409         330 :         failed += keywords_insert("EXTERNAL", EXTERNAL);
     410         330 :         failed += keywords_insert("NAME", sqlNAME);
     411         330 :         failed += keywords_insert("RETURN", RETURN);
     412         330 :         failed += keywords_insert("CALL", CALL);
     413         330 :         failed += keywords_insert("LANGUAGE", LANGUAGE);
     414             : 
     415         330 :         failed += keywords_insert("ANALYZE", ANALYZE);
     416         330 :         failed += keywords_insert("EXPLAIN", SQL_EXPLAIN);
     417         330 :         failed += keywords_insert("PLAN", SQL_PLAN);
     418         330 :         failed += keywords_insert("TRACE", SQL_TRACE);
     419         330 :         failed += keywords_insert("PREPARE", PREPARE);
     420         330 :         failed += keywords_insert("PREP", PREP);
     421         330 :         failed += keywords_insert("EXECUTE", EXECUTE);
     422         330 :         failed += keywords_insert("EXEC", EXEC);
     423         330 :         failed += keywords_insert("DEALLOCATE", DEALLOCATE);
     424             : 
     425         330 :         failed += keywords_insert("INDEX", INDEX);
     426             : 
     427         330 :         failed += keywords_insert("SEQUENCE", SEQUENCE);
     428         330 :         failed += keywords_insert("RESTART", RESTART);
     429         330 :         failed += keywords_insert("INCREMENT", INCREMENT);
     430         330 :         failed += keywords_insert("MAXVALUE", MAXVALUE);
     431         330 :         failed += keywords_insert("MINVALUE", MINVALUE);
     432         330 :         failed += keywords_insert("CYCLE", CYCLE);
     433         330 :         failed += keywords_insert("CACHE", CACHE);
     434         330 :         failed += keywords_insert("NEXT", NEXT);
     435         330 :         failed += keywords_insert("VALUE", VALUE);
     436         330 :         failed += keywords_insert("GENERATED", GENERATED);
     437         330 :         failed += keywords_insert("ALWAYS", ALWAYS);
     438         330 :         failed += keywords_insert("IDENTITY", IDENTITY);
     439         330 :         failed += keywords_insert("SERIAL", SERIAL);
     440         330 :         failed += keywords_insert("BIGSERIAL", BIGSERIAL);
     441         330 :         failed += keywords_insert("AUTO_INCREMENT", AUTO_INCREMENT);
     442         330 :         failed += keywords_insert("CONTINUE", CONTINUE);
     443             : 
     444         330 :         failed += keywords_insert("TRIGGER", TRIGGER);
     445         330 :         failed += keywords_insert("ATOMIC", ATOMIC);
     446         330 :         failed += keywords_insert("BEGIN", BEGIN);
     447         330 :         failed += keywords_insert("OF", OF);
     448         330 :         failed += keywords_insert("BEFORE", BEFORE);
     449         330 :         failed += keywords_insert("AFTER", AFTER);
     450         330 :         failed += keywords_insert("ROW", ROW);
     451         330 :         failed += keywords_insert("STATEMENT", STATEMENT);
     452         330 :         failed += keywords_insert("NEW", sqlNEW);
     453         330 :         failed += keywords_insert("OLD", OLD);
     454         330 :         failed += keywords_insert("EACH", EACH);
     455         330 :         failed += keywords_insert("REFERENCING", REFERENCING);
     456             : 
     457         330 :         failed += keywords_insert("RANGE", RANGE);
     458         330 :         failed += keywords_insert("UNBOUNDED", UNBOUNDED);
     459         330 :         failed += keywords_insert("PRECEDING", PRECEDING);
     460         330 :         failed += keywords_insert("FOLLOWING", FOLLOWING);
     461         330 :         failed += keywords_insert("CURRENT", CURRENT);
     462         330 :         failed += keywords_insert("EXCLUDE", EXCLUDE);
     463         330 :         failed += keywords_insert("OTHERS", OTHERS);
     464         330 :         failed += keywords_insert("TIES", TIES);
     465         330 :         failed += keywords_insert("GROUPS", GROUPS);
     466         330 :         failed += keywords_insert("WINDOW", WINDOW);
     467             : 
     468             :         /* special SQL/XML keywords */
     469         330 :         failed += keywords_insert("XMLCOMMENT", XMLCOMMENT);
     470         330 :         failed += keywords_insert("XMLCONCAT", XMLCONCAT);
     471         330 :         failed += keywords_insert("XMLDOCUMENT", XMLDOCUMENT);
     472         330 :         failed += keywords_insert("XMLELEMENT", XMLELEMENT);
     473         330 :         failed += keywords_insert("XMLATTRIBUTES", XMLATTRIBUTES);
     474         330 :         failed += keywords_insert("XMLFOREST", XMLFOREST);
     475         330 :         failed += keywords_insert("XMLPARSE", XMLPARSE);
     476         330 :         failed += keywords_insert("STRIP", STRIP);
     477         330 :         failed += keywords_insert("WHITESPACE", WHITESPACE);
     478         330 :         failed += keywords_insert("XMLPI", XMLPI);
     479         330 :         failed += keywords_insert("XMLQUERY", XMLQUERY);
     480         330 :         failed += keywords_insert("PASSING", PASSING);
     481         330 :         failed += keywords_insert("XMLTEXT", XMLTEXT);
     482         330 :         failed += keywords_insert("NIL", NIL);
     483         330 :         failed += keywords_insert("REF", REF);
     484         330 :         failed += keywords_insert("ABSENT", ABSENT);
     485         330 :         failed += keywords_insert("DOCUMENT", DOCUMENT);
     486         330 :         failed += keywords_insert("ELEMENT", ELEMENT);
     487         330 :         failed += keywords_insert("CONTENT", CONTENT);
     488         330 :         failed += keywords_insert("XMLNAMESPACES", XMLNAMESPACES);
     489         330 :         failed += keywords_insert("NAMESPACE", NAMESPACE);
     490         330 :         failed += keywords_insert("XMLVALIDATE", XMLVALIDATE);
     491         330 :         failed += keywords_insert("RETURNING", RETURNING);
     492         330 :         failed += keywords_insert("LOCATION", LOCATION);
     493         330 :         failed += keywords_insert("ID", ID);
     494         330 :         failed += keywords_insert("ACCORDING", ACCORDING);
     495         330 :         failed += keywords_insert("XMLSCHEMA", XMLSCHEMA);
     496         330 :         failed += keywords_insert("URI", URI);
     497         330 :         failed += keywords_insert("XMLAGG", XMLAGG);
     498             : 
     499             :         /* keywords for opengis */
     500         330 :         failed += keywords_insert("GEOMETRY", GEOMETRY);
     501             : 
     502         330 :         failed += keywords_insert("POINT", GEOMETRYSUBTYPE);
     503         330 :         failed += keywords_insert("LINESTRING", GEOMETRYSUBTYPE);
     504         330 :         failed += keywords_insert("POLYGON", GEOMETRYSUBTYPE);
     505         330 :         failed += keywords_insert("MULTIPOINT", GEOMETRYSUBTYPE);
     506         330 :         failed += keywords_insert("MULTILINESTRING", GEOMETRYSUBTYPE);
     507         330 :         failed += keywords_insert("MULTIPOLYGON", GEOMETRYSUBTYPE);
     508         330 :         failed += keywords_insert("GEOMETRYCOLLECTION", GEOMETRYSUBTYPE);
     509             : 
     510         330 :         failed += keywords_insert("POINTZ", GEOMETRYSUBTYPE);
     511         330 :         failed += keywords_insert("LINESTRINGZ", GEOMETRYSUBTYPE);
     512         330 :         failed += keywords_insert("POLYGONZ", GEOMETRYSUBTYPE);
     513         330 :         failed += keywords_insert("MULTIPOINTZ", GEOMETRYSUBTYPE);
     514         330 :         failed += keywords_insert("MULTILINESTRINGZ", GEOMETRYSUBTYPE);
     515         330 :         failed += keywords_insert("MULTIPOLYGONZ", GEOMETRYSUBTYPE);
     516         330 :         failed += keywords_insert("GEOMETRYCOLLECTIONZ", GEOMETRYSUBTYPE);
     517             : 
     518         330 :         failed += keywords_insert("POINTM", GEOMETRYSUBTYPE);
     519         330 :         failed += keywords_insert("LINESTRINGM", GEOMETRYSUBTYPE);
     520         330 :         failed += keywords_insert("POLYGONM", GEOMETRYSUBTYPE);
     521         330 :         failed += keywords_insert("MULTIPOINTM", GEOMETRYSUBTYPE);
     522         330 :         failed += keywords_insert("MULTILINESTRINGM", GEOMETRYSUBTYPE);
     523         330 :         failed += keywords_insert("MULTIPOLYGONM", GEOMETRYSUBTYPE);
     524         330 :         failed += keywords_insert("GEOMETRYCOLLECTIONM", GEOMETRYSUBTYPE);
     525             : 
     526         330 :         failed += keywords_insert("POINTZM", GEOMETRYSUBTYPE);
     527         330 :         failed += keywords_insert("LINESTRINGZM", GEOMETRYSUBTYPE);
     528         330 :         failed += keywords_insert("POLYGONZM", GEOMETRYSUBTYPE);
     529         330 :         failed += keywords_insert("MULTIPOINTZM", GEOMETRYSUBTYPE);
     530         330 :         failed += keywords_insert("MULTILINESTRINGZM", GEOMETRYSUBTYPE);
     531         330 :         failed += keywords_insert("MULTIPOLYGONZM", GEOMETRYSUBTYPE);
     532         330 :         failed += keywords_insert("GEOMETRYCOLLECTIONZM", GEOMETRYSUBTYPE);
     533         330 :         failed += keywords_insert("LOGIN", LOGIN);
     534             :         // odbc keywords
     535         330 :         failed += keywords_insert("d", ODBC_DATE_ESCAPE_PREFIX);
     536         330 :         failed += keywords_insert("t", ODBC_TIME_ESCAPE_PREFIX);
     537         330 :         failed += keywords_insert("ts", ODBC_TIMESTAMP_ESCAPE_PREFIX);
     538         330 :         failed += keywords_insert("guid", ODBC_GUID_ESCAPE_PREFIX);
     539         330 :         failed += keywords_insert("fn", ODBC_FUNC_ESCAPE_PREFIX);
     540         330 :         failed += keywords_insert("oj", ODBC_OJ_ESCAPE_PREFIX);
     541         330 :         failed += keywords_insert("DAYNAME", DAYNAME);
     542         330 :         failed += keywords_insert("IFNULL", IFNULL);
     543         330 :         failed += keywords_insert("MONTHNAME", MONTHNAME);
     544         330 :         failed += keywords_insert("TIMESTAMPADD", TIMESTAMPADD);
     545         330 :         failed += keywords_insert("TIMESTAMPDIFF", TIMESTAMPDIFF);
     546         330 :         failed += keywords_insert("SQL_BIGINT", SQL_BIGINT);
     547         330 :         failed += keywords_insert("SQL_BINARY", SQL_BINARY);
     548         330 :         failed += keywords_insert("SQL_BIT", SQL_BIT);
     549         330 :         failed += keywords_insert("SQL_CHAR", SQL_CHAR);
     550         330 :         failed += keywords_insert("SQL_DATE", SQL_DATE);
     551         330 :         failed += keywords_insert("SQL_DECIMAL", SQL_DECIMAL);
     552         330 :         failed += keywords_insert("SQL_DOUBLE", SQL_DOUBLE);
     553         330 :         failed += keywords_insert("SQL_FLOAT", SQL_FLOAT);
     554         330 :         failed += keywords_insert("SQL_GUID", SQL_GUID);
     555         330 :         failed += keywords_insert("SQL_HUGEINT", SQL_HUGEINT);
     556         330 :         failed += keywords_insert("SQL_INTEGER", SQL_INTEGER);
     557         330 :         failed += keywords_insert("SQL_INTERVAL_DAY", SQL_INTERVAL_DAY);
     558         330 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_HOUR", SQL_INTERVAL_DAY_TO_HOUR);
     559         330 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_MINUTE", SQL_INTERVAL_DAY_TO_MINUTE);
     560         330 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_SECOND", SQL_INTERVAL_DAY_TO_SECOND);
     561         330 :         failed += keywords_insert("SQL_INTERVAL_HOUR", SQL_INTERVAL_HOUR);
     562         330 :         failed += keywords_insert("SQL_INTERVAL_HOUR_TO_MINUTE", SQL_INTERVAL_HOUR_TO_MINUTE);
     563         330 :         failed += keywords_insert("SQL_INTERVAL_HOUR_TO_SECOND", SQL_INTERVAL_HOUR_TO_SECOND);
     564         330 :         failed += keywords_insert("SQL_INTERVAL_MINUTE", SQL_INTERVAL_MINUTE);
     565         330 :         failed += keywords_insert("SQL_INTERVAL_MINUTE_TO_SECOND", SQL_INTERVAL_MINUTE_TO_SECOND);
     566         330 :         failed += keywords_insert("SQL_INTERVAL_MONTH", SQL_INTERVAL_MONTH);
     567         330 :         failed += keywords_insert("SQL_INTERVAL_SECOND", SQL_INTERVAL_SECOND);
     568         330 :         failed += keywords_insert("SQL_INTERVAL_YEAR", SQL_INTERVAL_YEAR);
     569         330 :         failed += keywords_insert("SQL_INTERVAL_YEAR_TO_MONTH", SQL_INTERVAL_YEAR_TO_MONTH);
     570         330 :         failed += keywords_insert("SQL_LONGVARBINARY", SQL_LONGVARBINARY);
     571         330 :         failed += keywords_insert("SQL_LONGVARCHAR", SQL_LONGVARCHAR);
     572         330 :         failed += keywords_insert("SQL_NUMERIC", SQL_NUMERIC);
     573         330 :         failed += keywords_insert("SQL_REAL", SQL_REAL);
     574         330 :         failed += keywords_insert("SQL_SMALLINT", SQL_SMALLINT);
     575         330 :         failed += keywords_insert("SQL_TIME", SQL_TIME);
     576         330 :         failed += keywords_insert("SQL_TIMESTAMP", SQL_TIMESTAMP);
     577         330 :         failed += keywords_insert("SQL_TINYINT", SQL_TINYINT);
     578         330 :         failed += keywords_insert("SQL_VARBINARY", SQL_VARBINARY);
     579         330 :         failed += keywords_insert("SQL_VARCHAR", SQL_VARCHAR);
     580         330 :         failed += keywords_insert("SQL_WCHAR", SQL_WCHAR);
     581         330 :         failed += keywords_insert("SQL_WLONGVARCHAR", SQL_WLONGVARCHAR);
     582         330 :         failed += keywords_insert("SQL_WVARCHAR", SQL_WVARCHAR);
     583         330 :         failed += keywords_insert("SQL_TSI_FRAC_SECOND", SQL_TSI_FRAC_SECOND);
     584         330 :         failed += keywords_insert("SQL_TSI_SECOND", SQL_TSI_SECOND);
     585         330 :         failed += keywords_insert("SQL_TSI_MINUTE", SQL_TSI_MINUTE);
     586         330 :         failed += keywords_insert("SQL_TSI_HOUR", SQL_TSI_HOUR);
     587         330 :         failed += keywords_insert("SQL_TSI_DAY", SQL_TSI_DAY);
     588         330 :         failed += keywords_insert("SQL_TSI_WEEK", SQL_TSI_WEEK);
     589         330 :         failed += keywords_insert("SQL_TSI_MONTH", SQL_TSI_MONTH);
     590         330 :         failed += keywords_insert("SQL_TSI_QUARTER", SQL_TSI_QUARTER);
     591         330 :         failed += keywords_insert("SQL_TSI_YEAR", SQL_TSI_YEAR);
     592             : 
     593         330 :         failed += keywords_insert("LEAST", MARGFUNC);
     594         330 :         failed += keywords_insert("GREATEST", MARGFUNC);
     595         330 :         return failed;
     596             : }
     597             : 
                       /* Look up the keyword whose text starts s bytes past the current read
                        * position of the scanner, i.e. at lc->rs->buf + lc->rs->pos + s.
                        * In keyword_or_ident() the token is NUL-terminated by scanner_token()
                        * before this lookup is done.
                        * NOTE(review): the macro arguments are not parenthesised, so only
                        * simple expressions should be passed. */
     598             : #define find_keyword_bs(lc, s) find_keyword(lc->rs->buf+lc->rs->pos+s)
     599             : 
                       /* Initialise *s for scanning.
                        *   s  - scanner state to (re)initialise; it is overwritten completely
                        *   rs - buffered input stream the scanner reads from
                        *   ws - output stream; scanner_read_more() writes PROMPT2 to it
                        * The mode is set to LINE_N and raw_string_mode is taken from the
                        * raw_strings setting via GDKgetenv_istrue().  Members that are not
                        * named in the compound literal are zero-initialised. */
     600             : void
     601      246256 : scanner_init(struct scanner *s, bstream *rs, stream *ws)
     602             : {
     603      492512 :         *s = (struct scanner) {
     604             :                 .rs = rs,
     605             :                 .ws = ws,
     606             :                 .mode = LINE_N,
     607      246256 :                 .raw_string_mode = GDKgetenv_istrue("raw_strings"),
     608             :                 .aborted = false,
     609             :         };
     610      246256 : }
     611             : 
                       /* Finish the current query.  Puts back the byte that scanner_token()
                        * replaced by a NUL (saved in s->yybak), moves the stream position past
                        * the consumed text plus any white space that follows it, and clears
                        * the per-query fields yycur, started, as and schema.
                        * NOTE(review): s->rs is dereferenced in the yybak branch before the
                        * if (s->rs) test further down; either that test is redundant or the
                        * first branch needs the same guard. */
     612             : void
     613     1311529 : scanner_query_processed(struct scanner *s)
     614             : {
     615     1311529 :         int cur;
     616             : 
                               /* a yybak of 0 means there is nothing to restore: either no token
                                * was terminated or the saved byte was itself a NUL */
     617     1311529 :         if (s->yybak) {
     618      512377 :                 s->rs->buf[s->rs->pos + s->yycur] = s->yybak;
     619      512377 :                 s->yybak = 0;
     620             :         }
     621     1311529 :         if (s->rs) {
     622     1311529 :                 s->rs->pos += s->yycur;
     623             :                 /* completely eat the query including white space after the ; */
     624     2468500 :                 while (s->rs->pos < s->rs->len &&
     625     2136436 :                            (cur = s->rs->buf[s->rs->pos], iswspace(cur))) {
     626     1156971 :                         s->rs->pos++;
     627             :                 }
     628             :         }
     629             :         /*assert(s->rs->pos <= s->rs->len);*/
     630     1311529 :         s->yycur = 0;
     631     1311529 :         s->started = 0;
     632     1311529 :         s->as = 0;
     633     1311529 :         s->schema = NULL;
     634     1311529 : }
     635             : 
                       /* Report an unexpected input character through sql_error().
                        *   lc  - SQL context passed on to sql_error()
                        *   cur - the offending code point, or EOF
                        * Returns EOF when cur is EOF and LEX_ERROR for every other value.
                        * The message includes the word control for code points for which
                        * iswcntrl() is true, except U+FEFF. */
     636             : static int
     637          33 : scanner_error(mvc *lc, int cur)
     638             : {
     639          33 :         switch (cur) {
     640           0 :         case EOF:
     641           0 :                 (void) sql_error(lc, 1, SQLSTATE(42000) "Unexpected end of input");
     642           0 :                 return EOF;
     643          33 :         default:
     644             :                 /* on Windows at least, iswcntrl returns TRUE for
     645             :                  * U+FEFF, but we just want consistent error
     646             :                  * messages */
     647          33 :                 (void) sql_error(lc, 1, SQLSTATE(42000) "Unexpected%s character (U+%04X)", iswcntrl(cur) && cur != 0xFEFF ? " control" : "", (unsigned) cur);
     648             :         }
     649          33 :         return LEX_ERROR;
     650             : }
     651             : 
     652             : 
     653             : /*
     654             :    UTF-8 encoding is as follows:
     655             : U-00000000 - U-0000007F: 0xxxxxxx
     656             : U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
     657             : U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
     658             : U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     659             : U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     660             : U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     661             : */
     662             : /* To be correctly coded UTF-8, the sequence should be the shortest
     663             :    possible encoding of the value being encoded.  This means that for
     664             :    an encoding of length n+1 (1 <= n <= 5), at least one of the bits in
     665             :    utf8chkmsk[n] should be non-zero (else the encoding could be
     666             :    shorter).
     667             : */
     668             : static const int utf8chkmsk[] = {
     669             :         0x0000007f,     /* [0] 1-byte form; scanner_getc() never indexes this entry */
     670             :         0x00000780,     /* [1] 2-byte form: value must use a bit above U+007F */
     671             :         0x0000f800,     /* [2] 3-byte form: value must use a bit above U+07FF */
     672             :         0x001f0000,     /* [3] 4-byte form: value must use a bit above U+FFFF */
     673             :         0x03e00000,     /* [4] 5-byte form: value must use a bit above U+1FFFFF */
     674             :         0x7c000000      /* [5] 6-byte form: value must use a bit above U+3FFFFFF */
     675             : };
     676             : 
                       /* Un-read one code point: despite the name nothing is written, the
                        * function only moves lc->yycur back by the number of bytes that the
                        * UTF-8 encoding of ch occupies (1, 2, 3 or 4).
                        *   lc - scanner whose read offset is rewound
                        *   ch - code point that was just returned by scanner_getc()
                        * NOTE(review): every value from 0x10000 upwards is treated as a 4-byte
                        * sequence, while scanner_getc() also accepts 5- and 6-byte forms, and
                        * a negative ch (such as EOF) is treated as a single byte. */
     677             : static void
     678    31414687 : utf8_putchar(struct scanner *lc, int ch)
     679             : {
     680    31414687 :         if ((ch) < 0x80) {
     681    31414682 :                 lc->yycur--;
     682           5 :         } else if ((ch) < 0x800) {
     683           0 :                 lc->yycur -= 2;
     684           5 :         } else if ((ch) < 0x10000) {
     685           5 :                 lc->yycur -= 3;
     686             :         } else {
     687           0 :                 lc->yycur -= 4;
     688             :         }
     689    31414687 : }
     690             : 
                       /* Make sure that at least n bytes are available in the input buffer
                        * starting at offset b->pos + lc->yycur, reading more input if needed.
                        *   lc - scanner state
                        *   n  - number of bytes the caller wants to look at
                        * Returns 1 when the bytes are available and EOF otherwise.  EOF is
                        * returned when the scanner was aborted, when more input would be
                        * needed in LINE_1 mode or before a token was started, when
                        * bstream_next() fails, when a prompt for more data yields nothing,
                        * or when the input consists of the two bytes 0x80 and newline (which
                        * are then dropped from the buffer and reported as Query aborted).
                        * Side effects: may write PROMPT2 to lc->ws and may set lc->aborted
                        * and lc->errstr. */
     691             : static inline int
     692   136671733 : scanner_read_more(struct scanner *lc, size_t n)
     693             : {
     694   136671733 :         bstream *b = lc->rs;
     695   136671733 :         bool more = false;
     696             : 
     697             : 
     698   136671733 :         if (lc->aborted)
     699             :                 return EOF;
     700   136676004 :         while (b->len < b->pos + lc->yycur + n) {
     701             : 
                                       /* never block for more input unless we are in the middle
                                        * of a token in multi-line mode */
     702      125040 :                 if (lc->mode == LINE_1 || !lc->started)
     703             :                         return EOF;
     704             : 
     705             :                 /* query is not finished ask for more */
     706           0 :                 if (b->eof || !isa_block_stream(b->s)) {
     707           0 :                         if (bstream_getoob(b)) {
     708           0 :                                 lc->aborted = true;
     709           0 :                                 return EOF;
     710             :                         }
     711        2139 :                         if (mnstr_write(lc->ws, PROMPT2, sizeof(PROMPT2) - 1, 1) == 1)
     712        2139 :                                 mnstr_flush(lc->ws, MNSTR_FLUSH_DATA);
     713        2139 :                         b->eof = false;
     714        2139 :                         more = true;
     715             :                 }
     716             :                 /* we need more query text */
     717        4278 :                 if (bstream_next(b) < 0) {
     718           0 :                         if (mnstr_errnr(b->s) == MNSTR_INTERRUPT) {
     719             :                                 // now what?
     720           0 :                                 lc->errstr = "Query aborted";
     721           0 :                                 lc->aborted = true;
     722           0 :                                 mnstr_clearerr(b->s);
     723             :                         }
     724           0 :                         return EOF;
     725        4278 :                 } else if (/* we asked for more data but didn't get any */
     726        2139 :                            (more && b->eof && b->len < b->pos + lc->yycur + n))
     727             :                         return EOF;
                                       /* NOTE(review): the comparison with the character constant
                                        * below relies on the element type of b->buf, which is not
                                        * visible here; confirm it matches on all platforms */
     728        4271 :                 if (more && b->pos + lc->yycur + 2 == b->len && b->buf[b->pos + lc->yycur] == '\200' && b->buf[b->pos + lc->yycur + 1] == '\n') {
     729           0 :                         lc->errstr = "Query aborted";
     730           0 :                         b->len -= 2;
     731           0 :                         b->buf[b->len] = 0;
     732           0 :                         return EOF;
     733             :                 }
     734             :         }
     735             :         return 1;
     736             : }
     737             : 
                       /* Read the next code point from the input, decoding UTF-8, and advance
                        * lc->yycur past the bytes that were consumed.
                        * Returns the code point (0..0x7F directly for 7-bit bytes), or EOF
                        * when no more input is available or the byte sequence is not valid
                        * UTF-8.  For invalid input lc->errstr is set to a message; a
                        * successful first read clears lc->errstr.  Callers can only tell
                        * a decoding error from end of input by looking at lc->errstr. */
     738             : static inline int
     739   135417724 : scanner_getc(struct scanner *lc)
     740             : {
     741   135417724 :         bstream *b = lc->rs;
     742   135417724 :         unsigned char *s = NULL;
     743   135417724 :         int c, m, n, mask;
     744             : 
     745   135417724 :         if (scanner_read_more(lc, 1) == EOF) {
     746             :                 //lc->errstr = SQLSTATE(42000) "end of input stream";
     747             :                 return EOF;
     748             :         }
     749   135329170 :         lc->errstr = NULL;
     750             : 
     751   135329170 :         s = (unsigned char *) b->buf + b->pos + lc->yycur++;
     752   135329170 :         if (((c = *s) & 0x80) == 0) {
     753             :                 /* 7-bit char */
     754             :                 return c;
     755             :         }
                               /* count the leading 1 bits that follow the top bit of the lead byte */
     756       88250 :         for (n = 0, m = 0x40; c & m; n++, m >>= 1)
     757             :                 ;
     758             :         /* n now is number of 10xxxxxx bytes that should follow */
                               /* NOTE(review): the length test below does not include lc->yycur;
                                * the scanner_read_more() call that follows does the full check */
     759       29443 :         if (n == 0 || n >= 6 || (b->pos + n) > b->len) {
     760             :                 /* incorrect UTF-8 sequence */
     761             :                 /* n==0: c == 10xxxxxx */
     762             :                 /* n>=6: c == 1111111x */
     763           0 :                 lc->errstr = SQLSTATE(42000) "invalid start of UTF-8 sequence";
     764           0 :                 goto error;
     765             :         }
     766             : 
     767       29443 :         if (scanner_read_more(lc, (size_t) n) == EOF)
     768             :                 return EOF;
                               /* reload the pointer: it now addresses the first continuation byte */
     769       29443 :         s = (unsigned char *) b->buf + b->pos + lc->yycur;
     770             : 
     771       29443 :         mask = utf8chkmsk[n];
     772       29443 :         c &= ~(0xFFC0 >> n);  /* remove non-x bits */
     773       88249 :         while (--n >= 0) {
     774       58807 :                 c <<= 6;
     775       58807 :                 lc->yycur++;
     776       58807 :                 if (((m = *s++) & 0xC0) != 0x80) {
     777             :                         /* incorrect UTF-8 sequence: byte is not 10xxxxxx */
     778             :                         /* this includes end-of-string (m == 0) */
     779           1 :                         lc->errstr = SQLSTATE(42000) "invalid continuation in UTF-8 sequence";
     780           1 :                         goto error;
     781             :                 }
     782       58806 :                 c |= m & 0x3F;
     783             :         }
     784       29442 :         if ((c & mask) == 0) {
     785             :                 /* incorrect UTF-8 sequence: not shortest possible */
     786           0 :                 lc->errstr = SQLSTATE(42000) "not shortest possible UTF-8 sequence";
     787           0 :                 goto error;
     788             :         }
     789             : 
     790             :         return c;
     791             : 
     792           1 : error:
     793           1 :         if (b->pos + lc->yycur < b->len)    /* skip bogus char */
     794           0 :                 lc->yycur++;
     795             :         return EOF;
     796             : }
     797             : 
                       /* Terminate the token that ends at the current read offset and record
                        * its type.  The byte at buf[pos + yycur] is saved in lc->yybak and
                        * overwritten with a NUL so that the token text can be used as a C
                        * string; scanner_query_processed() puts the saved byte back.
                        *   lc    - scanner state
                        *   token - token code to store in lc->yyval
                        * Returns the token code. */
     798             : static int
     799    28121391 : scanner_token(struct scanner *lc, int token)
     800             : {
     801    28121391 :         lc->yybak = lc->rs->buf[lc->rs->pos + lc->yycur];
     802    28121391 :         lc->rs->buf[lc->rs->pos + lc->yycur] = 0;
     803    28121391 :         lc->yyval = token;
     804    28121391 :         return lc->yyval;
     805             : }
     806             : 
                       /* Scan a quoted string or quoted identifier; the opening quote has
                        * already been consumed.
                        *   c       - SQL context (scanner state and error reporting)
                        *   quote   - the quote character that ends the token
                        *   escapes - when true, a backslash escapes the character after it
                        * The inner loop walks over plain 7-bit bytes directly in the buffer;
                        * bytes with the top bit set are decoded through scanner_getc() and a
                        * NUL byte at the end of the buffered data triggers scanner_read_more().
                        * Two quote characters in a row stand for one quote inside the string.
                        * Returns STRING on success, LEX_ERROR for an over-long string or a NUL
                        * byte inside the string, the result of scanner_error() when U+FEFF
                        * occurs in a quoted identifier, and EOF when the input ends first.
                        * NOTE(review): pos restarts at 0 on every pass of the outer loop, so
                        * the limit applies to each uninterrupted run of 7-bit bytes and not
                        * to the string as a whole; also a closing quote that is exactly the
                        * limit-th byte of a run is reported as string too long. */
     807             : static int
     808     2094794 : scanner_string(mvc *c, int quote, bool escapes)
     809             : {
     810     2094794 :         struct scanner *lc = &c->scanner;
     811     2094794 :         bstream *rs = lc->rs;
     812     2094794 :         int cur = quote;
     813     2094794 :         bool escape = false;
     814     2094794 :         const size_t limit = quote == '"' ? 1 << 11 : 1 << 30;
     815             : 
     816     2094794 :         lc->started = 1;
     817     2132404 :         while (cur != EOF) {
     818     2132389 :                 size_t pos = 0;
     819     2132389 :                 const size_t yycur = rs->pos + lc->yycur;
     820             : 
     821    35191080 :                 while (cur != EOF && (quote != '"' || cur != 0xFEFF) && pos < limit &&
     822    33058691 :                        (((cur = rs->buf[yycur + pos++]) & 0x80) == 0) &&
     823    66087928 :                        cur && (cur != quote || escape)) {
     824    30926303 :                         if (escapes && cur == '\\')
     825        6589 :                                 escape = !escape;
     826             :                         else
     827             :                                 escape = false;
     828             :                 }
     829     2132389 :                 if (pos == limit) {
     830           0 :                         (void) sql_error(c, 2, SQLSTATE(42000) "string too long");
     831           0 :                         return LEX_ERROR;
     832             :                 }
     833             :                 /* BOM character not allowed as an identifier */
     834     2132389 :                 if (cur == EOF || (quote == '"' && cur == 0xFEFF))
     835           1 :                         return scanner_error(c, cur);
     836     2132388 :                 lc->yycur += pos;
     837             :                 /* check for quote escaped quote: Obscure SQL Rule */
     838     2132388 :                 if (cur == quote && rs->buf[yycur + pos] == quote) {
     839        8169 :                         lc->yycur++;
     840        8169 :                         continue;
     841             :                 }
     842     2124219 :                 assert(yycur + pos <= rs->len + 1);
     843     2124219 :                 if (cur == quote && !escape) {
     844     2094764 :                         return scanner_token(lc, STRING);
     845             :                 }
     846       29455 :                 lc->yycur--; /* go back to current (possibly invalid) char */
     847             :                 /* long utf8, if correct isn't the quote */
     848       29455 :                 if (!cur) {
     849          30 :                         if (lc->rs->len >= lc->rs->pos + lc->yycur + 1) {
     850          14 :                                 (void) sql_error(c, 2, SQLSTATE(42000) "NULL byte in string");
     851          14 :                                 return LEX_ERROR;
     852             :                         }
     853          16 :                         cur = scanner_read_more(lc, 1);
     854             :                 } else {
     855       29425 :                         cur = scanner_getc(lc);
     856             :                 }
     857             :         }
     858          15 :         (void) sql_error(c, 2, "%s", lc->errstr ? lc->errstr : SQLSTATE(42000) "Unexpected end of input");
     859          15 :         return EOF;
     860             : }
     861             : 
     862             : /* scan a structure {blah} into a string. We only count the matching {}
     863             :  * unless escaped. We do not consider embeddings in string literals yet
     864             :  */
                       /* The opening brace has already been consumed (see the assert below).
                        * blk holds the current nesting depth and starts at 1; the token ends
                        * at the brace that brings it back to 0.
                        * Returns X_BODY on success, LEX_ERROR for a NUL byte inside the body
                        * and EOF when the input ends before the closing brace.
                        * NOTE(review): as written, both brace counters run for every
                        * character regardless of the escape flag, so a brace preceded by a
                        * backslash is still counted; confirm whether that is intended. */
     865             : 
     866             : static int
     867         230 : scanner_body(mvc *c)
     868             : {
     869         230 :         struct scanner *lc = &c->scanner;
     870         230 :         bstream *rs = lc->rs;
     871         230 :         int cur = (int) 'x';
     872         230 :         int blk = 1;
     873         230 :         bool escape = false;
     874             : 
     875         230 :         lc->started = 1;
     876         230 :         assert(rs->buf[rs->pos + lc->yycur-1] == '{');
     877         286 :         while (cur != EOF) {
     878         286 :                 size_t pos = rs->pos + lc->yycur;
     879             : 
                                       /* fast path over 7-bit bytes; stops at a NUL, at a byte with
                                        * the top bit set, or once the nesting depth reaches zero */
     880       31815 :                 while ((((cur = rs->buf[pos++]) & 0x80) == 0) && cur && (blk || escape)) {
     881       31529 :                         if (cur != '\\')
     882             :                                 escape = false;
     883             :                         else
     884          12 :                                 escape = !escape;
     885       31529 :                         blk += cur =='{';
     886       31529 :                         blk -= cur =='}';
     887             :                 }
     888         286 :                 lc->yycur = pos - rs->pos;
     889         286 :                 assert(pos <= rs->len + 1);
     890         286 :                 if (blk == 0 && !escape){
     891         230 :                         lc->yycur--; /* go back to current (possibly invalid) char */
     892         230 :                         return scanner_token(lc, X_BODY);
     893             :                 }
     894          56 :                 lc->yycur--; /* go back to current (possibly invalid) char */
     895          56 :                 if (!cur) {
                                               /* a NUL before the end of the buffered data is real
                                                * input, otherwise it is the buffer terminator */
     896          56 :                         if (lc->rs->len >= lc->rs->pos + lc->yycur + 1) {
     897           0 :                                 (void) sql_error(c, 2, SQLSTATE(42000) "NULL byte in string");
     898           0 :                                 return LEX_ERROR;
     899             :                         }
     900          56 :                         cur = scanner_read_more(lc, 1);
     901             :                 } else {
     902           0 :                         cur = scanner_getc(lc);
     903             :                 }
     904             :         }
     905           0 :         (void) sql_error(c, 2, SQLSTATE(42000) "Unexpected end of input");
     906           0 :         return EOF;
     907             : }
     908             : 
                       /* Scan an identifier or keyword.
                        *   c   - SQL context holding the scanner
                        *   cur - first code point of the word, already read by the caller; it
                        *         is pushed back so that s marks the start of the word
                        * Code points are consumed while they are alphanumeric (iswalnum) or an
                        * underscore.  The first other code point is pushed back, the word is
                        * NUL-terminated with scanner_token() and looked up as a keyword.
                        * Returns the keyword token when the word is a known keyword, IDENT
                        * otherwise, and the value of scanner_getc() when the input ends
                        * while the word is being read.
                        * NOTE(review): the loop can only end with cur == EOF, so when EOF is
                        * negative the if (cur < 0) test always returns and the statements
                        * after it are unreachable (the listing shows no hit data for them). */
     909             : static int
     910    13494224 : keyword_or_ident(mvc * c, int cur)
     911             : {
     912    13494224 :         struct scanner *lc = &c->scanner;
     913    13494224 :         keyword *k = NULL;
     914    13494224 :         size_t s;
     915             : 
     916    13494224 :         lc->started = 1;
     917    13494224 :         utf8_putchar(lc, cur);
     918    13494215 :         s = lc->yycur;
     919    13494215 :         lc->yyval = IDENT;
     920    80742853 :         while ((cur = scanner_getc(lc)) != EOF) {
     921    80742746 :                 if (!iswalnum(cur) && cur != '_') {
     922    13494108 :                         utf8_putchar(lc, cur);
     923    13494114 :                         (void)scanner_token(lc, IDENT);
     924    13494114 :                         if ((k = find_keyword_bs(lc,s)))
     925     8285236 :                                 lc->yyval = k->token;
     926    13494402 :                         return lc->yyval;
     927             :                 }
     928             :         }
     929             :         if (cur < 0)
     930             :                 return cur;
     931             :         (void)scanner_token(lc, IDENT);
     932             :         if ((k = find_keyword_bs(lc,s)))
     933             :                 lc->yyval = k->token;
     934             :         return lc->yyval;
     935             : }
     936             : 
                       /* Skip white space (as classified by iswspace).  Before every read
                        * lc->yysval is set to the current offset, so on return it holds the
                        * offset at which the returned code point starts.
                        * Returns the first code point that is not white space, or EOF. */
     937             : static int
     938    14111141 : skip_white_space(struct scanner * lc)
     939             : {
     940    17714023 :         int cur;
     941             : 
     942    17714023 :         do {
     943    17714023 :                 lc->yysval = lc->yycur;
     944    17714023 :         } while ((cur = scanner_getc(lc)) != EOF && iswspace(cur));
     945    14110193 :         return cur;
     946             : }
     947             : 
                       /* Skip a slash-star block comment.  depth starts at 1, which suggests
                        * that the opening marker has already been consumed by the caller.
                        * Comments may nest: every further opening marker increases depth and
                        * every closing marker decreases it, and skipping stops at depth 0.
                        * lc->started is forced to 1 while skipping and restored afterwards;
                        * lc->yysval is moved to the offset after the comment.
                        * Returns a newline character when the comment was closed and EOF
                        * when the input ended first. */
     948             : static int
     949       68937 : skip_c_comment(struct scanner * lc)
     950             : {
     951       68937 :         int cur;
     952       68937 :         int prev = 0;
     953       68937 :         int started = lc->started;
     954       68937 :         int depth = 1;
     955             : 
     956       68937 :         lc->started = 1;
                               /* cur is always assigned here: depth is 1 on the first test */
     957     1390278 :         while (depth > 0 && (cur = scanner_getc(lc)) != EOF) {
     958     1321341 :                 if (prev == '*' && cur == '/')
     959       68937 :                         depth--;
     960     1252404 :                 else if (prev == '/' && cur == '*') {
     961             :                         /* block comments can nest */
     962           0 :                         cur = 0; /* prevent slash-star-slash from matching */
     963           0 :                         depth++;
     964             :                 }
     965             :                 prev = cur;
     966             :         }
     967       68937 :         lc->yysval = lc->yycur;
     968       68937 :         lc->started = started;
     969             :         /* a comment is equivalent to a newline */
     970       68937 :         return cur == EOF ? cur : '\n';
     971             : }
     972             : 
     973             : static int
     974        3178 : skip_sql_comment(struct scanner * lc)
     975             : {
     976        3178 :         int cur;
     977        3178 :         int started = lc->started;
     978             : 
     979        3178 :         lc->started = 1;
     980      828372 :         while ((cur = scanner_getc(lc)) != EOF && (cur != '\n'))
     981             :                 ;
     982        3178 :         lc->yysval = lc->yycur;
     983        3178 :         lc->started = started;
     984             :         /* a comment is equivalent to a newline */
     985        3178 :         return cur;
     986             : }
     987             : 
     988             : static int tokenize(mvc * lc, int cur);
     989             : 
     990     5721228 : static inline bool is_valid_decimal_digit(int cur) { return (iswdigit(cur)); }
     991          13 : static inline bool is_valid_binary_digit(int cur) { return (iswdigit(cur) && cur < '2'); }
     992          10 : static inline bool is_valid_octal_digit(int cur) { return (iswdigit(cur) && cur < '8'); }
     993        3688 : static inline bool is_valid_hexadecimal_digit(int cur) { return iswxdigit(cur); }
     994             : 
     995     1902760 : static inline int check_validity_number(mvc* c, int pcur, bool initial_underscore_allowed, int *token, int type) {
     996     1902760 :         struct scanner *lc = &c->scanner;
     997     1902760 :         bool (*is_valid_n_ary_digit)(int);
     998             : 
     999     1902760 :         if (pcur == '_' && !initial_underscore_allowed)  /* ERROR: initial underscore not allowed */  {
    1000           0 :                 *token = 0;
    1001           0 :                 return '_';
    1002             :         }
    1003             : 
    1004     1902760 :         switch (type) {
    1005             :         case BINARYNUM:
    1006             :                 is_valid_n_ary_digit = &is_valid_binary_digit;
    1007             :                 break;
    1008           3 :         case OCTALNUM:
    1009           3 :                 is_valid_n_ary_digit = &is_valid_octal_digit;
    1010           3 :                 break;
    1011         280 :         case HEXADECIMALNUM:
    1012         280 :                 is_valid_n_ary_digit = &is_valid_hexadecimal_digit;
    1013         280 :                 break;
    1014     1902475 :         default:
    1015     1902475 :                 is_valid_n_ary_digit = &is_valid_decimal_digit;
    1016     1902475 :                 break;
    1017             :         }
    1018             : 
    1019     1902760 :         if ( !(pcur == '_' || is_valid_n_ary_digit(pcur)) ) /* ERROR: first digit is not valid */ {
    1020          17 :                 *token = 0;
    1021          17 :                 return pcur;
    1022             :         }
    1023             : 
    1024     1902839 :         int cur = scanner_getc(lc);
    1025     1903112 :         *token = type;
    1026     3832251 :         while (cur != EOF) {
    1027     3832455 :                 if (cur == '_') {
    1028          25 :                         if (pcur == '_') /* ERROR: multiple consecutive underscores */ {
    1029           2 :                                 *token = 0;
    1030           2 :                                 return '_';
    1031             :                         }
    1032             :                 }
    1033     3832430 :                 else if (!is_valid_n_ary_digit(cur))
    1034             :                         break;
    1035     1929964 :                 pcur = cur;
    1036     1929964 :                 cur = scanner_getc(lc);
    1037             :         }
    1038             : 
    1039     1902112 :         if (pcur == '_')  {
    1040           3 :                 *token = 0;
    1041           3 :                 if (iswalnum(cur))       /* ERROR: not a valid digit */
    1042             :                         return cur;
    1043             :                 else                            /* ERROR: number ends with underscore */
    1044             :                         return '_';
    1045             :         }
    1046             : 
    1047             :         return cur;
    1048             : }
    1049             : 
    1050             : static int
    1051     1889570 : number(mvc * c, int cur)
    1052             : {
    1053     1889570 :         struct scanner *lc = &c->scanner;
    1054     1889570 :         int token = sqlINT;
    1055             : 
    1056             :         /* a number has one of these forms (expressed in regular expressions):
    1057             :          * 0x[0-9A-Fa-f]+                   -- (hexadecimal) INTEGER
    1058             :          * \.[0-9]+                         -- DECIMAL
    1059             :          * [0-9]+\.[0-9]*                   -- DECIMAL
    1060             :          * [0-9]+@0                         -- OID
    1061             :          * [0-9]*\.[0-9]+[eE][-+]?[0-9]+    -- REAL
    1062             :          * [0-9]+(\.[0-9]*)?[eE][-+]?[0-9]+ -- REAL
    1063             :          * [0-9]+                           -- (decimal) INTEGER
    1064             :          */
    1065     1889570 :         lc->started = 1;
    1066     1889570 :         if (cur == '0') {
    1067      303553 :                 switch ((cur = scanner_getc(lc))) {
    1068           2 :                 case 'b':
    1069           2 :                         cur = scanner_getc(lc);
    1070           2 :                         if ((cur = check_validity_number(c, cur, true, &token, BINARYNUM)) == EOF) return cur;
    1071             :                         break;
    1072           3 :                 case 'o':
    1073           3 :                         cur = scanner_getc(lc);
    1074           3 :                         if ((cur = check_validity_number(c,  cur, true, &token, OCTALNUM)) == EOF) return cur;
    1075             :                         break;
    1076         280 :                 case 'x':
    1077         280 :                         cur = scanner_getc(lc);
    1078         280 :                         if ((cur = check_validity_number(c,  cur, true, &token, HEXADECIMALNUM)) == EOF) return cur;
    1079             :                         break;
    1080      303271 :                 default:
    1081      303271 :                         utf8_putchar(lc, cur);
    1082      303271 :                         cur = '0';
    1083             :                 }
    1084             :         }
    1085     1889571 :         if (token == sqlINT) {
    1086     1889228 :                 if ((cur = check_validity_number(c, cur, false, &token, sqlINT)) == EOF) return cur;
    1087     1888597 :                 if (cur == '@') {
    1088           0 :                         if (token == sqlINT) {
    1089           0 :                                 cur = scanner_getc(lc);
    1090           0 :                                 if (cur == EOF)
    1091             :                                         return cur;
    1092           0 :                                 if (cur == '0') {
    1093           0 :                                         cur = scanner_getc(lc);
    1094           0 :                                         if (cur == EOF)
    1095             :                                                 return cur;
    1096           0 :                                         token = OIDNUM;
    1097             :                                 } else {
    1098             :                                         /* number + '@' not followed by 0: show '@' as erroneous */
    1099           0 :                                         utf8_putchar(lc, cur);
    1100           0 :                                         cur = '@';
    1101           0 :                                         token = 0;
    1102             :                                 }
    1103             :                         }
    1104             :                 } else {
    1105     1888597 :                         if (cur == '.') {
    1106       11067 :                                 cur = scanner_getc(lc);
    1107       11067 :                                 if (iswalnum(cur)) /* early exit for numerical forms with final . e.g. 10. */
    1108       11061 :                                 if ((cur = check_validity_number(c, cur, false, &token, INTNUM)) == EOF) return cur;
    1109             :                         }
    1110     1888597 :                         if (token != 0)
    1111     1888498 :                         if (cur == 'e' || cur == 'E') {
    1112        2225 :                                 cur = scanner_getc(lc);
    1113        2225 :                                 if (cur == '+' || cur == '-')
    1114        2111 :                                         cur = scanner_getc(lc);
    1115        2225 :                                 if ((cur = check_validity_number(c, cur, false, &token, APPROXNUM)) == EOF) return cur;
    1116             :                         }
    1117             :                 }
    1118             :         }
    1119             : 
    1120     1886715 :         assert(cur != EOF);
    1121             : 
    1122     1888940 :         if (iswalnum(cur)) /* ERROR: not a valid digit */
    1123           6 :                 token = 0;
    1124             : 
    1125     1888940 :         utf8_putchar(lc, cur);
    1126             : 
    1127     1888905 :         if (token) {
    1128     1888895 :                 return scanner_token(lc, token);
    1129             :         } else {
    1130          10 :                 (void)sql_error( c, 2, SQLSTATE(42000) "Unexpected symbol %lc", (wint_t) cur);
    1131          10 :                 return LEX_ERROR;
    1132             :         }
    1133             : }
    1134             : 
    1135             : static
    1136    12803463 : int scanner_symbol(mvc * c, int cur)
    1137             : {
    1138    12803463 :         struct scanner *lc = &c->scanner;
    1139    12803463 :         int next = 0;
    1140    12803463 :         int started = lc->started;
    1141             : 
    1142    12803463 :         switch (cur) {
    1143       71565 :         case '/':
    1144       71565 :                 lc->started = 1;
    1145       71565 :                 next = scanner_getc(lc);
    1146       71565 :                 if (next < 0)
    1147             :                         return EOF;
    1148       71565 :                 if (next == '*') {
    1149       68937 :                         lc->started = started;
    1150       68937 :                         cur = skip_c_comment(lc);
    1151       68937 :                         if (cur < 0)
    1152             :                                 return EOF;
    1153       68937 :                         return tokenize(c, cur);
    1154             :                 } else {
    1155        2628 :                         utf8_putchar(lc, next);
    1156        2628 :                         return scanner_token(lc, cur);
    1157             :                 }
    1158           0 :         case '0':
    1159             :         case '1':
    1160             :         case '2':
    1161             :         case '3':
    1162             :         case '4':
    1163             :         case '5':
    1164             :         case '6':
    1165             :         case '7':
    1166             :         case '8':
    1167             :         case '9':
    1168           0 :                 return number(c, cur);
    1169           5 :         case '#':
    1170           5 :                 if ((cur = skip_sql_comment(lc)) == EOF)
    1171             :                         return cur;
    1172           5 :                 return tokenize(c, cur);
    1173      796409 :         case '\'':
    1174      796409 :                 if (lc->raw_string_mode || lc->next_string_is_raw)
    1175          46 :                         return scanner_string(c, cur, false);
    1176      796363 :                 return scanner_string(c, cur, true);
    1177     1291296 :         case '"':
    1178     1291296 :                 return scanner_string(c, cur, false);
    1179         496 :         case '{':
    1180             :                 // if previous tokens like LANGUAGE IDENT
    1181             :                 // TODO checking on IDENT only may not be enough
    1182         496 :                 if (lc->yylast == IDENT)
    1183         230 :                         return scanner_body(c);
    1184         266 :                 lc->started = 1;
    1185         266 :                 return scanner_token(lc, cur);
    1186         266 :         case '}':
    1187         266 :                 lc->started = 1;
    1188         266 :                 return scanner_token(lc, cur);
    1189       29822 :         case '-':
    1190       29822 :                 lc->started = 1;
    1191       29822 :                 next = scanner_getc(lc);
    1192       29822 :                 if (next < 0)
    1193             :                         return EOF;
    1194       29821 :                 if (next == '-') {
    1195        3173 :                         lc->started = started;
    1196        3173 :                         if ((cur = skip_sql_comment(lc)) == EOF)
    1197             :                                 return cur;
    1198        3173 :                         return tokenize(c, cur);
    1199             :                 }
    1200       26648 :                 lc->started = 1;
    1201       26648 :                 utf8_putchar(lc, next);
    1202       26648 :                 return scanner_token(lc, cur);
    1203          12 :         case '~': /* binary not */
    1204          12 :                 lc->started = 1;
    1205          12 :                 next = scanner_getc(lc);
    1206          12 :                 if (next < 0)
    1207             :                         return EOF;
    1208          12 :                 if (next == '=')
    1209           5 :                         return scanner_token(lc, GEOM_MBR_EQUAL);
    1210           7 :                 utf8_putchar(lc, next);
    1211           7 :                 return scanner_token(lc, cur);
    1212     7192926 :         case '^': /* binary xor */
    1213             :         case '*':
    1214             :         case '?':
    1215             :         case ':':
    1216             :         case '%':
    1217             :         case '+':
    1218             :         case '(':
    1219             :         case ')':
    1220             :         case ',':
    1221             :         case '=':
    1222             :         case '[':
    1223             :         case ']':
    1224     7192926 :                 lc->started = 1;
    1225     7192926 :                 return scanner_token(lc, cur);
    1226        6227 :         case '&':
    1227        6227 :                 lc->started = 1;
    1228        6227 :                 cur = scanner_getc(lc);
    1229        6227 :                 if (cur < 0)
    1230             :                         return EOF;
    1231        6227 :                 if (cur < 0)
    1232             :                         return EOF;
    1233        6227 :                 if(cur == '<') {
    1234           3 :                         next = scanner_getc(lc);
    1235           3 :                         if (next < 0)
    1236             :                                 return EOF;
    1237           3 :                         if(next == '|') {
    1238           0 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_BELOW);
    1239             :                         } else {
    1240           3 :                                 utf8_putchar(lc, next); //put the char back
    1241           3 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_LEFT);
    1242             :                         }
    1243        6224 :                 } else if(cur == '>')
    1244           3 :                         return scanner_token(lc, GEOM_OVERLAP_OR_RIGHT);
    1245        6221 :                 else if(cur == '&')
    1246           3 :                         return scanner_token(lc, GEOM_OVERLAP);
    1247             :                 else {/* binary and */
    1248        6218 :                         utf8_putchar(lc, cur); //put the char back
    1249        6218 :                         return scanner_token(lc, '&');
    1250             :                 }
    1251          19 :         case '@':
    1252          19 :                 lc->started = 1;
    1253          19 :                 return scanner_token(lc, AT);
    1254      991616 :         case ';':
    1255      991616 :                 lc->started = 0;
    1256      991616 :                 return scanner_token(lc, SCOLON);
    1257          27 :         case '!':
    1258          27 :                 lc->started = 1;
    1259          27 :                 cur = scanner_getc(lc);
    1260          27 :                 if (cur < 0)
    1261             :                         return EOF;
    1262          27 :                 else if (cur == '=') {
    1263          21 :                         lc->rs->buf[lc->rs->pos + lc->yycur - 2] = '<';
    1264          21 :                         lc->rs->buf[lc->rs->pos + lc->yycur - 1] = '>';
    1265          21 :                         return scanner_token( lc, COMPARISON);
    1266             :                 } else {
    1267           6 :                         utf8_putchar(lc, cur); //put the char back
    1268             :                 }
    1269           6 :                 return scanner_token(lc, '!');
    1270       51072 :         case '<':
    1271       51072 :                 lc->started = 1;
    1272       51072 :                 cur = scanner_getc(lc);
    1273       51072 :                 if (cur < 0)
    1274             :                         return EOF;
    1275       51072 :                 if (cur == '=') {
    1276        3113 :                         return scanner_token( lc, COMPARISON);
    1277       47959 :                 } else if (cur == '>') {
    1278       34543 :                         return scanner_token( lc, COMPARISON);
    1279       13416 :                 } else if (cur == '<') {
    1280          44 :                         next = scanner_getc(lc);
    1281          44 :                         if (next < 0)
    1282             :                                 return EOF;
    1283          44 :                         if (next == '=') {
    1284           4 :                                 return scanner_token( lc, LEFT_SHIFT_ASSIGN);
    1285          40 :                         } else if (next == '|') {
    1286           1 :                                 return scanner_token(lc, GEOM_BELOW);
    1287             :                         } else {
    1288          39 :                                 utf8_putchar(lc, next); //put the char back
    1289          39 :                                 return scanner_token( lc, LEFT_SHIFT);
    1290             :                         }
    1291       13372 :                 } else if(cur == '-') {
    1292          19 :                         next = scanner_getc(lc);
    1293          19 :                         if (next < 0)
    1294             :                                 return EOF;
    1295          19 :                         if(next == '>') {
    1296           7 :                                 return scanner_token(lc, GEOM_DIST);
    1297             :                         } else {
    1298             :                                 //put the characters back and fall in the next possible case
    1299          12 :                                 utf8_putchar(lc, next);
    1300          12 :                                 utf8_putchar(lc, cur);
    1301          12 :                                 return scanner_token( lc, COMPARISON);
    1302             :                         }
    1303             :                 } else {
    1304       13353 :                         utf8_putchar(lc, cur);
    1305       13353 :                         return scanner_token( lc, COMPARISON);
    1306             :                 }
    1307       47656 :         case '>':
    1308       47656 :                 lc->started = 1;
    1309       47656 :                 cur = scanner_getc(lc);
    1310       47656 :                 if (cur < 0)
    1311             :                         return EOF;
    1312       47656 :                 if (cur == '>') {
    1313        2647 :                         cur = scanner_getc(lc);
    1314        2647 :                         if (cur < 0)
    1315             :                                 return EOF;
    1316        2647 :                         if (cur == '=')
    1317           3 :                                 return scanner_token( lc, RIGHT_SHIFT_ASSIGN);
    1318        2644 :                         utf8_putchar(lc, cur);
    1319        2644 :                         return scanner_token( lc, RIGHT_SHIFT);
    1320       45009 :                 } else if (cur != '=') {
    1321       42780 :                         utf8_putchar(lc, cur);
    1322       42780 :                         return scanner_token( lc, COMPARISON);
    1323             :                 } else {
    1324        2229 :                         return scanner_token( lc, COMPARISON);
    1325             :                 }
    1326     2143513 :         case '.':
    1327     2143513 :                 lc->started = 1;
    1328     2143513 :                 cur = scanner_getc(lc);
    1329     2143513 :                 if (cur < 0)
    1330             :                         return EOF;
    1331     2143512 :                 if (!iswdigit(cur)) {
    1332     2143499 :                         utf8_putchar(lc, cur);
    1333     2143499 :                         return scanner_token( lc, '.');
    1334             :                 } else {
    1335          13 :                         utf8_putchar(lc, cur);
    1336          13 :                         cur = '.';
    1337          13 :                         return number(c, cur);
    1338             :                 }
    1339      180526 :         case '|': /* binary or or string concat */
    1340      180526 :                 lc->started = 1;
    1341      180526 :                 cur = scanner_getc(lc);
    1342      180526 :                 if (cur < 0)
    1343             :                         return EOF;
    1344      180526 :                 if (cur == '|') {
    1345      180503 :                         return scanner_token(lc, CONCATSTRING);
    1346          23 :                 } else if (cur == '&') {
    1347           0 :                         next = scanner_getc(lc);
    1348           0 :                         if (next < 0)
    1349             :                                 return EOF;
    1350           0 :                         if(next == '>') {
    1351           0 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_ABOVE);
    1352             :                         } else {
    1353           0 :                                 utf8_putchar(lc, next); //put the char back
    1354           0 :                                 utf8_putchar(lc, cur); //put the char back
    1355           0 :                                 return scanner_token(lc, '|');
    1356             :                         }
    1357          23 :                 } else if (cur == '>') {
    1358           1 :                         next = scanner_getc(lc);
    1359           1 :                         if (next < 0)
    1360             :                                 return EOF;
    1361           1 :                         if(next == '>') {
    1362           1 :                                 return scanner_token(lc, GEOM_ABOVE);
    1363             :                         } else {
    1364           0 :                                 utf8_putchar(lc, next); //put the char back
    1365           0 :                                 utf8_putchar(lc, cur); //put the char back
    1366           0 :                                 return scanner_token(lc, '|');
    1367             :                         }
    1368             :                 } else {
    1369          22 :                         utf8_putchar(lc, cur);
    1370          22 :                         return scanner_token(lc, '|');
    1371             :                 }
    1372             :         }
    1373          10 :         (void)sql_error( c, 3, SQLSTATE(42000) "Unexpected symbol (%lc)", (wint_t) cur);
    1374          10 :         return LEX_ERROR;
    1375             : }
    1376             : 
    1377             : static int
    1378    28211884 : tokenize(mvc * c, int cur)
    1379             : {
    1380    28211884 :         struct scanner *lc = &c->scanner;
    1381    56389338 :         while (1) {
    1382    42300611 :                 if (cur == 0xFEFF) {
    1383             :                         /* on Linux at least, iswpunct returns TRUE
    1384             :                          * for U+FEFF, but we don't want that, we just
    1385             :                          * want to go to the scanner_error case
    1386             :                          * below */
    1387             :                         ;
    1388    42301265 :                 } else if (iswspace(cur)) {
    1389    14106332 :                         if ((cur = skip_white_space(lc)) == EOF)
    1390             :                                 return cur;
    1391    14088727 :                         continue;  /* try again */
    1392    28194933 :                 } else if (iswdigit(cur)) {
    1393     1889953 :                         return number(c, cur);
    1394    26304980 :                 } else if (iswalpha(cur) || cur == '_') {
    1395    13466455 :                         switch (cur) {
    1396      652412 :                         case 'e': /* string with escapes */
    1397             :                         case 'E':
    1398      652412 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1399      652412 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '\'') {
    1400        3797 :                                         return scanner_string(c, scanner_getc(lc), true);
    1401             :                                 }
    1402             :                                 break;
    1403      414995 :                         case 'x': /* blob */
    1404             :                         case 'X':
    1405             :                         case 'r': /* raw string */
    1406             :                         case 'R':
    1407      414995 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1408      414995 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '\'') {
    1409        3275 :                                         return scanner_string(c, scanner_getc(lc), false);
    1410             :                                 }
    1411             :                                 break;
    1412      154420 :                         case 'u': /* unicode string */
    1413             :                         case 'U':
    1414      154420 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1415      154437 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '&' &&
    1416          17 :                                     scanner_read_more(lc, 2) != EOF &&
    1417          17 :                                     (lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '\'' ||
    1418             :                                      lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '"')) {
    1419          17 :                                         cur = scanner_getc(lc); /* '&' */
    1420          17 :                                         return scanner_string(c, scanner_getc(lc), false);
    1421             :                                 }
    1422             :                                 break;
    1423             :                         default:
    1424             :                                 break;
    1425             :                         }
    1426    13494255 :                         return keyword_or_ident(c, cur);
    1427    12803636 :                 } else if (iswpunct(cur)) {
    1428    12802950 :                         return scanner_symbol(c, cur);
    1429             :                 }
    1430          32 :                 if (cur == EOF) {
    1431           0 :                         if (lc->mode == LINE_1 || !lc->started )
    1432             :                                 return cur;
    1433           0 :                         return scanner_error(c, cur);
    1434             :                 }
    1435             :                 /* none of the above: error */
    1436          32 :                 return scanner_error(c, cur);
    1437             :         }
    1438             : }
    1439             : 
    1440             : /* SQL 'quoted' idents consist of a set of any character of
    1441             :  * the source language character set other than a 'quote'
    1442             :  *
    1443             :  * MonetDB has 3 restrictions:
    1444             :  *      1 we disallow '%' as the first character.
    1445             :  *      2 the length is limited to 1024 characters
    1446             :  *      3 the identifier 'TID%' is not allowed
    1447             :  */
    1448             : static bool
    1449     1291285 : valid_ident(const char *restrict s, char *restrict dst)
    1450             : {
    1451     1291285 :         int p = 0;
    1452             : 
    1453     1291285 :         if (*s == '%')
    1454             :                 return false;
    1455             : 
    1456     9553793 :         while (*s) {
    1457     8262508 :                 if ((dst[p++] = *s++) == '"' && *s == '"')
    1458          68 :                         s++;
    1459     8262508 :                 if (p >= 1024)
    1460             :                         return false;
    1461             :         }
    1462     1291285 :         dst[p] = '\0';
    1463     1291285 :         if (strcmp(dst, TID + 1) == 0) /* an index named 'TID%' could interfere with '%TID%' */
    1464             :                 return false;
    1465             :         return true;
    1466             : }
    1467             : /* Produce the next token for the scanner embedded in parm (an mvc); sets yylval->sval for the token and returns the token code, EOF or LEX_ERROR. */
    1468             : static inline int
    1469    28301623 : sql_get_next_token(YYSTYPE *yylval, void *parm)
    1470             : {
    1471    28301623 :         mvc *c = (mvc*)parm;
    1472    28301623 :         struct scanner *lc = &c->scanner;
    1473    28301623 :         int token = 0, cur = 0;
    1474             : 
    1475    28301623 :         if (lc->rs->buf == NULL) /* malloc failure */
    1476             :                 return EOF;
    1477             :         /* a token parked in yynext (scanner() does this after a NOT look-ahead) is handed out first */
    1478    28301623 :         if (lc->yynext) {
    1479       61321 :                 int next = lc->yynext;
    1480             : 
    1481       61321 :                 lc->yynext = 0;
    1482       61321 :                 return(next);
    1483             :         }
    1484             :         /* write the byte saved in yybak back at the current read offset; where it gets saved is outside this function */
    1485    28240302 :         if (lc->yybak) {
    1486    27217856 :                 lc->rs->buf[lc->rs->pos + lc->yycur] = lc->yybak;
    1487    27217856 :                 lc->yybak = 0;
    1488             :         }
    1489             :         /* yysval marks where this token starts; yylast keeps the previous yyval (scanner() compares it with SCOLON) */
    1490    28240302 :         lc->yysval = lc->yycur;
    1491    28240302 :         lc->yylast = lc->yyval;
    1492    28240302 :         cur = scanner_getc(lc);
    1493    28251057 :         if (cur < 0)
    1494             :                 return EOF;
    1495    28140164 :         token = tokenize(c, cur);
    1496             :         /* by default sval points at the token text inside the read buffer itself */
    1497    28129683 :         yylval->sval = (lc->rs->buf + lc->rs->pos + lc->yysval);
    1498             :         /* KW_TYPE is returned to the caller as aTYPE */
    1499    28129683 :         if (token == KW_TYPE)
    1500       49263 :                 token = aTYPE;
    1501             :         /* these tokens get their own copy of the token text (yycur - yysval bytes) from c->sa */
    1502    28129683 :         if (token == IDENT || token == COMPARISON ||
    1503    22824480 :             token == RANK || token == aTYPE || token == MARGFUNC) {
    1504     5364285 :                 yylval->sval = sa_strndup(c->sa, yylval->sval, lc->yycur-lc->yysval);
    1505     5364202 :                 lc->next_string_is_raw = false;
    1506    22765398 :         } else if (token == STRING) {
    1507     2094764 :                 char quote = *yylval->sval; /* first byte of the token selects the literal kind below */
    1508     2094764 :                 char *str = sa_alloc( c->sa, (lc->yycur-lc->yysval-2)*2 + 1 ); /* NOTE(review): used below without a NULL check - confirm sa_alloc cannot fail here */
    1509     2094763 :                 char *dst;
    1510             : 
    1511     2094763 :                 assert(quote == '"' || quote == '\'' || quote == 'E' || quote == 'e' || quote == 'U' || quote == 'u' || quote == 'X' || quote == 'x' || quote == 'R' || quote == 'r');
    1512             :                 /* temporarily overwrite the last byte of the token (the closing quote) with NUL; put back after the switch */
    1513     2094763 :                 lc->rs->buf[lc->rs->pos + lc->yycur - 1] = 0;
    1514     2094763 :                 switch (quote) {
    1515     1291285 :                 case '"': /* double-quoted text: validated and copied by valid_ident, returned as IDENT */
    1516     1291285 :                         if (valid_ident(yylval->sval+1,str)) {
    1517             :                                 token = IDENT;
    1518             :                         } else {
    1519           0 :                                 sql_error(c, 1, SQLSTATE(42000) "Invalid identifier '%s'", yylval->sval+1);
    1520           0 :                                 return LEX_ERROR; /* NOTE(review): like the other LEX_ERROR returns, this skips the closing-quote restore after the switch */
    1521             :                         }
    1522             :                         break;
    1523        3796 :                 case 'e':
    1524             :                 case 'E':
    1525        3796 :                         assert(yylval->sval[1] == '\'');
    1526        3796 :                         if (GDKstrFromStr((unsigned char *) str,
    1527             :                                                           (unsigned char *) yylval->sval + 2,
    1528        3796 :                                                           lc->yycur-lc->yysval - 2, '\'') < 0) { /* a negative result is treated as a conversion error; presumably this decodes escapes - confirm in GDK */
    1529           1 :                                 char *err = GDKerrbuf;
    1530           1 :                                 if (strncmp(err, GDKERROR, strlen(GDKERROR)) == 0) /* report the message without the GDKERROR prefix, or without a leading '!' */
    1531           1 :                                         err += strlen(GDKERROR);
    1532           0 :                                 else if (*err == '!')
    1533           0 :                                         err++;
    1534           1 :                                 sql_error(c, 1, SQLSTATE(42000) "%s", err);
    1535           1 :                                 return LEX_ERROR;
    1536             :                         }
    1537             :                         quote = '\''; /* the byte to restore is the closing single quote, not the prefix letter */
    1538             :                         break;
    1539          17 :                 case 'u':
    1540             :                 case 'U':
    1541          17 :                         assert(yylval->sval[1] == '&');
    1542          17 :                         assert(yylval->sval[2] == '\'' || yylval->sval[2] == '"');
    1543          17 :                         strcpy(str, yylval->sval + 3); /* everything after the three prefix bytes is copied unchanged */
    1544          17 :                         token = yylval->sval[2] == '\'' ? USTRING : UIDENT; /* single quote gives USTRING, double quote gives UIDENT */
    1545          17 :                         quote = yylval->sval[2]; /* closing character to restore after the switch */
    1546          17 :                         lc->next_string_is_raw = true; /* makes the default case below copy the next plain string raw instead of calling GDKstrFromStr */
    1547          17 :                         break;
    1548           1 :                 case 'x':
    1549             :                 case 'X':
    1550           1 :                         assert(yylval->sval[1] == '\'');
    1551           1 :                         dst = str;
    1552           5 :                         for (char *src = yylval->sval + 2; *src; dst++) /* copy the body, storing a doubled single quote once */
    1553           4 :                                 if ((*dst = *src++) == '\'' && *src == '\'')
    1554           0 :                                         src++;
    1555           1 :                         *dst = 0;
    1556           1 :                         quote = '\'';
    1557           1 :                         token = XSTRING;
    1558           1 :                         lc->next_string_is_raw = true;
    1559           1 :                         break;
    1560        3267 :                 case 'r':
    1561             :                 case 'R':
    1562        3267 :                         assert(yylval->sval[1] == '\'');
    1563        3267 :                         dst = str;
    1564      449813 :                         for (char *src = yylval->sval + 2; *src; dst++) /* same copy as the x/X case; the token stays STRING */
    1565      446546 :                                 if ((*dst = *src++) == '\'' && *src == '\'')
    1566        2720 :                                         src++;
    1567        3267 :                         quote = '\'';
    1568        3267 :                         *dst = 0;
    1569        3267 :                         break;
    1570      796397 :                 default: /* token starts with the single quote itself, so the body begins at sval + 1 */
    1571      796397 :                         if (lc->raw_string_mode || lc->next_string_is_raw) {
    1572          46 :                                 dst = str;
    1573         436 :                                 for (char *src = yylval->sval + 1; *src; dst++)
    1574         390 :                                         if ((*dst = *src++) == '\'' && *src == '\'')
    1575           1 :                                                 src++;
    1576          46 :                                 *dst = 0;
    1577             :                         } else {
    1578      796351 :                                 if (GDKstrFromStr((unsigned char *)str,
    1579      796351 :                                                                   (unsigned char *)yylval->sval + 1,
    1580      796351 :                                                                   lc->yycur - lc->yysval - 1,
    1581             :                                                                   '\'') < 0) {
    1582           1 :                                         sql_error(c, 1, SQLSTATE(42000) "%s", GDKerrbuf); /* NOTE(review): unlike the e/E case, the GDKERROR prefix is not stripped from the message here */
    1583           1 :                                         return LEX_ERROR;
    1584             :                                 }
    1585             :                         }
    1586             :                         break;
    1587             :                 }
    1588     2094761 :                 yylval->sval = str; /* the caller receives the converted copy, not the buffer text */
    1589             : 
    1590             :                 /* reset original */
    1591     2094761 :                 lc->rs->buf[lc->rs->pos+lc->yycur- 1] = quote;
    1592             :         } else {
    1593    20670634 :                 lc->next_string_is_raw = false;
    1594             :         }
    1595             : 
    1596             :         return(token);
    1597             : }
    1598             : 
    1599             : static int scanner( YYSTYPE *yylval, void *m, bool log);
    1600             : /* Token source behind sqllex(): merges NOT with a following EXISTS/BETWEEN/IN/LIKE/ILIKE, skips repeated semicolons, and writes the consumed input to lc->log when log is true. */
    1601             : static int
    1602    28172240 : scanner(YYSTYPE * yylval, void *parm, bool log)
    1603             : {
    1604    28172240 :         int token;
    1605    28172240 :         mvc *c = (mvc *) parm;
    1606    28172240 :         struct scanner *lc = &c->scanner;
    1607    28172240 :         size_t pos;
    1608             : 
    1609             :         /* store position for when view's query ends */
    1610    28172240 :         pos = lc->rs->pos + lc->yycur;
    1611             : 
    1612    28172240 :         token = sql_get_next_token(yylval, parm);
    1613             : 
    1614    28167694 :         if (token == NOT) {
    1615       73979 :                 int next = scanner(yylval, parm, false); /* look one token ahead; the nested call does not log */
    1616             : 
    1617       73979 :                 if (next == NOT) {
    1618           2 :                         return scanner(yylval, parm, false); /* NOT NOT: return whatever follows; the log write and started update below are skipped for this call */
    1619             :                 } else if (next == EXISTS) {
    1620             :                         token = NOT_EXISTS;
    1621             :                 } else if (next == BETWEEN) {
    1622             :                         token = NOT_BETWEEN;
    1623             :                 } else if (next == sqlIN) {
    1624             :                         token = NOT_IN;
    1625             :                 } else if (next == LIKE) {
    1626             :                         token = NOT_LIKE;
    1627             :                 } else if (next == ILIKE) {
    1628             :                         token = NOT_ILIKE;
    1629             :                 } else {
    1630       61321 :                         lc->yynext = next; /* nothing to merge: park it so the next sql_get_next_token() call returns it */
    1631             :                 }
    1632    28093715 :         } else if (token == SCOLON) {
    1633             :                 /* ignore semi-colon(s) following a semi-colon */
    1634      991632 :                 if (lc->yylast == SCOLON) {
    1635      131700 :                         size_t prev = lc->yycur; /* read offset just past the most recent semicolon */
    1636      131701 :                         while ((token = sql_get_next_token(yylval, parm)) == SCOLON)
    1637           1 :                                 prev = lc->yycur;
    1638             :                         /* token now holds the first non-semicolon token (possibly EOF) and is what gets returned */
    1639             :                         /* skip the skipped stuff also in the buffer */
    1640      131606 :                         lc->rs->pos += prev;
    1641      131606 :                         lc->yycur -= prev; /* rs->pos + yycur is unchanged by this pair of updates */
    1642             :                 }
    1643             :         }
    1644             :         /* write the bytes consumed by this call, from pos up to the current read offset, to the log stream */
    1645    28167598 :         if (lc->log && log)
    1646           0 :                 mnstr_write(lc->log, lc->rs->buf+pos, lc->rs->pos + lc->yycur - pos, 1);
    1647             : 
    1648    28167598 :         lc->started += (token != EOF); /* counts every non-EOF token returned through this path */
    1649    28167598 :         return token;
    1650             : }
    1651             : 
    1652             : /* also see sql_parser.y; presumably this is the lexer hook the generated parser calls - confirm there */
    1653             : extern int sqllex(YYSTYPE * yylval, void *parm);
    1654             : /* Return the next token from scanner() with logging enabled; parm is the mvc that owns the scanner state. */
    1655             : int
    1656    28099858 : sqllex(YYSTYPE * yylval, void *parm)
    1657             : {
    1658    28099858 :         return scanner(yylval, parm, true);
    1659             : }

Generated by: LCOV version 1.14