LCOV - code coverage report
Current view: top level - monetdb5/modules/atoms - url.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 372 459 81.0 %
Date: 2024-11-13 22:44:48 Functions: 26 27 96.3 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  *  M. Kersten
      15             :  *  Y. Zhang
      16             :  * The URL module
      17             :  * The URL module contains a collection of commands to manipulate
      18             :  * Uniform Resource Locators - a resource on the World Wide Web-
      19             :  * represented as a string in Monet. The URL can represent
      20             :  * anything from a file, a directory or a complete movie.
      21             :  * This module is geared towards manipulation of their name only.
      22             :  * A complementary module can be used to gain access.[IOgate]
      23             :  *
      24             :  * The URL syntax is specified in RFC2396, Uniform Resource Identifiers
      25             :  * (URI): Generic Syntax. The URL syntax is dependent upon the scheme.
      26             :  * In general, a URL has the form <scheme>:<scheme-specific-part>.
      27             :  * Thus, accepting a valid URL is a simple process, unless the scheme
      28             :  * is known and scheme-specific syntax is checked (e.g., http or ftp
      29             :  * scheme). For the URL module implemented here, we assume some common
      30             :  * fields of the <scheme-specific-part> that are shared among different
      31             :  * schemes.
      32             :  *
      33             :  * The core of the extension involves several operators to extract
      34             :  * portions of the URLs for further manipulation. In particular,
      35             :  * the domain, the server, and the protocol, and the file extension
      36             :  * can be extracted without copying the complete URL from the heap
      37             :  * into a string variable first.
      38             :  *
      39             :  * The commands provided are based on the corresponding Java class.
      40             :  *
      41             :  * A future version should use a special atom, because this may save
      42             :  * considerable space. Alternatively, break the URL strings into
      43             :  * components and represent them with a bunch of BATs. An intermediate
      44             :  * step would be to refine the atom STR, then it would be possible to
      45             :  * redefine hashing.
      46             :  */
      47             : 
      48             : #include "monetdb_config.h"
      49             : #include "mal.h"
      50             : #include "gdk.h"
      51             : #include <ctype.h>
      52             : #include "mal_exception.h"
      53             : #include "str.h"
      54             : 
      55             : typedef str url;  /* a url value is handled through the plain str type */
      56             : 
      57             : /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ]
      58             :  * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */
      59             : 
      60             : /* return pointer to string after the scheme and colon; input: pointer
      61             :  * to start of URI */
                       /* A scheme is accepted when it starts with an ASCII letter, continues
                        * with any run of ASCII letters, digits, '+', '-' or '.', and is
                        * terminated by ':'.
                        * Returns NULL when uri does not start with a letter or when the
                        * scheme characters are not followed by ':'.  The returned pointer
                        * points into uri; nothing is copied. */
      62             : static const char *
      63          73 : skip_scheme(const char *uri)
      64             : {
      65          73 :         if (('a' <= *uri && *uri <= 'z') || ('A' <= *uri && *uri <= 'Z')) {
      66          73 :                 uri++;
      67          73 :                 while (('a' <= *uri && *uri <= 'z') || ('A' <= *uri && *uri <= 'Z')
      68          73 :                            || isdigit((unsigned char) *uri) || *uri == '+' || *uri == '-'
      69         395 :                            || *uri == '.')
      70         249 :                         uri++;
                                       /* the scheme only counts when a colon follows it */
      71          73 :                 if (*uri == ':')
      72          72 :                         return uri + 1;
      73             :         }
      74             :         return NULL;
      75             : }
      76             : 
                       /* Character-class helpers used by the skip_authority(), skip_path()
                        * and skip_search() scanners.  Each macro expands its argument more
                        * than once (ishex excepted), so the argument must not have side
                        * effects.
                        *   ishex(c)        - c is a hexadecimal digit (via isxdigit)
                        *   isreserved(c)   - c is one of  ; / ? : @ & = + $ ,
                        *   isunreserved(c) - c is an ASCII letter, a digit, or one of
                        *                     - _ . ! ~ * ( ) or the apostrophe */
      77             : #define ishex(c)                isxdigit((unsigned char) (c))
      78             : #define isreserved(c)   ((c) == ';' || (c) == '/' || (c) == '?' || \
      79             :                                                  (c) == ':' || (c) == '@' || (c) == '&' || \
      80             :                                                  (c) == '=' || (c) == '+' || (c) == '$' || \
      81             :                                                  (c) == ',')
      82             : #define isunreserved(c) (('a' <= (c) && (c) <= 'z') || \
      83             :                                                  ('A' <= (c) && (c) <= 'Z') || \
      84             :                                                  isdigit((unsigned char) (c)) || \
      85             :                                                  (c) == '-' || (c) == '_' || (c) == '.' || \
      86             :                                                  (c) == '!' || (c) == '~' || (c) == '*' || \
      87             :                                                  (c) == '\'' || (c) == '(' || (c) == ')')
      88             : 
      89             : /* return pointer to string after the authority, filling in pointers
      90             :  * to start of user, password, host, and port, if provided; input:
      91             :  * result of skip_scheme() */
                       /* The authority must be introduced by two slashes; otherwise NULL is
                        * returned and none of the output pointers is written.
                        *
                        * The scan accepts unreserved characters, %HH escapes and the
                        * characters ; : = + $ , @ and stops at the first other character.
                        *
                        * Outputs (each may be passed as NULL to ignore it):
                        *   *userp - start of the authority when an '@' was seen, else NULL
                        *   *passp - character after the last ':' seen before the first '@',
                        *            or NULL when there was no '@' or no such ':'
                        *   *hostp - character after the last '@', or the start of the
                        *            authority when there is no '@'
                        *   *portp - character after the last ':' that follows the last '@'
                        *            (or the last ':' at all when there is no '@'), else NULL
                        *
                        * All pointers point into uri and the components are not
                        * terminated; callers derive lengths from pointer differences. */
      92             : static const char *
      93          63 : skip_authority(const char *uri, const char **userp, const char **passp,
      94             :                            const char **hostp, const char **portp)
      95             : {
      96          63 :         const char *user = NULL, *pass = NULL, *host = NULL, *port = NULL;
      97             : 
      98          63 :         if (uri[0] == '/' && uri[1] == '/') {
      99          63 :                 uri += 2;
                               /* user == host is used below as the marker for "no '@' seen yet" */
     100          63 :                 user = host = uri;
     101         436 :                 while (isunreserved(*uri)
     102           0 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) || *uri == ';'
     103             :                            || *uri == ':' || *uri == '=' || *uri == '+' || *uri == '$'
     104        1193 :                            || *uri == ',' || *uri == '@') {
     105        1130 :                         if (*uri == ':') {
                                               /* before an '@' a colon may start either a
                                                * password or a port, so record both */
     106          39 :                                 if (user == host)
     107          15 :                                         port = pass = uri + 1;
     108             :                                 else
     109          24 :                                         port = uri + 1;
     110        1091 :                         } else if (*uri == '@') {
                                               /* what came before was user info: the host
                                                * starts here and any port seen so far is void */
     111          29 :                                 host = uri + 1;
     112          29 :                                 port = NULL;
     113             :                         }
                                       /* a validated %HH escape is skipped as one unit */
     114        2260 :                         uri += *uri == '%' ? 3 : 1;
     115             :                 }
     116          63 :                 if (user == host) {
     117             :                         /* no "@", so no user info */
     118          34 :                         if (userp)
     119           4 :                                 *userp = NULL;
     120          34 :                         if (passp)
     121           4 :                                 *passp = NULL;
     122             :                 } else {
     123          29 :                         if (userp)
     124           4 :                                 *userp = user;
     125          29 :                         if (passp)
     126           4 :                                 *passp = pass;
     127             :                 }
     128          63 :                 if (portp)
     129          20 :                         *portp = port;
     130          63 :                 if (hostp)
     131          23 :                         *hostp = host;
     132          63 :                 return uri;
     133             :         }
     134             :         return NULL;
     135             : }
     136             : 
     137             : /* return pointer to string after the path, filling in pointer to
     138             :  * start of last component and extension of that component; input:
     139             :  * result of skip_authority() */
                       /* A path is only recognised when uri starts with '/'.  Without one,
                        * uri is returned unchanged and both outputs are set to NULL.
                        *
                        * The scan accepts unreserved characters, %HH escapes and the
                        * characters : @ & = + $ , ; / and stops at the first other character.
                        *
                        * Outputs (each may be passed as NULL to ignore it):
                        *   *basep - character after the last '/' of the path
                        *   *extp  - first '.' inside the last component that is not the
                        *            first character of that component, or NULL if none
                        *
                        * This function has no NULL return path; it always returns a pointer
                        * into uri. */
     140             : static const char *
     141          30 : skip_path(const char *uri, const char **basep, const char **extp)
     142             : {
     143          30 :         const char *base = NULL, *ext = NULL;
     144             : 
     145          30 :         if (*uri == '/') {
     146          24 :                 uri++;
     147          24 :                 base = uri;
     148         132 :                 while (isunreserved(*uri)
     149           0 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) || *uri == ':'
     150             :                            || *uri == '@' || *uri == '&' || *uri == '=' || *uri == '+'
     151         582 :                            || *uri == '$' || *uri == ',' || *uri == ';' || *uri == '/') {
     152         558 :                         if (*uri == '/') {
                                               /* a new component starts: forget the extension
                                                * found in the previous one */
     153          36 :                                 base = uri + 1;
     154          36 :                                 ext = NULL;
     155         522 :                         } else if (*uri == '.' && ext == NULL && uri != base) {
                                               /* only the first dot counts, and a leading dot
                                                * does not start an extension */
     156         558 :                                 ext = uri;
     157             :                         }
     158        1116 :                         uri += *uri == '%' ? 3 : 1;
     159             :                 }
     160             :         }
     161          30 :         if (basep)
     162          10 :                 *basep = base;
     163          30 :         if (extp)
     164          10 :                 *extp = ext;
     165          30 :         return uri;
     166             : }
     167             : 
     168             : /* return pointer to string after the search string; input: result of
     169             :  * skip_path() */
                       /* When uri starts with '?', the question mark and every following
                        * reserved character, unreserved character or %HH escape is skipped.
                        * Otherwise uri is returned unchanged.  This function has no NULL
                        * return path. */
     170             : static const char *
     171          10 : skip_search(const char *uri)
     172             : {
     173          10 :         if (*uri == '?') {
     174           6 :                 uri++;
     175          68 :                 while (isreserved(*uri) || isunreserved(*uri)
     176          76 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2]))) {
                                       /* a validated %HH escape is skipped as one unit */
     177         140 :                         uri += *uri == '%' ? 3 : 1;
     178             :                 }
     179             :         }
     180          10 :         return uri;
     181             : }
     182             : 
     183             : #if 0
     184             : /*
     185             :  * Utilities
     186             :  */
     187             : 
                       /* Convert the two hexadecimal digits at what[0] and what[1] into the
                        * character value they encode.  Masking with 0xdf folds lower-case
                        * letters onto upper-case ones.  The input is not validated: both
                        * characters are assumed to be hexadecimal digits.
                        * This helper sits inside the "#if 0" region and is not compiled. */
     188             : static char
     189             : x2c(const char *what)
     190             : {
     191             :         char digit;
     192             : 
     193             :         digit = (what[0] >= 'A' ? ((what[0] & 0xdf) - 'A') + 10 : (what[0] - '0'));
     194             :         digit *= 16;
     195             :         digit += (what[1] >= 'A' ? ((what[1] & 0xdf) - 'A') + 10 : (what[1] - '0'));
     196             :         return (digit);
     197             : }
     198             : 
                       /* Return 0 when c may appear unescaped (alphanumeric, or one of
                        * # - _ . ! ~ * ( ) or the apostrophe) and 1 when it must be escaped.
                        * This helper sits inside the "#if 0" region and is not compiled. */
     199             : static int
     200             : needEscape(char c)
     201             : {
     202             :         if (isalnum((unsigned char) c))
     203             :                 return 0;
     204             :         if (c == '#' || c == '-' || c == '_' || c == '.' || c == '!' || c == '~'
     205             :                 || c == '*' || c == '\'' || c == '(' || c == ')')
     206             :                 return 0;
     207             :         return 1;
     208             : }
     209             : 
     210             : /* COMMAND "escape": this function applies the URI escaping rules defined in
     211             :  * section 2 of [RFC 3986] to the string supplied as 's'.
     212             :  * The effect of the function is to escape a set of identified characters in
     213             :  * the string. Each such character is replaced in the string by an escape
     214             :  * sequence, which is formed by encoding the character as a sequence of octets
     215             :  * in UTF-8, and then representing each of these octets in the form %HH.
     216             :  *
     217             :  * All characters are escaped other than:
     218             :  * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")"
     219             :  *
     220             :  * This function must always generate hexadecimal values using the upper-case
     221             :  * letters A-F.
     222             :  *
     223             :  * SIGNATURE: escape(str) : str; */
                       /* On success *retval receives a newly allocated string and
                        * MAL_SUCCEED is returned; a NULL input or an allocation failure is
                        * reported through throw().
                        *
                        * NOTE(review): this code is inside the "#if 0" region and is not
                        * compiled.  As written it differs from the contract above:
                        *  - a space is written as '+' instead of a %HH sequence;
                        *  - the format "%%%2x" pads with a blank and prints lower-case
                        *    digits, so it yields neither two hex digits for small values
                        *    nor the upper-case letters required above;
                        *  - the buffer holds strlen(s) * 3 bytes, which leaves no room for
                        *    the terminating NUL when every character is escaped (sprintf
                        *    also writes a NUL after each escape).
                        * These points need fixing before the code is re-enabled. */
     224             : static str
     225             : escape_str(str *retval, const char *s)
     226             : {
     227             :         int x, y;
     228             :         str res;
     229             : 
     230             :         if (!s)
     231             :                 throw(ILLARG, "url.escape", "url missing");
     232             : 
     233             :         if (!(res = (str) GDKmalloc(strlen(s) * 3)))
     234             :                 throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
                       /* x indexes the input, y the output; an escape advances y by two
                        * extra positions on top of the loop increment */
     235             :         for (x = 0, y = 0; s[x]; ++x, ++y) {
     236             :                 if (needEscape(s[x])) {
     237             :                         if (s[x] == ' ') {
     238             :                                 res[y] = '+';
     239             :                         } else {
     240             :                                 sprintf(res + y, "%%%2x", (uint8_t) s[x]);
     241             :                                 y += 2;
     242             :                         }
     243             :                 } else {
     244             :                         res[y] = s[x];
     245             :                 }
     246             :         }
     247             :         res[y] = '\0';
     248             : 
                       /* shrink the worst-case buffer to the length actually used */
     249             :         if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
     250             :                 GDKfree(res);
     251             :                 throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     252             :         }
     253             :         return MAL_SUCCEED;
     254             : }
     255             : 
     256             : /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters.
     257             :  *                     All sequences of the form "% HEX HEX" are unescaped.
     258             :  * SIGNATURE: unescape(str) : str; */
                       /* On success *retval receives a newly allocated string and
                        * MAL_SUCCEED is returned; a NULL input or an allocation failure is
                        * reported through throw().
                        *
                        * NOTE(review): this code is inside the "#if 0" region and is not
                        * compiled.  Points to fix before re-enabling it:
                        *  - the buffer holds strlen(s) bytes, which leaves no room for the
                        *    terminating NUL when the input contains no escapes;
                        *  - the two characters after '%' are not checked, so a '%' near the
                        *    end of the input makes the loop read past the terminator;
                        *  - the ILLARG message names "url.escape" rather than
                        *    "url.unescape". */
     259             : static str
     260             : unescape_str(str *retval, const char *s)
     261             : {
     262             :         int x, y;
     263             :         str res;
     264             : 
     265             :         if (!s)
     266             :                 throw(ILLARG, "url.escape", "url missing");
     267             : 
     268             :         res = (str) GDKmalloc(strlen(s));
     269             :         if (!res)
     270             :                 throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     271             : 
                       /* x indexes the input, y the output; an escape consumes two extra
                        * input characters on top of the loop increment */
     272             :         for (x = 0, y = 0; s[x]; ++x, ++y) {
     273             :                 if (s[x] == '%') {
     274             :                         res[y] = x2c(&s[x + 1]);
     275             :                         x += 2;
     276             :                 } else {
     277             :                         res[y] = s[x];
     278             :                 }
     279             :         }
     280             :         res[y] = '\0';
     281             : 
                       /* shrink the buffer to the length actually used */
     282             :         if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
     283             :                 GDKfree(res);
     284             :                 throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     285             :         }
     286             :         return MAL_SUCCEED;
     287             : }
     288             : #endif
     289             : 
     290             : /*
     291             :  * Wrapping
     292             :  * Here you find the wrappers around the V4 url library included above.
     293             :  */
     294             : 
                       /* Copy the textual url src into the buffer *U.
                        *
                        * src      - NUL-terminated input text
                        * len      - in: size of the buffer at *U; updated when the buffer is
                        *            replaced
                        * U        - in/out: destination buffer (treated as char **); it is
                        *            freed and reallocated when it is NULL or smaller than
                        *            strlen(src) + 1
                        * external - when true, the literal text "nil" is stored as str_nil
                        *
                        * Returns strlen(src), also in the "nil" case, or -1 when the
                        * allocation fails (the old buffer has been freed by then and *U
                        * is NULL).
                        *
                        * No syntax check is performed: the text is copied as is.
                        * NOTE(review): the "nil" case copies str_nil into a buffer sized
                        * for "nil"; this assumes str_nil is not longer than three
                        * characters - TODO confirm. */
     295             : static ssize_t
     296     2000013 : URLfromString(const char *src, size_t *len, void **U, bool external)
     297             : {
     298     2000013 :         char **u = (char **) U;
     299     2000013 :         size_t l = strlen(src) + 1;
     300             : 
     301     2000013 :         if (*len < l || *u == NULL) {
     302          25 :                 GDKfree(*u);
     303          25 :                 *u = GDKmalloc(l);
     304          25 :                 if (*u == NULL)
     305             :                         return -1;
     306          25 :                 *len = l;
     307             :         }
     308             : 
     309             :         /* actually parse the message for valid url */
     310             : 
     311     2000013 :         if (external && strcmp(src, "nil") == 0)
     312           0 :                 strcpy(*u, str_nil);
     313             :         else
     314     2000013 :                 memcpy(*u, src, l);
     315     2000013 :         return (ssize_t) l - 1;
     316             : }
     317             : 
                       /* Render the url SRC as text into the buffer *s.
                        *
                        * s        - in/out: destination buffer; it is freed and reallocated
                        *            when it is NULL or too small for the result plus NUL
                        * len      - in: size of the buffer at *s; updated when the buffer is
                        *            replaced
                        * SRC      - the url, read as a NUL-terminated string
                        * external - when true, the value is wrapped in double quotes and a
                        *            value for which strNil() holds is written as the
                        *            unquoted text nil
                        *
                        * Returns the number of characters written, not counting the
                        * terminating NUL (3 for the nil case), or -1 when the allocation
                        * fails.
                        *
                        * NOTE(review): the buffer is sized from strlen(src) + 2 before the
                        * nil check, so writing "nil" relies on the nil string being at
                        * least one character long - TODO confirm. */
     318             : static ssize_t
     319         199 : URLtoString(str *s, size_t *len, const void *SRC, bool external)
     320             : {
     321         199 :         const char *src = SRC;
     322         199 :         size_t l = strlen(src);
     323             : 
                       /* room for the two surrounding quotes */
     324         199 :         if (external)
     325         188 :                 l += 2;
     326         199 :         if (l >= *len || *s == NULL) {
     327          18 :                 GDKfree(*s);
     328          18 :                 *s = GDKmalloc(l + 1);
     329          18 :                 if (*s == NULL)
     330             :                         return -1;
     331          18 :                 *len = l + 1;
     332             :         }
     333             : 
     334         199 :         if (external) {
     335         188 :                 if (strNil(src)) {
     336           0 :                         strcpy(*s, "nil");
     337           0 :                         return 3;
     338             :                 }
     339         188 :                 snprintf(*s, l + 1, "\"%s\"", src);
     340             :         } else {
     341          11 :                 strcpy(*s, src);
     342             :         }
     343         199 :         return (ssize_t) l;
     344             : }
     345             : 
     346             : /* COMMAND "getAnchor": Extract an anchor (reference) from the URL
     347             :  * SIGNATURE: getAnchor(url) : str; */
                       /* *retval receives a GDKstrdup-ed copy of everything that follows
                        * the '#' found directly after the scheme, authority, path and
                        * search parts.  The result is str_nil when *val is nil or when no
                        * '#' is found at that position.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * copy cannot be allocated. */
     348             : static str
     349           6 : URLgetAnchor(str *retval, const url *val)
     350             : {
     351           6 :         const char *s;
     352             : 
     353           6 :         if (val == NULL || *val == NULL)
     354           0 :                 throw(ILLARG, "url.getAnchor", "url missing");
     355             : 
     356           6 :         if (strNil(*val)) {
     357             :                 s = str_nil;
     358             :         } else {
     359           5 :                 if ((s = skip_scheme(*val)) == NULL
     360           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     361           5 :                         || (s = skip_path(s, NULL, NULL)) == NULL
     362           5 :                         || (s = skip_search(s)) == NULL)
     363           0 :                         throw(ILLARG, "url.getAnchor", "bad url");
                               /* s now points just past the search part */
     364           5 :                 if (*s == '#')
     365           2 :                         s++;
     366             :                 else
     367             :                         s = str_nil;
     368             :         }
     369             : 
     370           6 :         if ((*retval = GDKstrdup(s)) == NULL)
     371           0 :                 throw(MAL, "url.getAnchor", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     372             :         return MAL_SUCCEED;
     373             : }
     374             : 
     375             : /* COMMAND "getBasename": Extract the base of the last file name of the URL,
     376             :  *                        thus, excluding the file extension.
     377             :  * SIGNATURE: getBasename(str) : str; */
                       /* *retval receives a newly allocated copy of the last path
                        * component, cut off at the extension dot reported by skip_path()
                        * when there is one.  The result is str_nil when *val is nil or when
                        * the URL has no path.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * result cannot be allocated.
                        *
                        * strcpy_len() is declared elsewhere; it is presumably a bounded
                        * copy that always terminates the destination - TODO confirm. */
     378             : static str
     379           6 : URLgetBasename(str *retval, const url *val)
     380             : {
     381           6 :         const char *s;
     382           6 :         const char *b = NULL;
     383           6 :         const char *e = NULL;
     384             : 
     385           6 :         if (val == NULL || *val == NULL)
     386           0 :                 throw(ILLARG, "url.getBasename", "url missing");
     387             : 
     388           6 :         if (strNil(*val)) {
     389           1 :                 *retval = GDKstrdup(str_nil);
     390             :         } else {
     391           5 :                 if ((s = skip_scheme(*val)) == NULL
     392           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     393           5 :                         || (s = skip_path(s, &b, &e)) == NULL)
     394           0 :                         throw(ILLARG, "url.getBasename", "bad url");
     395           5 :                 if (b == NULL) {
                                       /* no path at all */
     396           1 :                         *retval = GDKstrdup(str_nil);
     397             :                 } else {
     398           4 :                         size_t l;
     399             : 
                                       /* stop at the extension dot if there is one,
                                        * otherwise at the end of the path */
     400           4 :                         if (e != NULL) {
     401           3 :                                 l = e - b;
     402             :                         } else {
     403           1 :                                 l = s - b;
     404             :                         }
     405           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     406           4 :                                 strcpy_len(*retval, b, l + 1);
     407             :                         }
     408             :                 }
     409             :         }
     410             : 
                       /* single check for all allocation paths above */
     411           6 :         if (*retval == NULL)
     412           0 :                 throw(MAL, "url.getBasename", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     413             :         return MAL_SUCCEED;
     414             : }
     415             : 
     416             : /* COMMAND "getContext": Extract the path context from the URL
     417             :  * SIGNATURE: getContext(str) : str; */
                       /* *retval receives a newly allocated copy of the text between the
                        * end of the authority and the end of the path, as delimited by
                        * skip_authority() and skip_path().  The result is str_nil when *val
                        * is nil or when that range is empty.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * result cannot be allocated.
                        *
                        * strcpy_len() is declared elsewhere; it is presumably a bounded
                        * copy that always terminates the destination - TODO confirm. */
     418             : static str
     419           6 : URLgetContext(str *retval, const url *val)
     420             : {
     421           6 :         const char *s;
     422           6 :         const char *p;
     423             : 
     424           6 :         if (val == NULL || *val == NULL)
     425           0 :                 throw(ILLARG, "url.getContext", "url missing");
     426             : 
     427           6 :         if (strNil(*val)) {
     428           1 :                 *retval = GDKstrdup(str_nil);
     429             :         } else {
                               /* p marks the start of the path, s its end */
     430           5 :                 if ((s = skip_scheme(*val)) == NULL
     431           5 :                         || (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     432           5 :                         || (s = skip_path(p, NULL, NULL)) == NULL)
     433           0 :                         throw(ILLARG, "url.getContext", "bad url");
     434           5 :                 if (p == s) {
     435           1 :                         *retval = GDKstrdup(str_nil);
     436           4 :                 } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) {
     437           4 :                         strcpy_len(*retval, p, s - p + 1);
     438             :                 }
     439             :         }
     440             : 
                       /* single check for all allocation paths above */
     441           6 :         if (*retval == NULL)
     442           0 :                 throw(MAL, "url.getContext", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     443             :         return MAL_SUCCEED;
     444             : }
     445             : 
     446             : /* COMMAND "getExtension": Extract the file extension of the URL
     447             :  * SIGNATURE: getExtension(str) : str; */
                       /* *retval receives a newly allocated copy of the text between the
                        * extension dot reported by skip_path() and the end of the path; the
                        * dot itself is not part of the result.  The result is str_nil when
                        * *val is nil or when skip_path() reports no extension.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * result cannot be allocated.
                        *
                        * strcpy_len() is declared elsewhere; it is presumably a bounded
                        * copy that always terminates the destination - TODO confirm. */
     448             : static str
     449           6 : URLgetExtension(str *retval, const url *val)
     450             : {
     451           6 :         const char *s;
     452           6 :         const char *e = NULL;
     453             : 
     454           6 :         if (val == NULL || *val == NULL)
     455           0 :                 throw(ILLARG, "url.getExtension", "url missing");
     456             : 
     457           6 :         if (strNil(*val)) {
     458           1 :                 *retval = GDKstrdup(str_nil);
     459             :         } else {
     460           5 :                 if ((s = skip_scheme(*val)) == NULL
     461           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     462           5 :                         || (s = skip_path(s, NULL, &e)) == NULL)
     463           0 :                         throw(ILLARG, "url.getExtension", "bad url");
     464           5 :                 if (e == NULL) {
     465           2 :                         *retval = GDKstrdup(str_nil);
     466             :                 } else {
                                       /* l counts the dot plus the extension, which is
                                        * exactly the room needed for the extension plus
                                        * its terminator */
     467           3 :                         size_t l = s - e;
     468             : 
     469           3 :                         assert(*e == '.');
     470           3 :                         if ((*retval = GDKmalloc(l)) != NULL) {
     471           3 :                                 strcpy_len(*retval, e + 1, l);
     472             :                         }
     473             :                 }
     474             :         }
     475             : 
                       /* single check for all allocation paths above */
     476           6 :         if (*retval == NULL)
     477           0 :                 throw(MAL, "url.getExtension", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     478             :         return MAL_SUCCEED;
     479             : }
     480             : 
     481             : /* COMMAND "getFile": Extract the last file name of the URL
     482             :  * SIGNATURE: getFile(str) : str; */
                       /* *retval receives a newly allocated copy of the last path
                        * component, from the character after the last '/' up to the end of
                        * the path, extension included.  The result is str_nil when *val is
                        * nil or when the URL has no path.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * result cannot be allocated.
                        *
                        * strcpy_len() is declared elsewhere; it is presumably a bounded
                        * copy that always terminates the destination - TODO confirm. */
     483             : static str
     484           6 : URLgetFile(str *retval, const url *val)
     485             : {
     486           6 :         const char *s;
     487           6 :         const char *b = NULL;
     488             : 
     489           6 :         if (val == NULL || *val == NULL)
     490           0 :                 throw(ILLARG, "url.getFile", "url missing");
     491             : 
     492           6 :         if (strNil(*val)) {
     493           1 :                 *retval = GDKstrdup(str_nil);
     494             :         } else {
     495           5 :                 if ((s = skip_scheme(*val)) == NULL
     496           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     497           5 :                         || (s = skip_path(s, &b, NULL)) == NULL)
     498           0 :                         throw(ILLARG, "url.getFile", "bad url");
     499           5 :                 if (b == NULL) {
                                       /* no path at all */
     500           1 :                         *retval = GDKstrdup(str_nil);
     501             :                 } else {
     502           4 :                         size_t l;
     503             : 
     504           4 :                         l = s - b;
     505           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     506           4 :                                 strcpy_len(*retval, b, l + 1);
     507             :                         }
     508             :                 }
     509             :         }
     510             : 
                       /* single check for all allocation paths above */
     511           6 :         if (*retval == NULL)
     512           0 :                 throw(MAL, "url.getFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     513             :         return MAL_SUCCEED;
     514             : }
     515             : 
     516             : /* COMMAND "getHost": Extract the server identity from the URL */
     517             : /* SIGNATURE: getHost(str) : str; */
                       /* *retval receives a newly allocated copy of the host part of the
                        * authority: it starts at the host pointer reported by
                        * skip_authority() and ends before the ':' that introduces the port,
                        * or at the end of the authority when no port was reported.  The
                        * result is str_nil when *val is nil.
                        *
                        * Returns MAL_SUCCEED on success.  Errors go through throw():
                        * ILLARG "url missing" for a NULL argument, ILLARG "bad url" when
                        * the scheme or the authority cannot be parsed, and MAL when the
                        * result cannot be allocated.
                        *
                        * strcpy_len() is declared elsewhere; it is presumably a bounded
                        * copy that always terminates the destination - TODO confirm. */
     518             : static str
     519           9 : URLgetHost(str *retval, const url *val)
     520             : {
     521           9 :         const char *s;
     522           9 :         const char *h = NULL;
     523           9 :         const char *p = NULL;
     524             : 
     525           9 :         if (val == NULL || *val == NULL)
     526           0 :                 throw(ILLARG, "url.getHost", "url missing");
     527             : 
     528           9 :         if (strNil(*val)) {
     529           1 :                 *retval = GDKstrdup(str_nil);
     530             :         } else {
     531           8 :                 if ((s = skip_scheme(*val)) == NULL
     532           8 :                         || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
     533           0 :                         throw(ILLARG, "url.getHost", "bad url");
                               /* skip_authority() stores a non-NULL host whenever it
                                * succeeds, so this branch is purely defensive */
     534           8 :                 if (h == NULL) {
     535           0 :                         *retval = GDKstrdup(str_nil);
     536             :                 } else {
     537           8 :                         size_t l;
     538             : 
     539           8 :                         if (p != NULL) {
                                               /* p points just past the ':'; leave the
                                                * colon out of the host */
     540           3 :                                 l = p - h - 1;
     541             :                         } else {
     542           5 :                                 l = s - h;
     543             :                         }
     544           8 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     545           8 :                                 strcpy_len(*retval, h, l + 1);
     546             :                         }
     547             :                 }
     548             :         }
     549             : 
                       /* single check for all allocation paths above */
     550           9 :         if (*retval == NULL)
     551           0 :                 throw(MAL, "url.getHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     552             :         return MAL_SUCCEED;
     553             : }
     554             : 
     555             : /* COMMAND "getDomain": Extract the Internet domain from the URL
     556             :  * SIGNATURE: getDomain(str) : str; */
     557             : static str
     558           6 : URLgetDomain(str *retval, const url *val)
     559             : {
     560           6 :         const char *s;
     561           6 :         const char *h = NULL;
     562           6 :         const char *p = NULL;
     563             : 
     564           6 :         if (val == NULL || *val == NULL)
     565           0 :                 throw(ILLARG, "url.getDomain", "url missing");
     566             : 
     567           6 :         if (strNil(*val)) {
     568           1 :                 *retval = GDKstrdup(str_nil);
     569             :         } else {
     570           5 :                 if ((s = skip_scheme(*val)) == NULL
     571           5 :                         || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
     572           0 :                         throw(ILLARG, "url.getDomain", "bad url");
     573           5 :                 if (h == NULL) {
     574           0 :                         *retval = GDKstrdup(str_nil);
     575             :                 } else {
     576           5 :                         size_t l;
     577             : 
     578           5 :                         if (p != NULL)
     579           3 :                                 p--;
     580             :                         else
     581           2 :                                 p = s;
     582             :                         l = 0;
     583          19 :                         while (p > h && p[-1] != '.') {
     584          14 :                                 p--;
     585          14 :                                 l++;
     586             :                         }
     587           5 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     588           5 :                                 strcpy_len(*retval, p, l + 1);
     589             :                         }
     590             :                 }
     591             :         }
     592             : 
     593           6 :         if (*retval == NULL)
     594           0 :                 throw(MAL, "url.getDomain", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     595             :         return MAL_SUCCEED;
     596             : }
     597             : 
     598             : /* COMMAND "getPort": Extract the port id from the URL
     599             :  * SIGNATURE: getPort(str) : str; */
     600             : static str
     601           6 : URLgetPort(str *retval, const url *val)
     602             : {
     603           6 :         const char *s;
     604           6 :         const char *p = NULL;
     605             : 
     606           6 :         if (val == NULL || *val == NULL)
     607           0 :                 throw(ILLARG, "url.getPort", "url missing");
     608             : 
     609           6 :         if (strNil(*val)) {
     610           1 :                 *retval = GDKstrdup(str_nil);
     611             :         } else {
     612           5 :                 if ((s = skip_scheme(*val)) == NULL
     613           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL)
     614           0 :                         throw(ILLARG, "url.getPort", "bad url");
     615           5 :                 if (p == NULL) {
     616           2 :                         *retval = GDKstrdup(str_nil);
     617             :                 } else {
     618           3 :                         size_t l = s - p;
     619             : 
     620           3 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     621           3 :                                 strcpy_len(*retval, p, l + 1);
     622             :                         }
     623             :                 }
     624             :         }
     625             : 
     626           6 :         if (*retval == NULL)
     627           0 :                 throw(MAL, "url.getPort", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     628             :         return MAL_SUCCEED;
     629             : }
     630             : 
     631             : /* COMMAND "getProtocol": Extract the protocol from the URL
     632             :  * SIGNATURE: getProtocol(str) : str; */
     633             : static str
     634           3 : URLgetProtocol(str *retval, const url *val)
     635             : {
     636           3 :         const char *s;
     637             : 
     638           3 :         if (val == NULL || *val == NULL)
     639           0 :                 throw(ILLARG, "url.getProtocol", "url missing");
     640             : 
     641           3 :         if (strNil(*val)) {
     642           1 :                 *retval = GDKstrdup(str_nil);
     643             :         } else {
     644           2 :                 if ((s = skip_scheme(*val)) == NULL)
     645           0 :                         throw(ILLARG, "url.getProtocol", "bad url");
     646           2 :                 size_t l = s - *val;
     647             : 
     648           2 :                 if ((*retval = GDKmalloc(l)) != NULL) {
     649           2 :                         strcpy_len(*retval, *val, l);
     650             :                 }
     651             :         }
     652             : 
     653           3 :         if (*retval == NULL)
     654           0 :                 throw(MAL, "url.getProtocol", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     655             :         return MAL_SUCCEED;
     656             : }
     657             : 
     658             : /* COMMAND "getQuery": Extract the query part from the URL
     659             :  * SIGNATURE: getQuery(str) : str; */
     660             : static str
     661           6 : URLgetQuery(str *retval, const url *val)
     662             : {
     663           6 :         const char *s;
     664           6 :         const char *q;
     665             : 
     666           6 :         if (val == NULL || *val == NULL)
     667           0 :                 throw(ILLARG, "url.getQuery", "url missing");
     668             : 
     669           6 :         if (strNil(*val)) {
     670           1 :                 *retval = GDKstrdup(str_nil);
     671             :         } else {
     672           5 :                 if ((s = skip_scheme(*val)) == NULL
     673           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     674           5 :                         || (q = skip_path(s, NULL, NULL)) == NULL
     675           5 :                         || (s = skip_search(q)) == NULL)
     676           0 :                         throw(ILLARG, "url.getQuery", "bad url");
     677           5 :                 if (*q == '?') {
     678           3 :                         size_t l;
     679             : 
     680           3 :                         q++;
     681           3 :                         l = s - q;
     682           3 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     683           3 :                                 strcpy_len(*retval, q, l + 1);
     684             :                         }
     685             :                 } else {
     686           2 :                         *retval = GDKstrdup(str_nil);
     687             :                 }
     688             :         }
     689             : 
     690           6 :         if (*retval == NULL)
     691           0 :                 throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     692             :         return MAL_SUCCEED;
     693             : }
     694             : 
     695             : /* COMMAND "getRobotURL": Extract the location of the robot control file
     696             :  * SIGNATURE: getRobotURL(str) : str; */
     697             : static str
     698           6 : URLgetRobotURL(str *retval, const url *val)
     699             : {
     700           6 :         const char *s;
     701           6 :         size_t l;
     702             : 
     703           6 :         if (val == NULL || *val == NULL)
     704           0 :                 throw(ILLARG, "url.getQuery", "url missing");
     705             : 
     706           6 :         if (strNil(*val)) {
     707           1 :                 *retval = GDKstrdup(str_nil);
     708             :         } else {
     709           5 :                 if ((s = skip_scheme(*val)) == NULL
     710           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL)
     711           0 :                         throw(ILLARG, "url.getQuery", "bad url");
     712           5 :                 l = s - *val;
     713             : 
     714           5 :                 if ((*retval = GDKmalloc(l + sizeof("/robots.txt"))) != NULL) {
     715           5 :                         sprintf(*retval, "%.*s/robots.txt", (int) l, *val);
     716             :                 }
     717             :         }
     718             : 
     719           6 :         if (*retval == NULL)
     720           0 :                 throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     721             :         return MAL_SUCCEED;
     722             : }
     723             : 
     724             : /* COMMAND "getUser": Extract the user identity from the URL
     725             :  * SIGNATURE: getUser(str) : str; */
     726             : static str
     727           9 : URLgetUser(str *retval, const url *val)
     728             : {
     729           9 :         const char *s, *h, *u, *p;
     730             : 
     731           9 :         if (val == NULL || *val == NULL)
     732           0 :                 throw(ILLARG, "url.getUser", "url missing");
     733             : 
     734           9 :         if (strNil(*val)) {
     735           1 :                 *retval = GDKstrdup(str_nil);
     736             :         } else {
     737           8 :                 if ((s = skip_scheme(*val)) == NULL
     738           8 :                         || (s = skip_authority(s, &u, &p, &h, NULL)) == NULL)
     739           0 :                         throw(ILLARG, "url.getHost", "bad url");
     740           8 :                 if (u == NULL || h == NULL) {
     741           4 :                         *retval = GDKstrdup(str_nil);
     742             :                 } else {
     743           4 :                         size_t l;
     744             : 
     745           4 :                         if (p) {
     746           1 :                                 l = p - u - 1;
     747             :                         } else {
     748           3 :                                 l = h - u - 1;
     749             :                         }
     750           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     751           4 :                                 strcpy_len(*retval, u, l + 1);
     752             :                         }
     753             :                 }
     754             :         }
     755             : 
     756           9 :         if (*retval == NULL)
     757           0 :                 throw(MAL, "url.getUser", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     758             :         return MAL_SUCCEED;
     759             : }
     760             : 
     761             : /* COMMAND "isaURL": Check conformity of the URL syntax
     762             :  * SIGNATURE: isaURL(str) : bit; */
     763             : static str
     764           7 : URLisaURL(bit *retval, const char *const *val)
     765             : {
     766           7 :         if (val == NULL || *val == NULL)
     767           0 :                 throw(ILLARG, "url.isaURL", "url missing");
     768           7 :         if (strNil(*val))
     769           0 :                 *retval = bit_nil;
     770             :         else
     771           7 :                 *retval = skip_scheme(*val) != NULL;
     772             :         return MAL_SUCCEED;
     773             : }
     774             : 
     775             : static str
     776          52 : URLnew(url *u, const char *const *val)
     777             : {
     778          52 :         *u = GDKstrdup(*val);
     779          52 :         if (*u == NULL)
     780           0 :                 throw(MAL, "url.new", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     781             :         return MAL_SUCCEED;
     782             : }
     783             : 
     784             : static str
     785           9 : URLnew3(url *u, const char *const *protocol, const char *const *server, const char *const *file)
     786             : {
     787           9 :         const char *Protocol = *protocol;
     788           9 :         const char *Server = *server;
     789           9 :         const char *File = *file;
     790           9 :         size_t l;
     791             : 
     792           9 :         if (strNil(File))
     793             :                 File = "";
     794           2 :         else if (*File == '/')
     795           0 :                 File++;
     796           9 :         if (strNil(Server))
     797             :                 Server = "";
     798           9 :         if (strNil(Protocol))
     799             :                 Protocol = "";
     800           9 :         l = strlen(File) + strlen(Server) + strlen(Protocol) + 10;
     801           9 :         *u = GDKmalloc(l);
     802           9 :         if (*u == NULL)
     803           0 :                 throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     804           9 :         snprintf(*u, l, "%s://%s/%s", Protocol, Server, File);
     805           9 :         return MAL_SUCCEED;
     806             : }
     807             : 
     808             : static str
     809           3 : URLnew4(url *u, const char *const *protocol, const char *const *server, const int *port, const char *const *file)
     810             : {
     811           3 :         const char *Protocol = *protocol;
     812           3 :         const char *Server = *server;
     813           3 :         int Port = *port;
     814           3 :         const char *File = *file;
     815           3 :         size_t l;
     816             : 
     817           3 :         if (strNil(File))
     818             :                 File = "";
     819           2 :         else if (*File == '/')
     820           0 :                 File++;
     821           3 :         if (strNil(Server))
     822             :                 Server = "";
     823           3 :         if (is_int_nil(Port))
     824           1 :                 Port = 0;
     825           3 :         if (strNil(Protocol))
     826             :                 Protocol = "";
     827           3 :         l = strlen(File) + strlen(Server) + strlen(Protocol) + 20;
     828           3 :         *u = GDKmalloc(l);
     829           3 :         if (*u == NULL)
     830           0 :                 throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     831           3 :         snprintf(*u, l, "%s://%s:%d/%s", Protocol, Server, Port, File);
     832           3 :         return MAL_SUCCEED;
     833             : }
     834             : 
     835             : static str
     836           0 : URLnoop(url *u, const url *val)
     837             : {
     838           0 :         *u = GDKstrdup(*val);
     839           0 :         if (*u == NULL)
     840           0 :                 throw(MAL, "url.noop", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     841             :         return MAL_SUCCEED;
     842             : }
     843             : 
     844             : 
     845             : /* Extract host identity from URL. This is a relaxed version,
     846             :  * where no exception is thrown when the input URL is not valid,
     847             :  * and a nil string (str_nil) is returned instead.
     848             :  * */
     849             : static str
     850           1 : extractURLHost(str *retval, const char *const *url, const bit *no_www)
     851             : {
     852           1 :         const char *s;
     853           1 :         const char *h = NULL;
     854           1 :         const char *p = NULL;
     855             : 
     856           2 :         if (url != NULL && *url != NULL && !strNil(*url)) {
     857           1 :                 if ((s = skip_scheme(*url)) != NULL
     858           0 :                         && (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL
     859           0 :                         && h != NULL) {
     860             :                         ssize_t l;
     861             :                         const char *pos = s;
     862           0 :                         const char *domain = NULL;
     863           0 :                         while (pos > h) {
     864           0 :                                 if (*pos == '.') {
     865             :                                         domain = pos;
     866             :                                         break;
     867             :                                 }
     868           0 :                                 pos--;
     869             :                         }
     870             : 
     871           0 :                         if (p != NULL) {
     872           0 :                                 l = p - h - 1;
     873             :                         } else {
     874           0 :                                 l = s - h;
     875             :                         }
     876           0 :                         if (*no_www && !strncmp(h, "www.", 4)) {
     877           0 :                                 h += 4;
     878           0 :                                 l -= 4;
     879             :                         }
     880           0 :                         if (domain && l > 3) {
     881           0 :                                 if ((*retval = GDKmalloc(l + 1)) != NULL)
     882           0 :                                         strcpy_len(*retval, h, l + 1);
     883             :                         } else {
     884           0 :                                 *retval = GDKstrdup(str_nil);
     885             :                         }
     886             :                 } else {
     887           1 :                         *retval = GDKstrdup(str_nil);
     888             :                 }
     889             :         } else {
     890           0 :                 *retval = GDKstrdup(str_nil);
     891             :         }
     892           1 :         if (!*retval)
     893           0 :                 throw(MAL, "url.getURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     894             : 
     895             :         return MAL_SUCCEED;
     896             : }
     897             : 
     898             : 
     899             : static inline str
     900           2 : str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l)
     901             : {
     902           2 :         CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "url.str_buf_copy");
     903           2 :         strcpy_len(*buf, s, l);
     904           2 :         return MAL_SUCCEED;
     905             : }
     906             : 
     907             : 
     908             : // bulk version
     909             : static str
     910           2 : BATextractURLHost(bat *res, const bat *bid, const bit *no_www)
     911             : {
     912           2 :         const char *s;
     913           2 :         const char *host = NULL;
     914           2 :         const char *port = NULL;
     915           2 :         BAT *bn = NULL, *b = NULL;
     916           2 :         BUN p, q;
     917           2 :         size_t buflen = INITIAL_STR_BUFFER_LENGTH;
     918           2 :         str buf = GDKmalloc(buflen);
     919           2 :         str msg = MAL_SUCCEED;
     920           2 :         bool nils = false;
     921             : 
     922           2 :         if (buf == NULL)
     923           0 :                 throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     924             : 
     925           2 :         if (!(b = BATdescriptor(*bid))) {
     926           0 :                 GDKfree(buf);
     927           0 :                 throw(MAL, "baturl.extractURLHost",
     928             :                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     929             :         }
     930           2 :         if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) {
     931           0 :                 GDKfree(buf);
     932           0 :                 BBPunfix(b->batCacheid);
     933           0 :                 throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     934             :         }
     935             : 
     936           2 :         BATiter bi = bat_iterator(b);
     937           4 :         BATloop(b, p, q) {
     938           2 :                 const char *url = (const char *) BUNtvar(bi, p);
     939           2 :                 if (strNil(url)) {
     940           0 :                         if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
     941           0 :                                 msg = createException(MAL, "baturl.extractURLHost",
     942             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     943           0 :                                 break;
     944             :                         }
     945             :                         nils = true;
     946             :                 } else {
     947           2 :                         if ((s = skip_scheme(url)) != NULL
     948           2 :                                 && (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL
     949           2 :                                 && host != NULL) {
     950             :                                 ssize_t l;
     951             :                                 const char *pos = s;
     952          18 :                                 const char *domain = NULL;
     953          18 :                                 while (pos > host) {
     954          18 :                                         if (*pos == '.') {
     955             :                                                 domain = pos;
     956             :                                                 break;
     957             :                                         }
     958          16 :                                         pos--;
     959             :                                 }
     960             : 
     961           2 :                                 if (port != NULL) {
     962           2 :                                         l = port - host - 1;
     963             :                                 } else {
     964           0 :                                         l = s - host;
     965             :                                 }
     966           2 :                                 if (domain && l > 3) {
     967           2 :                                         if (*no_www && !strncmp(host, "www.", 4)) {
     968           1 :                                                 host += 4;
     969           1 :                                                 l -= 4;
     970             :                                         }
     971           2 :                                         if (l > 0) {
     972             :                                                 // if ((msg = str_Sub_String(&buf, &buflen, host, 0, l)) != MAL_SUCCEED)
     973             :                                                 //  break;
     974           2 :                                                 if ((msg = str_buf_copy(&buf, &buflen, host,
     975           2 :                                                                                                 (size_t) (l + 1))) != MAL_SUCCEED)
     976             :                                                         break;
     977           2 :                                                 if (bunfastapp_nocheckVAR(bn, buf) != GDK_SUCCEED) {
     978           0 :                                                         msg = createException(MAL, "baturl.extractURLHost",
     979             :                                                                                                   SQLSTATE(HY013)
     980             :                                                                                                   MAL_MALLOC_FAIL);
     981           0 :                                                         break;
     982             :                                                 }
     983           2 :                                                 continue;
     984             :                                         }
     985             :                                 }
     986             :                         }
     987             :                         // fall back: insert a nil string if there is no valid host
     988           0 :                         if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
     989           0 :                                 msg = createException(MAL, "baturl.extractURLHost",
     990             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     991           0 :                                 break;
     992             :                         }
     993             :                         nils = true;
     994             :                 }
     995             :         }
     996           2 :         bat_iterator_end(&bi);
     997             : 
     998           2 :         GDKfree(buf);
     999           2 :         if (msg == MAL_SUCCEED) {
    1000           2 :                 BATsetcount(bn, q);
    1001           2 :                 bn->tnil = nils;
    1002           2 :                 bn->tnonil = !nils;
    1003           2 :                 bn->tkey = BATcount(bn) <= 1;
    1004           2 :                 bn->tsorted = BATcount(bn) <= 1;
    1005           2 :                 bn->trevsorted = BATcount(bn) <= 1;
    1006           2 :                 *res = bn->batCacheid;
    1007           2 :                 BBPkeepref(bn);
    1008             :         }
    1009           2 :         BBPunfix(b->batCacheid);
    1010           2 :         return msg;
    1011             : }
    1012             : 
    1013             : 
    1014             : #include "mel.h"
    1015             : mel_atom url_init_atoms[] = {
    1016             :  { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, },  { .cmp=NULL }
    1017             : };
    1018             : mel_func url_init_funcs[] = {
    1019             :  command("url", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
    1020             :  command("url", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
    1021             :  command("calc", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
    1022             :  command("calc", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
    1023             :  command("url", "getAnchor", URLgetAnchor, false, "Extract the URL anchor (reference)", args(1,2, arg("",str),arg("u",url))),
    1024             :  command("url", "getBasename", URLgetBasename, false, "Extract the URL base file name", args(1,2, arg("",str),arg("u",url))),
    1025             :  command("url", "getContext", URLgetContext, false, "Get the path context of a URL", args(1,2, arg("",str),arg("u",url))),
    1026             :  command("url", "getDomain", URLgetDomain, false, "Extract Internet domain from the URL", args(1,2, arg("",str),arg("u",url))),
    1027             :  command("url", "getExtension", URLgetExtension, false, "Extract the file extension of the URL", args(1,2, arg("",str),arg("u",url))),
    1028             :  command("url", "getFile", URLgetFile, false, "Extract the last file name of the URL", args(1,2, arg("",str),arg("u",url))),
    1029             :  command("url", "getHost", URLgetHost, false, "Extract the server name from the URL strict version", args(1,2, arg("",str),arg("u",url))),
    1030             :  command("url", "getPort", URLgetPort, false, "Extract the port id from the URL", args(1,2, arg("",str),arg("u",url))),
    1031             :  command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol from the URL", args(1,2, arg("",str),arg("u",url))),
    1032             :  command("url", "getQuery", URLgetQuery, false, "Extract the query string from the URL", args(1,2, arg("",str),arg("u",url))),
    1033             :  command("url", "getUser", URLgetUser, false, "Extract the user identity from the URL", args(1,2, arg("",str),arg("u",url))),
    1034             :  command("url", "getRobotURL", URLgetRobotURL, false, "Extract the location of the robot control file", args(1,2, arg("",str),arg("u",url))),
    1035             :  command("url", "isaURL", URLisaURL, false, "Check conformity of the URL syntax", args(1,2, arg("",bit),arg("u",str))),
    1036             :  command("url", "new", URLnew4, false, "Construct URL from protocol, host, port, and file", args(1,5, arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))),
    1037             :  command("url", "new", URLnew3, false, "Construct URL from protocol, host,and file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))),
    1038             :  command("url", "extractURLHost", extractURLHost, false, "Extract host from a URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))),
    1039             :  command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", bit))),
    1040             :  { .imp=NULL }
    1041             : };
    1042             : #include "mal_import.h"
    1043             : #ifdef _MSC_VER
    1044             : #undef read
    1045             : #pragma section(".CRT$XCU",read)
    1046             : #endif
    1047         308 : LIB_STARTUP_FUNC(init_url_mal)
    1048         308 : { mal_module("url", url_init_atoms, url_init_funcs); }

Generated by: LCOV version 1.14