LCOV - code coverage report
Current view: top level - monetdb5/modules/atoms - url.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 371 458 81.0 %
Date: 2024-04-26 00:35:57 Functions: 26 27 96.3 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  *  M. Kersten
      15             :  *  Y. Zhang
      16             :  * The URL module
      17             :  * The URL module contains a collection of commands to manipulate
      18             :  * Uniform Resource Locators - a resource on the World Wide Web -
      19             :  * represented as a string in Monet. The URL can represent
      20             :  * anything from a file, a directory or a complete movie.
      21             :  * This module is geared towards manipulation of their name only.
      22             :  * A complementary module can be used to gain access.[IOgate]
      23             :  *
      24             :  * The URL syntax is specified in RFC2396, Uniform Resource Identifiers
      25             :  * (URI): Generic Syntax. The URL syntax is dependent upon the scheme.
      26             :  * In general, a URL has the form <scheme>:<scheme-specific-part>.
      27             :  * Thus, accepting a valid URL is a simple process, unless the scheme
      28             :  * is known and scheme-specific syntax is checked (e.g., http or ftp
      29             :  * scheme). For the URL module implemented here, we assume some common
      30             :  * fields of the <scheme-specific-part> that are shared among different
      31             :  * schemes.
      32             :  *
      33             :  * The core of the extension involves several operators to extract
      34             :  * portions of the URLs for further manipulation. In particular,
      35             :  * the domain, the server, and the protocol, and the file extension
      36             :  * can be extracted without copying the complete URL from the heap
      37             :  * into a string variable first.
      38             :  *
      39             :  * The commands provided are based on the corresponding Java class.
      40             :  *
      41             :  * A future version should use a special atom, because this may save
      42             :  * considerable space. Alternatively, break the URL strings into
      43             :  * components and represent them with a bunch of BATs. An intermediate
      44             :  * step would be to refine the atom STR, then it would be possible to
      45             :  * redefine hashing.
      46             :  */
      47             : 
      48             : #include "monetdb_config.h"
      49             : #include "mal.h"
      50             : #include "gdk.h"
      51             : #include <ctype.h>
      52             : #include "mal_exception.h"
      53             : #include "str.h"
      54             : 
      55             : typedef str url;
      56             : 
      57             : /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ]
      58             :  * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */
      59             : 
      60             : /* return pointer to string after the scheme and colon; input: pointer
      61             :  * to start of URI */
      62             : static const char *
      63          70 : skip_scheme(const char *uri)
      64             : {
      65          70 :         if (('a' <= *uri && *uri <= 'z') || ('A' <= *uri && *uri <= 'Z')) {
      66          70 :                 uri++;
      67          70 :                 while (('a' <= *uri && *uri <= 'z') || ('A' <= *uri && *uri <= 'Z')
      68          70 :                            || isdigit((unsigned char) *uri) || *uri == '+' || *uri == '-'
      69         377 :                            || *uri == '.')
      70         237 :                         uri++;
      71          70 :                 if (*uri == ':')
      72          69 :                         return uri + 1;
      73             :         }
      74             :         return NULL;
      75             : }
      76             : 
      77             : #define ishex(c)                isxdigit((unsigned char) (c))
      78             : #define isreserved(c)   ((c) == ';' || (c) == '/' || (c) == '?' || \
      79             :                                                  (c) == ':' || (c) == '@' || (c) == '&' || \
      80             :                                                  (c) == '=' || (c) == '+' || (c) == '$' || \
      81             :                                                  (c) == ',')
      82             : #define isunreserved(c) (('a' <= (c) && (c) <= 'z') || \
      83             :                                                  ('A' <= (c) && (c) <= 'Z') || \
      84             :                                                  isdigit((unsigned char) (c)) || \
      85             :                                                  (c) == '-' || (c) == '_' || (c) == '.' || \
      86             :                                                  (c) == '!' || (c) == '~' || (c) == '*' || \
      87             :                                                  (c) == '\'' || (c) == '(' || (c) == ')')
      88             : 
      89             : /* return pointer to string after the authority, filling in pointers
      90             :  * to start of user, password, host, and port, if provided; input:
      91             :  * result of skip_scheme() */
      92             : static const char *
      93          60 : skip_authority(const char *uri, const char **userp, const char **passp,
      94             :                            const char **hostp, const char **portp)
      95             : {
      96          60 :         const char *user = NULL, *pass = NULL, *host = NULL, *port = NULL;
      97             : 
      98          60 :         if (uri[0] == '/' && uri[1] == '/') {
      99          60 :                 uri += 2;
     100          60 :                 user = host = uri;
     101         418 :                 while (isunreserved(*uri)
     102           0 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) || *uri == ';'
     103             :                            || *uri == ':' || *uri == '=' || *uri == '+' || *uri == '$'
     104        1127 :                            || *uri == ',' || *uri == '@') {
     105        1067 :                         if (*uri == ':') {
     106          36 :                                 if (user == host)
     107          12 :                                         port = pass = uri + 1;
     108             :                                 else
     109          24 :                                         port = uri + 1;
     110        1031 :                         } else if (*uri == '@')
     111          26 :                                 host = uri + 1;
     112        2134 :                         uri += *uri == '%' ? 3 : 1;
     113             :                 }
     114          60 :                 if (user == host) {
     115             :                         /* no "@", so no user info */
     116          34 :                         if (userp)
     117           4 :                                 *userp = NULL;
     118          34 :                         if (passp)
     119           4 :                                 *passp = NULL;
     120             :                 } else {
     121          26 :                         if (userp)
     122           4 :                                 *userp = user;
     123          26 :                         if (passp)
     124           4 :                                 *passp = pass;
     125             :                 }
     126          60 :                 if (portp)
     127          17 :                         *portp = port;
     128          60 :                 if (hostp)
     129          20 :                         *hostp = host;
     130          60 :                 return uri;
     131             :         }
     132             :         return NULL;
     133             : }
     134             : 
     135             : /* return pointer to string after the path, filling in pointer to
     136             :  * start of last component and extension of that component; input:
     137             :  * result of skip_authority() */
     138             : static const char *
     139          30 : skip_path(const char *uri, const char **basep, const char **extp)
     140             : {
     141          30 :         const char *base = NULL, *ext = NULL;
     142             : 
     143          30 :         if (*uri == '/') {
     144          24 :                 uri++;
     145          24 :                 base = uri;
     146         132 :                 while (isunreserved(*uri)
     147           0 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) || *uri == ':'
     148             :                            || *uri == '@' || *uri == '&' || *uri == '=' || *uri == '+'
     149         582 :                            || *uri == '$' || *uri == ',' || *uri == ';' || *uri == '/') {
     150         558 :                         if (*uri == '/') {
     151          36 :                                 base = uri + 1;
     152          36 :                                 ext = NULL;
     153         522 :                         } else if (*uri == '.' && ext == NULL && uri != base) {
     154         558 :                                 ext = uri;
     155             :                         }
     156        1116 :                         uri += *uri == '%' ? 3 : 1;
     157             :                 }
     158             :         }
     159          30 :         if (basep)
     160          10 :                 *basep = base;
     161          30 :         if (extp)
     162          10 :                 *extp = ext;
     163          30 :         return uri;
     164             : }
     165             : 
     166             : /* return pointer to string after the search string; input: result of
     167             :  * skip_path() */
     168             : static const char *
     169          10 : skip_search(const char *uri)
     170             : {
     171          10 :         if (*uri == '?') {
     172           6 :                 uri++;
     173          68 :                 while (isreserved(*uri) || isunreserved(*uri)
     174          76 :                            || (*uri == '%' && ishex(uri[1]) && ishex(uri[2]))) {
     175         140 :                         uri += *uri == '%' ? 3 : 1;
     176             :                 }
     177             :         }
     178          10 :         return uri;
     179             : }
     180             : 
     181             : #if 0
     182             : /*
     183             :  * Utilities
     184             :  */
     185             : 
     186             : static char
     187             : x2c(char *what)
     188             : {
     189             :         char digit;
     190             : 
     191             :         digit = (what[0] >= 'A' ? ((what[0] & 0xdf) - 'A') + 10 : (what[0] - '0'));
     192             :         digit *= 16;
     193             :         digit += (what[1] >= 'A' ? ((what[1] & 0xdf) - 'A') + 10 : (what[1] - '0'));
     194             :         return (digit);
     195             : }
     196             : 
     197             : static int
     198             : needEscape(char c)
     199             : {
     200             :         if (isalnum((unsigned char) c))
     201             :                 return 0;
     202             :         if (c == '#' || c == '-' || c == '_' || c == '.' || c == '!' || c == '~'
     203             :                 || c == '*' || c == '\'' || c == '(' || c == ')')
     204             :                 return 0;
     205             :         return 1;
     206             : }
     207             : 
     208             : /* COMMAND "escape": this function applies the URI escaping rules defined in
     209             :  * section 2 of [RFC 3986] to the string supplied as 's'.
     210             :  * The effect of the function is to escape a set of identified characters in
     211             :  * the string. Each such character is replaced in the string by an escape
     212             :  * sequence, which is formed by encoding the character as a sequence of octets
     213             :  * in UTF-8, and then representing each of these octets in the form %HH.
     214             :  *
     215             :  * All characters are escaped other than:
     216             :  * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")"
     217             :  *
     218             :  * This function must always generate hexadecimal values using the upper-case
     219             :  * letters A-F.
     220             :  *
     221             :  * SIGNATURE: escape(str) : str; */
     222             : static str
     223             : escape_str(str *retval, str s)
     224             : {
     225             :         int x, y;
     226             :         str res;
     227             : 
     228             :         if (!s)
     229             :                 throw(ILLARG, "url.escape", "url missing");
     230             : 
     231             :         if (!(res = (str) GDKmalloc(strlen(s) * 3)))
     232             :                 throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     233             :         for (x = 0, y = 0; s[x]; ++x, ++y) {
     234             :                 if (needEscape(s[x])) {
     235             :                         if (s[x] == ' ') {
     236             :                                 res[y] = '+';
     237             :                         } else {
     238             :                                 sprintf(res + y, "%%%2x", (uint8_t) s[x]);
     239             :                                 y += 2;
     240             :                         }
     241             :                 } else {
     242             :                         res[y] = s[x];
     243             :                 }
     244             :         }
     245             :         res[y] = '\0';
     246             : 
     247             :         if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
     248             :                 GDKfree(res);
     249             :                 throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     250             :         }
     251             :         return MAL_SUCCEED;
     252             : }
     253             : 
     254             : /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters.
     255             :  *                     All sequences of the form "% HEX HEX" are unescaped.
     256             :  * SIGNATURE: unescape(str) : str; */
     257             : static str
     258             : unescape_str(str *retval, str s)
     259             : {
     260             :         int x, y;
     261             :         str res;
     262             : 
     263             :         if (!s)
     264             :                 throw(ILLARG, "url.escape", "url missing");
     265             : 
     266             :         res = (str) GDKmalloc(strlen(s));
     267             :         if (!res)
     268             :                 throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     269             : 
     270             :         for (x = 0, y = 0; s[x]; ++x, ++y) {
     271             :                 if (s[x] == '%') {
     272             :                         res[y] = x2c(&s[x + 1]);
     273             :                         x += 2;
     274             :                 } else {
     275             :                         res[y] = s[x];
     276             :                 }
     277             :         }
     278             :         res[y] = '\0';
     279             : 
     280             :         if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
     281             :                 GDKfree(res);
     282             :                 throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     283             :         }
     284             :         return MAL_SUCCEED;
     285             : }
     286             : #endif
     287             : 
     288             : /*
     289             :  * Wrapping
     290             :  * Here you find the wrappers around the V4 url library included above.
     291             :  */
     292             : 
     293             : static ssize_t
     294     2000013 : URLfromString(const char *src, size_t *len, void **U, bool external)
     295             : {
     296     2000013 :         char **u = (char **) U;
     297     2000013 :         size_t l = strlen(src) + 1;
     298             : 
     299     2000013 :         if (*len < l || *u == NULL) {
     300          25 :                 GDKfree(*u);
     301          25 :                 *u = GDKmalloc(l);
     302          25 :                 if (*u == NULL)
     303             :                         return -1;
     304          25 :                 *len = l;
     305             :         }
     306             : 
     307             :         /* actually parse the message for valid url */
     308             : 
     309     2000013 :         if (external && strcmp(src, "nil") == 0)
     310           0 :                 strcpy(*u, str_nil);
     311             :         else
     312     2000013 :                 memcpy(*u, src, l);
     313     2000013 :         return (ssize_t) l - 1;
     314             : }
     315             : 
     316             : static ssize_t
     317         199 : URLtoString(str *s, size_t *len, const void *SRC, bool external)
     318             : {
     319         199 :         const char *src = SRC;
     320         199 :         size_t l = strlen(src);
     321             : 
     322         199 :         if (external)
     323         188 :                 l += 2;
     324         199 :         if (l >= *len || *s == NULL) {
     325          18 :                 GDKfree(*s);
     326          18 :                 *s = GDKmalloc(l + 1);
     327          18 :                 if (*s == NULL)
     328             :                         return -1;
     329          18 :                 *len = l + 1;
     330             :         }
     331             : 
     332         199 :         if (external) {
     333         188 :                 if (strNil(src)) {
     334           0 :                         strcpy(*s, "nil");
     335           0 :                         return 3;
     336             :                 }
     337         188 :                 snprintf(*s, l + 1, "\"%s\"", src);
     338             :         } else {
     339          11 :                 strcpy(*s, src);
     340             :         }
     341         199 :         return (ssize_t) l;
     342             : }
     343             : 
     344             : /* COMMAND "getAnchor": Extract an anchor (reference) from the URL
     345             :  * SIGNATURE: getAnchor(url) : str; */
     346             : static str
     347           6 : URLgetAnchor(str *retval, url *val)
     348             : {
     349           6 :         const char *s;
     350             : 
     351           6 :         if (val == NULL || *val == NULL)
     352           0 :                 throw(ILLARG, "url.getAnchor", "url missing");
     353             : 
     354           6 :         if (strNil(*val)) {
     355             :                 s = str_nil;
     356             :         } else {
     357           5 :                 if ((s = skip_scheme(*val)) == NULL
     358           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     359           5 :                         || (s = skip_path(s, NULL, NULL)) == NULL
     360           5 :                         || (s = skip_search(s)) == NULL)
     361           0 :                         throw(ILLARG, "url.getAnchor", "bad url");
     362           5 :                 if (*s == '#')
     363           2 :                         s++;
     364             :                 else
     365             :                         s = str_nil;
     366             :         }
     367             : 
     368           6 :         if ((*retval = GDKstrdup(s)) == NULL)
     369           0 :                 throw(MAL, "url.getAnchor", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     370             :         return MAL_SUCCEED;
     371             : }
     372             : 
     373             : /* COMMAND "getBasename": Extract the base of the last file name of the URL,
     374             :  *                        thus, excluding the file extension.
     375             :  * SIGNATURE: getBasename(str) : str; */
     376             : static str
     377           6 : URLgetBasename(str *retval, url *val)
     378             : {
     379           6 :         const char *s;
     380           6 :         const char *b = NULL;
     381           6 :         const char *e = NULL;
     382             : 
     383           6 :         if (val == NULL || *val == NULL)
     384           0 :                 throw(ILLARG, "url.getBasename", "url missing");
     385             : 
     386           6 :         if (strNil(*val)) {
     387           1 :                 *retval = GDKstrdup(str_nil);
     388             :         } else {
     389           5 :                 if ((s = skip_scheme(*val)) == NULL
     390           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     391           5 :                         || (s = skip_path(s, &b, &e)) == NULL)
     392           0 :                         throw(ILLARG, "url.getBasename", "bad url");
     393           5 :                 if (b == NULL) {
     394           1 :                         *retval = GDKstrdup(str_nil);
     395             :                 } else {
     396           4 :                         size_t l;
     397             : 
     398           4 :                         if (e != NULL) {
     399           3 :                                 l = e - b;
     400             :                         } else {
     401           1 :                                 l = s - b;
     402             :                         }
     403           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     404           4 :                                 strcpy_len(*retval, b, l + 1);
     405             :                         }
     406             :                 }
     407             :         }
     408             : 
     409           6 :         if (*retval == NULL)
     410           0 :                 throw(MAL, "url.getBasename", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     411             :         return MAL_SUCCEED;
     412             : }
     413             : 
     414             : /* COMMAND "getContext": Extract the path context from the URL
     415             :  * SIGNATURE: getContext(str) : str; */
     416             : static str
     417           6 : URLgetContext(str *retval, url *val)
     418             : {
     419           6 :         const char *s;
     420           6 :         const char *p;
     421             : 
     422           6 :         if (val == NULL || *val == NULL)
     423           0 :                 throw(ILLARG, "url.getContext", "url missing");
     424             : 
     425           6 :         if (strNil(*val)) {
     426           1 :                 *retval = GDKstrdup(str_nil);
     427             :         } else {
     428           5 :                 if ((s = skip_scheme(*val)) == NULL
     429           5 :                         || (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     430           5 :                         || (s = skip_path(p, NULL, NULL)) == NULL)
     431           0 :                         throw(ILLARG, "url.getContext", "bad url");
     432           5 :                 if (p == s) {
     433           1 :                         *retval = GDKstrdup(str_nil);
     434           4 :                 } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) {
     435           4 :                         strcpy_len(*retval, p, s - p + 1);
     436             :                 }
     437             :         }
     438             : 
     439           6 :         if (*retval == NULL)
     440           0 :                 throw(MAL, "url.getContext", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     441             :         return MAL_SUCCEED;
     442             : }
     443             : 
     444             : /* COMMAND "getExtension": Extract the file extension of the URL
     445             :  * SIGNATURE: getExtension(str) : str; */
     446             : static str
     447           6 : URLgetExtension(str *retval, url *val)
     448             : {
     449           6 :         const char *s;
     450           6 :         const char *e = NULL;
     451             : 
     452           6 :         if (val == NULL || *val == NULL)
     453           0 :                 throw(ILLARG, "url.getExtension", "url missing");
     454             : 
     455           6 :         if (strNil(*val)) {
     456           1 :                 *retval = GDKstrdup(str_nil);
     457             :         } else {
     458           5 :                 if ((s = skip_scheme(*val)) == NULL
     459           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     460           5 :                         || (s = skip_path(s, NULL, &e)) == NULL)
     461           0 :                         throw(ILLARG, "url.getExtension", "bad url");
     462           5 :                 if (e == NULL) {
     463           2 :                         *retval = GDKstrdup(str_nil);
     464             :                 } else {
     465           3 :                         size_t l = s - e;
     466             : 
     467           3 :                         assert(*e == '.');
     468           3 :                         if ((*retval = GDKmalloc(l)) != NULL) {
     469           3 :                                 strcpy_len(*retval, e + 1, l);
     470             :                         }
     471             :                 }
     472             :         }
     473             : 
     474           6 :         if (*retval == NULL)
     475           0 :                 throw(MAL, "url.getExtension", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     476             :         return MAL_SUCCEED;
     477             : }
     478             : 
     479             : /* COMMAND "getFile": Extract the last file name of the URL
     480             :  * SIGNATURE: getFile(str) : str; */
     481             : static str
     482           6 : URLgetFile(str *retval, url *val)
     483             : {
     484           6 :         const char *s;
     485           6 :         const char *b = NULL;
     486             : 
     487           6 :         if (val == NULL || *val == NULL)
     488           0 :                 throw(ILLARG, "url.getFile", "url missing");
     489             : 
     490           6 :         if (strNil(*val)) {
     491           1 :                 *retval = GDKstrdup(str_nil);
     492             :         } else {
     493           5 :                 if ((s = skip_scheme(*val)) == NULL
     494           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     495           5 :                         || (s = skip_path(s, &b, NULL)) == NULL)
     496           0 :                         throw(ILLARG, "url.getFile", "bad url");
     497           5 :                 if (b == NULL) {
     498           1 :                         *retval = GDKstrdup(str_nil);
     499             :                 } else {
     500           4 :                         size_t l;
     501             : 
     502           4 :                         l = s - b;
     503           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     504           4 :                                 strcpy_len(*retval, b, l + 1);
     505             :                         }
     506             :                 }
     507             :         }
     508             : 
     509           6 :         if (*retval == NULL)
     510           0 :                 throw(MAL, "url.getFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     511             :         return MAL_SUCCEED;
     512             : }
     513             : 
     514             : /* COMMAND "getHost": Extract the server identity from the URL */
     515             : /* SIGNATURE: getHost(str) : str; */
     516             : static str
     517           6 : URLgetHost(str *retval, url *val)
     518             : {
     519           6 :         const char *s;
     520           6 :         const char *h = NULL;
     521           6 :         const char *p = NULL;
     522             : 
     523           6 :         if (val == NULL || *val == NULL)
     524           0 :                 throw(ILLARG, "url.getHost", "url missing");
     525             : 
     526           6 :         if (strNil(*val)) {
     527           1 :                 *retval = GDKstrdup(str_nil);
     528             :         } else {
     529           5 :                 if ((s = skip_scheme(*val)) == NULL
     530           5 :                         || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
     531           0 :                         throw(ILLARG, "url.getHost", "bad url");
     532           5 :                 if (h == NULL) {
     533           0 :                         *retval = GDKstrdup(str_nil);
     534             :                 } else {
     535           5 :                         size_t l;
     536             : 
     537           5 :                         if (p != NULL) {
     538           3 :                                 l = p - h - 1;
     539             :                         } else {
     540           2 :                                 l = s - h;
     541             :                         }
     542           5 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     543           5 :                                 strcpy_len(*retval, h, l + 1);
     544             :                         }
     545             :                 }
     546             :         }
     547             : 
     548           6 :         if (*retval == NULL)
     549           0 :                 throw(MAL, "url.getHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     550             :         return MAL_SUCCEED;
     551             : }
     552             : 
     553             : /* COMMAND "getDomain": Extract the Internet domain from the URL
     554             :  * SIGNATURE: getDomain(str) : str; */
     555             : static str
     556           6 : URLgetDomain(str *retval, url *val)
     557             : {
     558           6 :         const char *s;
     559           6 :         const char *h = NULL;
     560           6 :         const char *p = NULL;
     561             : 
     562           6 :         if (val == NULL || *val == NULL)
     563           0 :                 throw(ILLARG, "url.getDomain", "url missing");
     564             : 
     565           6 :         if (strNil(*val)) {
     566           1 :                 *retval = GDKstrdup(str_nil);
     567             :         } else {
     568           5 :                 if ((s = skip_scheme(*val)) == NULL
     569           5 :                         || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
     570           0 :                         throw(ILLARG, "url.getDomain", "bad url");
     571           5 :                 if (h == NULL) {
     572           0 :                         *retval = GDKstrdup(str_nil);
     573             :                 } else {
     574           5 :                         size_t l;
     575             : 
     576           5 :                         if (p != NULL)
     577           3 :                                 p--;
     578             :                         else
     579           2 :                                 p = s;
     580             :                         l = 0;
     581          19 :                         while (p > h && p[-1] != '.') {
     582          14 :                                 p--;
     583          14 :                                 l++;
     584             :                         }
     585           5 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     586           5 :                                 strcpy_len(*retval, p, l + 1);
     587             :                         }
     588             :                 }
     589             :         }
     590             : 
     591           6 :         if (*retval == NULL)
     592           0 :                 throw(MAL, "url.getDomain", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     593             :         return MAL_SUCCEED;
     594             : }
     595             : 
     596             : /* COMMAND "getPort": Extract the port id from the URL
     597             :  * SIGNATURE: getPort(str) : str; */
     598             : static str
     599           6 : URLgetPort(str *retval, url *val)
     600             : {
     601           6 :         const char *s;
     602           6 :         const char *p = NULL;
     603             : 
     604           6 :         if (val == NULL || *val == NULL)
     605           0 :                 throw(ILLARG, "url.getPort", "url missing");
     606             : 
     607           6 :         if (strNil(*val)) {
     608           1 :                 *retval = GDKstrdup(str_nil);
     609             :         } else {
     610           5 :                 if ((s = skip_scheme(*val)) == NULL
     611           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL)
     612           0 :                         throw(ILLARG, "url.getPort", "bad url");
     613           5 :                 if (p == NULL) {
     614           2 :                         *retval = GDKstrdup(str_nil);
     615             :                 } else {
     616           3 :                         size_t l = s - p;
     617             : 
     618           3 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     619           3 :                                 strcpy_len(*retval, p, l + 1);
     620             :                         }
     621             :                 }
     622             :         }
     623             : 
     624           6 :         if (*retval == NULL)
     625           0 :                 throw(MAL, "url.getPort", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     626             :         return MAL_SUCCEED;
     627             : }
     628             : 
     629             : /* COMMAND "getProtocol": Extract the protocol from the URL
     630             :  * SIGNATURE: getProtocol(str) : str; */
     631             : static str
     632           3 : URLgetProtocol(str *retval, url *val)
     633             : {
     634           3 :         const char *s;
     635             : 
     636           3 :         if (val == NULL || *val == NULL)
     637           0 :                 throw(ILLARG, "url.getProtocol", "url missing");
     638             : 
     639           3 :         if (strNil(*val)) {
     640           1 :                 *retval = GDKstrdup(str_nil);
     641             :         } else {
     642           2 :                 if ((s = skip_scheme(*val)) == NULL)
     643           0 :                         throw(ILLARG, "url.getProtocol", "bad url");
     644           2 :                 size_t l = s - *val;
     645             : 
     646           2 :                 if ((*retval = GDKmalloc(l)) != NULL) {
     647           2 :                         strcpy_len(*retval, *val, l);
     648             :                 }
     649             :         }
     650             : 
     651           3 :         if (*retval == NULL)
     652           0 :                 throw(MAL, "url.getProtocol", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     653             :         return MAL_SUCCEED;
     654             : }
     655             : 
     656             : /* COMMAND "getQuery": Extract the query part from the URL
     657             :  * SIGNATURE: getQuery(str) : str; */
     658             : static str
     659           6 : URLgetQuery(str *retval, url *val)
     660             : {
     661           6 :         const char *s;
     662           6 :         const char *q;
     663             : 
     664           6 :         if (val == NULL || *val == NULL)
     665           0 :                 throw(ILLARG, "url.getQuery", "url missing");
     666             : 
     667           6 :         if (strNil(*val)) {
     668           1 :                 *retval = GDKstrdup(str_nil);
     669             :         } else {
     670           5 :                 if ((s = skip_scheme(*val)) == NULL
     671           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
     672           5 :                         || (q = skip_path(s, NULL, NULL)) == NULL
     673           5 :                         || (s = skip_search(q)) == NULL)
     674           0 :                         throw(ILLARG, "url.getQuery", "bad url");
     675           5 :                 if (*q == '?') {
     676           3 :                         size_t l;
     677             : 
     678           3 :                         q++;
     679           3 :                         l = s - q;
     680           3 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     681           3 :                                 strcpy_len(*retval, q, l + 1);
     682             :                         }
     683             :                 } else {
     684           2 :                         *retval = GDKstrdup(str_nil);
     685             :                 }
     686             :         }
     687             : 
     688           6 :         if (*retval == NULL)
     689           0 :                 throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     690             :         return MAL_SUCCEED;
     691             : }
     692             : 
     693             : /* COMMAND "getRobotURL": Extract the location of the robot control file
     694             :  * SIGNATURE: getRobotURL(str) : str; */
     695             : static str
     696           6 : URLgetRobotURL(str *retval, url *val)
     697             : {
     698           6 :         const char *s;
     699           6 :         size_t l;
     700             : 
     701           6 :         if (val == NULL || *val == NULL)
     702           0 :                 throw(ILLARG, "url.getQuery", "url missing");
     703             : 
     704           6 :         if (strNil(*val)) {
     705           1 :                 *retval = GDKstrdup(str_nil);
     706             :         } else {
     707           5 :                 if ((s = skip_scheme(*val)) == NULL
     708           5 :                         || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL)
     709           0 :                         throw(ILLARG, "url.getQuery", "bad url");
     710           5 :                 l = s - *val;
     711             : 
     712           5 :                 if ((*retval = GDKmalloc(l + sizeof("/robots.txt"))) != NULL) {
     713           5 :                         sprintf(*retval, "%.*s/robots.txt", (int) l, *val);
     714             :                 }
     715             :         }
     716             : 
     717           6 :         if (*retval == NULL)
     718           0 :                 throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     719             :         return MAL_SUCCEED;
     720             : }
     721             : 
     722             : /* COMMAND "getUser": Extract the user identity from the URL
     723             :  * SIGNATURE: getUser(str) : str; */
     724             : static str
     725           9 : URLgetUser(str *retval, url *val)
     726             : {
     727           9 :         const char *s, *h, *u, *p;
     728             : 
     729           9 :         if (val == NULL || *val == NULL)
     730           0 :                 throw(ILLARG, "url.getUser", "url missing");
     731             : 
     732           9 :         if (strNil(*val)) {
     733           1 :                 *retval = GDKstrdup(str_nil);
     734             :         } else {
     735           8 :                 if ((s = skip_scheme(*val)) == NULL
     736           8 :                         || (s = skip_authority(s, &u, &p, &h, NULL)) == NULL)
     737           0 :                         throw(ILLARG, "url.getHost", "bad url");
     738           8 :                 if (u == NULL || h == NULL) {
     739           4 :                         *retval = GDKstrdup(str_nil);
     740             :                 } else {
     741           4 :                         size_t l;
     742             : 
     743           4 :                         if (p) {
     744           1 :                                 l = p - u - 1;
     745             :                         } else {
     746           3 :                                 l = h - u - 1;
     747             :                         }
     748           4 :                         if ((*retval = GDKmalloc(l + 1)) != NULL) {
     749           4 :                                 strcpy_len(*retval, u, l + 1);
     750             :                         }
     751             :                 }
     752             :         }
     753             : 
     754           9 :         if (*retval == NULL)
     755           0 :                 throw(MAL, "url.getUser", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     756             :         return MAL_SUCCEED;
     757             : }
     758             : 
     759             : /* COMMAND "isaURL": Check conformity of the URL syntax
     760             :  * SIGNATURE: isaURL(str) : bit; */
     761             : static str
     762           7 : URLisaURL(bit *retval, str *val)
     763             : {
     764           7 :         if (val == NULL || *val == NULL)
     765           0 :                 throw(ILLARG, "url.isaURL", "url missing");
     766           7 :         if (strNil(*val))
     767           0 :                 *retval = bit_nil;
     768             :         else
     769           7 :                 *retval = skip_scheme(*val) != NULL;
     770             :         return MAL_SUCCEED;
     771             : }
     772             : 
     773             : static str
     774          49 : URLnew(url *u, str *val)
     775             : {
     776          49 :         *u = GDKstrdup(*val);
     777          49 :         if (*u == NULL)
     778           0 :                 throw(MAL, "url.new", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     779             :         return MAL_SUCCEED;
     780             : }
     781             : 
     782             : static str
     783           9 : URLnew3(url *u, str *protocol, str *server, str *file)
     784             : {
     785           9 :         str Protocol = *protocol;
     786           9 :         str Server = *server;
     787           9 :         str File = *file;
     788           9 :         size_t l;
     789             : 
     790           9 :         if (strNil(File))
     791             :                 File = "";
     792           2 :         else if (*File == '/')
     793           0 :                 File++;
     794           9 :         if (strNil(Server))
     795             :                 Server = "";
     796           9 :         if (strNil(Protocol))
     797             :                 Protocol = "";
     798           9 :         l = strlen(File) + strlen(Server) + strlen(Protocol) + 10;
     799           9 :         *u = GDKmalloc(l);
     800           9 :         if (*u == NULL)
     801           0 :                 throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     802           9 :         snprintf(*u, l, "%s://%s/%s", Protocol, Server, File);
     803           9 :         return MAL_SUCCEED;
     804             : }
     805             : 
     806             : static str
     807           3 : URLnew4(url *u, str *protocol, str *server, int *port, str *file)
     808             : {
     809           3 :         str Protocol = *protocol;
     810           3 :         str Server = *server;
     811           3 :         int Port = *port;
     812           3 :         str File = *file;
     813           3 :         size_t l;
     814             : 
     815           3 :         if (strNil(File))
     816             :                 File = "";
     817           2 :         else if (*File == '/')
     818           0 :                 File++;
     819           3 :         if (strNil(Server))
     820             :                 Server = "";
     821           3 :         if (is_int_nil(Port))
     822           1 :                 Port = 0;
     823           3 :         if (strNil(Protocol))
     824             :                 Protocol = "";
     825           3 :         l = strlen(File) + strlen(Server) + strlen(Protocol) + 20;
     826           3 :         *u = GDKmalloc(l);
     827           3 :         if (*u == NULL)
     828           0 :                 throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     829           3 :         snprintf(*u, l, "%s://%s:%d/%s", Protocol, Server, Port, File);
     830           3 :         return MAL_SUCCEED;
     831             : }
     832             : 
     833             : static str
     834           0 : URLnoop(url *u, url *val)
     835             : {
     836           0 :         *u = GDKstrdup(*val);
     837           0 :         if (*u == NULL)
     838           0 :                 throw(MAL, "url.noop", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     839             :         return MAL_SUCCEED;
     840             : }
     841             : 
     842             : 
     843             : /* Extract host identity from URL. This is a relaxed version,
     844             :  * where no exceptions is thrown when the input URL is not valid,
     845             :  * and empty string is returned instead.
     846             :  * */
     847             : static str
     848           1 : extractURLHost(str *retval, str *url, bit *no_www)
     849             : {
     850           1 :         const char *s;
     851           1 :         const char *h = NULL;
     852           1 :         const char *p = NULL;
     853             : 
     854           2 :         if (url != NULL && *url != NULL && !strNil(*url)) {
     855           1 :                 if ((s = skip_scheme(*url)) != NULL
     856           0 :                         && (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL
     857           0 :                         && h != NULL) {
     858             :                         ssize_t l;
     859             :                         const char *pos = s;
     860           0 :                         const char *domain = NULL;
     861           0 :                         while (pos > h) {
     862           0 :                                 if (*pos == '.') {
     863             :                                         domain = pos;
     864             :                                         break;
     865             :                                 }
     866           0 :                                 pos--;
     867             :                         }
     868             : 
     869           0 :                         if (p != NULL) {
     870           0 :                                 l = p - h - 1;
     871             :                         } else {
     872           0 :                                 l = s - h;
     873             :                         }
     874           0 :                         if (*no_www && !strncmp(h, "www.", 4)) {
     875           0 :                                 h += 4;
     876           0 :                                 l -= 4;
     877             :                         }
     878           0 :                         if (domain && l > 3) {
     879           0 :                                 if ((*retval = GDKmalloc(l + 1)) != NULL)
     880           0 :                                         strcpy_len(*retval, h, l + 1);
     881             :                         } else {
     882           0 :                                 *retval = GDKstrdup(str_nil);
     883             :                         }
     884             :                 } else {
     885           1 :                         *retval = GDKstrdup(str_nil);
     886             :                 }
     887             :         } else {
     888           0 :                 *retval = GDKstrdup(str_nil);
     889             :         }
     890           1 :         if (!*retval)
     891           0 :                 throw(MAL, "url.getURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     892             : 
     893             :         return MAL_SUCCEED;
     894             : }
     895             : 
     896             : 
     897             : static inline str
     898           2 : str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l)
     899             : {
     900           2 :         CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "url.str_buf_copy");
     901           2 :         strcpy_len(*buf, s, l);
     902           2 :         return MAL_SUCCEED;
     903             : }
     904             : 
     905             : 
     906             : // bulk version
     907             : static str
     908           2 : BATextractURLHost(bat *res, const bat *bid, bit *no_www)
     909             : {
     910           2 :         const char *s;
     911           2 :         const char *host = NULL;
     912           2 :         const char *port = NULL;
     913           2 :         BAT *bn = NULL, *b = NULL;
     914           2 :         BUN p, q;
     915           2 :         size_t buflen = INITIAL_STR_BUFFER_LENGTH;
     916           2 :         str buf = GDKmalloc(buflen);
     917           2 :         str msg = MAL_SUCCEED;
     918           2 :         bool nils = false;
     919             : 
     920           2 :         if (buf == NULL)
     921           0 :                 throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     922             : 
     923           2 :         if (!(b = BATdescriptor(*bid))) {
     924           0 :                 GDKfree(buf);
     925           0 :                 throw(MAL, "baturl.extractURLHost",
     926             :                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     927             :         }
     928           2 :         if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) {
     929           0 :                 GDKfree(buf);
     930           0 :                 BBPunfix(b->batCacheid);
     931           0 :                 throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     932             :         }
     933             : 
     934           2 :         BATiter bi = bat_iterator(b);
     935           4 :         BATloop(b, p, q) {
     936           2 :                 const char *url = (const char *) BUNtvar(bi, p);
     937           2 :                 if (strNil(url)) {
     938           0 :                         if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
     939           0 :                                 msg = createException(MAL, "baturl.extractURLHost",
     940             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     941           0 :                                 break;
     942             :                         }
     943             :                         nils = true;
     944             :                 } else {
     945           2 :                         if ((s = skip_scheme(url)) != NULL
     946           2 :                                 && (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL
     947           2 :                                 && host != NULL) {
     948             :                                 ssize_t l;
     949             :                                 const char *pos = s;
     950          18 :                                 const char *domain = NULL;
     951          18 :                                 while (pos > host) {
     952          18 :                                         if (*pos == '.') {
     953             :                                                 domain = pos;
     954             :                                                 break;
     955             :                                         }
     956          16 :                                         pos--;
     957             :                                 }
     958             : 
     959           2 :                                 if (port != NULL) {
     960           2 :                                         l = port - host - 1;
     961             :                                 } else {
     962           0 :                                         l = s - host;
     963             :                                 }
     964           2 :                                 if (domain && l > 3) {
     965           2 :                                         if (*no_www && !strncmp(host, "www.", 4)) {
     966           1 :                                                 host += 4;
     967           1 :                                                 l -= 4;
     968             :                                         }
     969           2 :                                         if (l > 0) {
     970             :                                                 // if ((msg = str_Sub_String(&buf, &buflen, host, 0, l)) != MAL_SUCCEED)
     971             :                                                 //  break;
     972           2 :                                                 if ((msg = str_buf_copy(&buf, &buflen, host,
     973           2 :                                                                                                 (size_t) (l + 1))) != MAL_SUCCEED)
     974             :                                                         break;
     975           2 :                                                 if (bunfastapp_nocheckVAR(bn, buf) != GDK_SUCCEED) {
     976           0 :                                                         msg = createException(MAL, "baturl.extractURLHost",
     977             :                                                                                                   SQLSTATE(HY013)
     978             :                                                                                                   MAL_MALLOC_FAIL);
     979           0 :                                                         break;
     980             :                                                 }
     981           2 :                                                 continue;
     982             :                                         }
     983             :                                 }
     984             :                         }
     985             :                         // fall back insert nil str if no valid host
     986           0 :                         if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
     987           0 :                                 msg = createException(MAL, "baturl.extractURLHost",
     988             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     989           0 :                                 break;
     990             :                         }
     991             :                         nils = true;
     992             :                 }
     993             :         }
     994           2 :         bat_iterator_end(&bi);
     995             : 
     996           2 :         GDKfree(buf);
     997           2 :         if (msg == MAL_SUCCEED) {
     998           2 :                 BATsetcount(bn, q);
     999           2 :                 bn->tnil = nils;
    1000           2 :                 bn->tnonil = !nils;
    1001           2 :                 bn->tkey = BATcount(bn) <= 1;
    1002           2 :                 bn->tsorted = BATcount(bn) <= 1;
    1003           2 :                 bn->trevsorted = BATcount(bn) <= 1;
    1004           2 :                 *res = bn->batCacheid;
    1005           2 :                 BBPkeepref(bn);
    1006             :         }
    1007           2 :         BBPunfix(b->batCacheid);
    1008           2 :         return msg;
    1009             : }
    1010             : 
    1011             : 
    1012             : #include "mel.h"
    1013             : mel_atom url_init_atoms[] = {
    1014             :  { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, },  { .cmp=NULL }
    1015             : };
    1016             : mel_func url_init_funcs[] = {
    1017             :  command("url", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
    1018             :  command("url", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
    1019             :  command("calc", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
    1020             :  command("calc", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
    1021             :  command("url", "getAnchor", URLgetAnchor, false, "Extract the URL anchor (reference)", args(1,2, arg("",str),arg("u",url))),
    1022             :  command("url", "getBasename", URLgetBasename, false, "Extract the URL base file name", args(1,2, arg("",str),arg("u",url))),
    1023             :  command("url", "getContext", URLgetContext, false, "Get the path context of a URL", args(1,2, arg("",str),arg("u",url))),
    1024             :  command("url", "getDomain", URLgetDomain, false, "Extract Internet domain from the URL", args(1,2, arg("",str),arg("u",url))),
    1025             :  command("url", "getExtension", URLgetExtension, false, "Extract the file extension of the URL", args(1,2, arg("",str),arg("u",url))),
    1026             :  command("url", "getFile", URLgetFile, false, "Extract the last file name of the URL", args(1,2, arg("",str),arg("u",url))),
    1027             :  command("url", "getHost", URLgetHost, false, "Extract the server name from the URL strict version", args(1,2, arg("",str),arg("u",url))),
    1028             :  command("url", "getPort", URLgetPort, false, "Extract the port id from the URL", args(1,2, arg("",str),arg("u",url))),
    1029             :  command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol from the URL", args(1,2, arg("",str),arg("u",url))),
    1030             :  command("url", "getQuery", URLgetQuery, false, "Extract the query string from the URL", args(1,2, arg("",str),arg("u",url))),
    1031             :  command("url", "getUser", URLgetUser, false, "Extract the user identity from the URL", args(1,2, arg("",str),arg("u",url))),
    1032             :  command("url", "getRobotURL", URLgetRobotURL, false, "Extract the location of the robot control file", args(1,2, arg("",str),arg("u",url))),
    1033             :  command("url", "isaURL", URLisaURL, false, "Check conformity of the URL syntax", args(1,2, arg("",bit),arg("u",str))),
    1034             :  command("url", "new", URLnew4, false, "Construct URL from protocol, host, port, and file", args(1,5, arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))),
    1035             :  command("url", "new", URLnew3, false, "Construct URL from protocol, host,and file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))),
    1036             :  command("url", "extractURLHost", extractURLHost, false, "Extract host from a URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))),
    1037             :  command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", bit))),
    1038             :  { .imp=NULL }
    1039             : };
    1040             : #include "mal_import.h"
    1041             : #ifdef _MSC_VER
    1042             : #undef read
    1043             : #pragma section(".CRT$XCU",read)
    1044             : #endif
    1045         334 : LIB_STARTUP_FUNC(init_url_mal)
    1046         334 : { mal_module("url", url_init_atoms, url_init_funcs); }

Generated by: LCOV version 1.14