Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * M. Kersten
15 : * Y. Zhang
16 : * The URL module
17 : * The URL module contains a collection of commands to manipulate
18 : * Uniform Resource Locators - a resource on the World Wide Web-
19 : * represented as a string in Monet. The URL can represent
20 : * anything from a file, a directory or a complete movie.
21 : * This module is geared towards manipulation of their name only.
22 : * A complementary module can be used to gain access.[IOgate]
23 : *
24 : * The URL syntax is specified in RFC2396, Uniform Resource Identifiers
25 : * (URI): Generic Syntax. The URL syntax is dependent upon the scheme.
26 : * In general, a URL has the form <scheme>:<scheme-specific-part>.
27 : * Thus, accepting a valid URL is a simple process, unless the scheme
28 : * is known and scheme-specific syntax is checked (e.g., http or ftp
29 : * scheme). For the URL module implemented here, we assume some common
30 : * fields of the <scheme-specific-part> that are shared among different
31 : * schemes.
32 : *
33 : * The core of the extension involves several operators to extract
34 : * portions of the URLs for further manipulation. In particular,
35 : * the domain, the server, and the protocol, and the file extension
36 : * can be extracted without copying the complete URL from the heap
37 : * into a string variable first.
38 : *
39 : * The commands provided are based on the corresponding Java class.
40 : *
41 : * A future version should use a special atom, because this may save
42 : * considerable space. Alternatively, break the URL strings into
43 : * components and represent them with a bunch of BATs. An intermediate
44 : * step would be to refine the atom STR, then it would be possible to
45 : * redefine hashing.
46 : */
47 :
48 : #include "monetdb_config.h"
49 : #include "mal.h"
50 : #include "gdk.h"
51 : #include <ctype.h>
52 : #include "mal_exception.h"
53 : #include "str.h"
54 :
/* A url value is represented by the same C type as a string (str). */
55 : typedef str url;
56 :
57 : /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ]
58 : * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */
59 :
/* Skip the scheme of a URI.
 *
 * A scheme is an ASCII letter followed by any number of ASCII letters,
 * digits, '+', '-' or '.', and it must be terminated by a ':'.
 *
 * uri: pointer to the start of the URI.
 * Returns a pointer to the character following the ':' or NULL when
 * the string does not start with such a scheme. */
static const char *
skip_scheme(const char *uri)
{
	const char *cur = uri;

	/* the first character has to be a letter */
	if (!(('a' <= *cur && *cur <= 'z') || ('A' <= *cur && *cur <= 'Z')))
		return NULL;

	/* consume the remaining scheme characters */
	for (cur++;; cur++) {
		const char c = *cur;

		if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
			continue;
		if (isdigit((unsigned char) c))
			continue;
		if (c == '+' || c == '-' || c == '.')
			continue;
		break;
	}

	/* only a ':' may end the scheme */
	return *cur == ':' ? cur + 1 : NULL;
}
76 :
/* character classes used by the URI scanners below */
#define ishex(c)		isxdigit((unsigned char) (c))
#define isreserved(c)	((c) == ';' || (c) == '/' || (c) == '?' || \
						 (c) == ':' || (c) == '@' || (c) == '&' || \
						 (c) == '=' || (c) == '+' || (c) == '$' || \
						 (c) == ',')
#define isunreserved(c) (('a' <= (c) && (c) <= 'z') || \
						 ('A' <= (c) && (c) <= 'Z') || \
						 isdigit((unsigned char) (c)) || \
						 (c) == '-' || (c) == '_' || (c) == '.' || \
						 (c) == '!' || (c) == '~' || (c) == '*' || \
						 (c) == '\'' || (c) == '(' || (c) == ')')

/* Skip the authority part of a URI:
 *   "//" [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ]
 *
 * uri:   result of skip_scheme(), i.e. the text following "scheme:".
 * userp: if not NULL, receives the start of USER, or NULL when there
 *        is no "@" in the authority.
 * passp: if not NULL, receives the start of PASSWORD, or NULL when
 *        there is no "@" or no ":" before it.
 * hostp: if not NULL, receives the start of HOST.
 * portp: if not NULL, receives the start of PORT (the character after
 *        the ":" that follows HOST), or NULL when there is no port.
 *
 * The output pointers are only written when the function succeeds.
 * Returns a pointer to the first character after the authority, or
 * NULL when uri does not start with "//". */
static const char *
skip_authority(const char *uri, const char **userp, const char **passp,
			   const char **hostp, const char **portp)
{
	const char *user = NULL, *pass = NULL, *host = NULL, *port = NULL;

	if (uri[0] != '/' || uri[1] != '/')
		return NULL;

	uri += 2;
	user = host = uri;
	while (isunreserved(*uri)
		   || (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) || *uri == ';'
		   || *uri == ':' || *uri == '=' || *uri == '+' || *uri == '$'
		   || *uri == ',' || *uri == '@') {
		if (*uri == ':') {
			if (user == host) {
				/* no "@" seen so far: this ":" introduces either a
				 * password or a port, we cannot tell yet */
				port = pass = uri + 1;
			} else {
				port = uri + 1;
			}
		} else if (*uri == '@') {
			host = uri + 1;
			/* any ":" seen up to here belonged to the user info, so
			 * it did not introduce a port; without this reset a URI
			 * like "//user:pw@host/" would report a port that lies
			 * before the host */
			port = NULL;
		}
		/* a validated %HH escape is skipped as a unit */
		uri += *uri == '%' ? 3 : 1;
	}

	if (user == host) {
		/* no "@", so no user info */
		user = NULL;
		pass = NULL;
	}
	if (userp)
		*userp = user;
	if (passp)
		*passp = pass;
	if (portp)
		*portp = port;
	if (hostp)
		*hostp = host;
	return uri;
}
134 :
/* Skip the path part of a URI.
 *
 * uri:   result of skip_authority().
 * basep: if not NULL, receives the start of the last path component
 *        (the text after the last '/'), or NULL when there is no path.
 * extp:  if not NULL, receives a pointer to the first '.' inside the
 *        last component that is not its first character, or NULL when
 *        there is none.
 *
 * Returns a pointer to the first character after the path; this is
 * uri itself when uri does not start with '/'. */
static const char *
skip_path(const char *uri, const char **basep, const char **extp)
{
	const char *last = NULL;	/* start of the last component */
	const char *dot = NULL;		/* extension dot in that component */
	const char *cur = uri;

	if (*cur == '/') {
		last = ++cur;
		for (;;) {
			const char c = *cur;

			if (c == '%') {
				/* only a complete %HH escape is part of the path */
				if (!(ishex(cur[1]) && ishex(cur[2])))
					break;
				cur += 3;
				continue;
			}
			if (!(isunreserved(c) || c == ':' || c == '@' || c == '&'
				  || c == '=' || c == '+' || c == '$' || c == ','
				  || c == ';' || c == '/'))
				break;
			if (c == '/') {
				/* a new component starts: forget the old extension */
				last = cur + 1;
				dot = NULL;
			} else if (c == '.' && dot == NULL && cur != last) {
				dot = cur;
			}
			cur++;
		}
	}

	if (basep)
		*basep = last;
	if (extp)
		*extp = dot;
	return cur;
}
165 :
/* Skip the search (query) part of a URI.
 *
 * uri: result of skip_path().
 * Returns a pointer to the first character after the "?" and the
 * query characters that follow it; returns uri unchanged when it does
 * not start with '?'. */
static const char *
skip_search(const char *uri)
{
	const char *cur;

	if (*uri != '?')
		return uri;

	cur = uri + 1;
	for (;;) {
		if (*cur == '%') {
			/* only a complete %HH escape is accepted */
			if (!(ishex(cur[1]) && ishex(cur[2])))
				break;
			cur += 3;
		} else if (isreserved(*cur) || isunreserved(*cur)) {
			cur++;
		} else {
			break;
		}
	}
	return cur;
}
180 :
181 : #if 0
182 : /*
183 : * Utilities
184 : */
185 :
/* Convert the two hexadecimal digits at what[0] and what[1] into the
 * character they encode.  Letters are accepted in either case; the
 * input is not validated. */
static char
x2c(char *what)
{
	const char hi = what[0];
	const char lo = what[1];
	char value;

	/* clearing bit 0x20 folds lower-case letters onto upper-case */
	value = (char) (hi >= 'A' ? ((hi & 0xdf) - 'A') + 10 : (hi - '0'));
	value = (char) (value * 16);
	value = (char) (value + (lo >= 'A' ? ((lo & 0xdf) - 'A') + 10 : (lo - '0')));
	return value;
}
196 :
/* Return 0 when c may appear unescaped (alphanumerics and the
 * characters # - _ . ! ~ * ' ( ) ), and 1 when it has to be escaped. */
static int
needEscape(char c)
{
	if (isalnum((unsigned char) c))
		return 0;

	switch (c) {
	case '#':
	case '-':
	case '_':
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 0;
	default:
		return 1;
	}
}
207 :
208 : /* COMMAND "escape": this function applies the URI escaping rules defined in
209 : * section 2 of [RFC 3986] to the string supplied as 's'.
210 : * The effect of the function is to escape a set of identified characters in
211 : * the string. Each such character is replaced in the string by an escape
212 : * sequence, which is formed by encoding the character as a sequence of octets
213 : * in UTF-8, and then reprensenting each of these octets in the form %HH.
214 : *
215 : * All characters are escaped other than:
216 : * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")"
217 : *
218 : * This function must always generate hexadecimal values using the upper-case
219 : * letters A-F.
220 : *
221 : * SIGNATURE: escape(str) : str; */
222 : static str
223 : escape_str(str *retval, str s)
224 : {
225 : int x, y;
226 : str res;
227 :
228 : if (!s)
229 : throw(ILLARG, "url.escape", "url missing");
230 :
231 : if (!(res = (str) GDKmalloc(strlen(s) * 3)))
232 : throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
233 : for (x = 0, y = 0; s[x]; ++x, ++y) {
234 : if (needEscape(s[x])) {
235 : if (s[x] == ' ') {
236 : res[y] = '+';
237 : } else {
238 : sprintf(res + y, "%%%2x", (uint8_t) s[x]);
239 : y += 2;
240 : }
241 : } else {
242 : res[y] = s[x];
243 : }
244 : }
245 : res[y] = '\0';
246 :
247 : if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
248 : GDKfree(res);
249 : throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
250 : }
251 : return MAL_SUCCEED;
252 : }
253 :
254 : /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters.
255 : * All sequences of the form "% HEX HEX" are unescaped.
256 : * SIGNATURE: unescape(str) : str; */
257 : static str
258 : unescape_str(str *retval, str s)
259 : {
260 : int x, y;
261 : str res;
262 :
263 : if (!s)
264 : throw(ILLARG, "url.escape", "url missing");
265 :
266 : res = (str) GDKmalloc(strlen(s));
267 : if (!res)
268 : throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
269 :
270 : for (x = 0, y = 0; s[x]; ++x, ++y) {
271 : if (s[x] == '%') {
272 : res[y] = x2c(&s[x + 1]);
273 : x += 2;
274 : } else {
275 : res[y] = s[x];
276 : }
277 : }
278 : res[y] = '\0';
279 :
280 : if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
281 : GDKfree(res);
282 : throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
283 : }
284 : return MAL_SUCCEED;
285 : }
286 : #endif
287 :
288 : /*
289 : * Wrapping
290 : * Here you find the wrappers around the V4 url library included above.
291 : */
292 :
293 : static ssize_t
294 2000013 : URLfromString(const char *src, size_t *len, void **U, bool external)
295 : {
296 2000013 : char **u = (char **) U;
297 2000013 : size_t l = strlen(src) + 1;
298 :
299 2000013 : if (*len < l || *u == NULL) {
300 25 : GDKfree(*u);
301 25 : *u = GDKmalloc(l);
302 25 : if (*u == NULL)
303 : return -1;
304 25 : *len = l;
305 : }
306 :
307 : /* actually parse the message for valid url */
308 :
309 2000013 : if (external && strcmp(src, "nil") == 0)
310 0 : strcpy(*u, str_nil);
311 : else
312 2000013 : memcpy(*u, src, l);
313 2000013 : return (ssize_t) l - 1;
314 : }
315 :
316 : static ssize_t
317 199 : URLtoString(str *s, size_t *len, const void *SRC, bool external)
318 : {
319 199 : const char *src = SRC;
320 199 : size_t l = strlen(src);
321 :
322 199 : if (external)
323 188 : l += 2;
324 199 : if (l >= *len || *s == NULL) {
325 18 : GDKfree(*s);
326 18 : *s = GDKmalloc(l + 1);
327 18 : if (*s == NULL)
328 : return -1;
329 18 : *len = l + 1;
330 : }
331 :
332 199 : if (external) {
333 188 : if (strNil(src)) {
334 0 : strcpy(*s, "nil");
335 0 : return 3;
336 : }
337 188 : snprintf(*s, l + 1, "\"%s\"", src);
338 : } else {
339 11 : strcpy(*s, src);
340 : }
341 199 : return (ssize_t) l;
342 : }
343 :
344 : /* COMMAND "getAnchor": Extract an anchor (reference) from the URL
345 : * SIGNATURE: getAnchor(url) : str; */
346 : static str
347 6 : URLgetAnchor(str *retval, url *val)
348 : {
349 6 : const char *s;
350 :
351 6 : if (val == NULL || *val == NULL)
352 0 : throw(ILLARG, "url.getAnchor", "url missing");
353 :
354 6 : if (strNil(*val)) {
355 : s = str_nil;
356 : } else {
357 5 : if ((s = skip_scheme(*val)) == NULL
358 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
359 5 : || (s = skip_path(s, NULL, NULL)) == NULL
360 5 : || (s = skip_search(s)) == NULL)
361 0 : throw(ILLARG, "url.getAnchor", "bad url");
362 5 : if (*s == '#')
363 2 : s++;
364 : else
365 : s = str_nil;
366 : }
367 :
368 6 : if ((*retval = GDKstrdup(s)) == NULL)
369 0 : throw(MAL, "url.getAnchor", SQLSTATE(HY013) MAL_MALLOC_FAIL);
370 : return MAL_SUCCEED;
371 : }
372 :
373 : /* COMMAND "getBasename": Extract the base of the last file name of the URL,
374 : * thus, excluding the file extension.
375 : * SIGNATURE: getBasename(str) : str; */
376 : static str
377 6 : URLgetBasename(str *retval, url *val)
378 : {
379 6 : const char *s;
380 6 : const char *b = NULL;
381 6 : const char *e = NULL;
382 :
383 6 : if (val == NULL || *val == NULL)
384 0 : throw(ILLARG, "url.getBasename", "url missing");
385 :
386 6 : if (strNil(*val)) {
387 1 : *retval = GDKstrdup(str_nil);
388 : } else {
389 5 : if ((s = skip_scheme(*val)) == NULL
390 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
391 5 : || (s = skip_path(s, &b, &e)) == NULL)
392 0 : throw(ILLARG, "url.getBasename", "bad url");
393 5 : if (b == NULL) {
394 1 : *retval = GDKstrdup(str_nil);
395 : } else {
396 4 : size_t l;
397 :
398 4 : if (e != NULL) {
399 3 : l = e - b;
400 : } else {
401 1 : l = s - b;
402 : }
403 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
404 4 : strcpy_len(*retval, b, l + 1);
405 : }
406 : }
407 : }
408 :
409 6 : if (*retval == NULL)
410 0 : throw(MAL, "url.getBasename", SQLSTATE(HY013) MAL_MALLOC_FAIL);
411 : return MAL_SUCCEED;
412 : }
413 :
414 : /* COMMAND "getContext": Extract the path context from the URL
415 : * SIGNATURE: getContext(str) : str; */
416 : static str
417 6 : URLgetContext(str *retval, url *val)
418 : {
419 6 : const char *s;
420 6 : const char *p;
421 :
422 6 : if (val == NULL || *val == NULL)
423 0 : throw(ILLARG, "url.getContext", "url missing");
424 :
425 6 : if (strNil(*val)) {
426 1 : *retval = GDKstrdup(str_nil);
427 : } else {
428 5 : if ((s = skip_scheme(*val)) == NULL
429 5 : || (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
430 5 : || (s = skip_path(p, NULL, NULL)) == NULL)
431 0 : throw(ILLARG, "url.getContext", "bad url");
432 5 : if (p == s) {
433 1 : *retval = GDKstrdup(str_nil);
434 4 : } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) {
435 4 : strcpy_len(*retval, p, s - p + 1);
436 : }
437 : }
438 :
439 6 : if (*retval == NULL)
440 0 : throw(MAL, "url.getContext", SQLSTATE(HY013) MAL_MALLOC_FAIL);
441 : return MAL_SUCCEED;
442 : }
443 :
444 : /* COMMAND "getExtension": Extract the file extension of the URL
445 : * SIGNATURE: getExtension(str) : str; */
446 : static str
447 6 : URLgetExtension(str *retval, url *val)
448 : {
449 6 : const char *s;
450 6 : const char *e = NULL;
451 :
452 6 : if (val == NULL || *val == NULL)
453 0 : throw(ILLARG, "url.getExtension", "url missing");
454 :
455 6 : if (strNil(*val)) {
456 1 : *retval = GDKstrdup(str_nil);
457 : } else {
458 5 : if ((s = skip_scheme(*val)) == NULL
459 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
460 5 : || (s = skip_path(s, NULL, &e)) == NULL)
461 0 : throw(ILLARG, "url.getExtension", "bad url");
462 5 : if (e == NULL) {
463 2 : *retval = GDKstrdup(str_nil);
464 : } else {
465 3 : size_t l = s - e;
466 :
467 3 : assert(*e == '.');
468 3 : if ((*retval = GDKmalloc(l)) != NULL) {
469 3 : strcpy_len(*retval, e + 1, l);
470 : }
471 : }
472 : }
473 :
474 6 : if (*retval == NULL)
475 0 : throw(MAL, "url.getExtension", SQLSTATE(HY013) MAL_MALLOC_FAIL);
476 : return MAL_SUCCEED;
477 : }
478 :
479 : /* COMMAND "getFile": Extract the last file name of the URL
480 : * SIGNATURE: getFile(str) : str; */
481 : static str
482 6 : URLgetFile(str *retval, url *val)
483 : {
484 6 : const char *s;
485 6 : const char *b = NULL;
486 :
487 6 : if (val == NULL || *val == NULL)
488 0 : throw(ILLARG, "url.getFile", "url missing");
489 :
490 6 : if (strNil(*val)) {
491 1 : *retval = GDKstrdup(str_nil);
492 : } else {
493 5 : if ((s = skip_scheme(*val)) == NULL
494 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
495 5 : || (s = skip_path(s, &b, NULL)) == NULL)
496 0 : throw(ILLARG, "url.getFile", "bad url");
497 5 : if (b == NULL) {
498 1 : *retval = GDKstrdup(str_nil);
499 : } else {
500 4 : size_t l;
501 :
502 4 : l = s - b;
503 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
504 4 : strcpy_len(*retval, b, l + 1);
505 : }
506 : }
507 : }
508 :
509 6 : if (*retval == NULL)
510 0 : throw(MAL, "url.getFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
511 : return MAL_SUCCEED;
512 : }
513 :
514 : /* COMMAND "getHost": Extract the server identity from the URL */
515 : /* SIGNATURE: getHost(str) : str; */
516 : static str
517 6 : URLgetHost(str *retval, url *val)
518 : {
519 6 : const char *s;
520 6 : const char *h = NULL;
521 6 : const char *p = NULL;
522 :
523 6 : if (val == NULL || *val == NULL)
524 0 : throw(ILLARG, "url.getHost", "url missing");
525 :
526 6 : if (strNil(*val)) {
527 1 : *retval = GDKstrdup(str_nil);
528 : } else {
529 5 : if ((s = skip_scheme(*val)) == NULL
530 5 : || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
531 0 : throw(ILLARG, "url.getHost", "bad url");
532 5 : if (h == NULL) {
533 0 : *retval = GDKstrdup(str_nil);
534 : } else {
535 5 : size_t l;
536 :
537 5 : if (p != NULL) {
538 3 : l = p - h - 1;
539 : } else {
540 2 : l = s - h;
541 : }
542 5 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
543 5 : strcpy_len(*retval, h, l + 1);
544 : }
545 : }
546 : }
547 :
548 6 : if (*retval == NULL)
549 0 : throw(MAL, "url.getHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
550 : return MAL_SUCCEED;
551 : }
552 :
553 : /* COMMAND "getDomain": Extract the Internet domain from the URL
554 : * SIGNATURE: getDomain(str) : str; */
555 : static str
556 6 : URLgetDomain(str *retval, url *val)
557 : {
558 6 : const char *s;
559 6 : const char *h = NULL;
560 6 : const char *p = NULL;
561 :
562 6 : if (val == NULL || *val == NULL)
563 0 : throw(ILLARG, "url.getDomain", "url missing");
564 :
565 6 : if (strNil(*val)) {
566 1 : *retval = GDKstrdup(str_nil);
567 : } else {
568 5 : if ((s = skip_scheme(*val)) == NULL
569 5 : || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
570 0 : throw(ILLARG, "url.getDomain", "bad url");
571 5 : if (h == NULL) {
572 0 : *retval = GDKstrdup(str_nil);
573 : } else {
574 5 : size_t l;
575 :
576 5 : if (p != NULL)
577 3 : p--;
578 : else
579 2 : p = s;
580 : l = 0;
581 19 : while (p > h && p[-1] != '.') {
582 14 : p--;
583 14 : l++;
584 : }
585 5 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
586 5 : strcpy_len(*retval, p, l + 1);
587 : }
588 : }
589 : }
590 :
591 6 : if (*retval == NULL)
592 0 : throw(MAL, "url.getDomain", SQLSTATE(HY013) MAL_MALLOC_FAIL);
593 : return MAL_SUCCEED;
594 : }
595 :
596 : /* COMMAND "getPort": Extract the port id from the URL
597 : * SIGNATURE: getPort(str) : str; */
598 : static str
599 6 : URLgetPort(str *retval, url *val)
600 : {
601 6 : const char *s;
602 6 : const char *p = NULL;
603 :
604 6 : if (val == NULL || *val == NULL)
605 0 : throw(ILLARG, "url.getPort", "url missing");
606 :
607 6 : if (strNil(*val)) {
608 1 : *retval = GDKstrdup(str_nil);
609 : } else {
610 5 : if ((s = skip_scheme(*val)) == NULL
611 5 : || (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL)
612 0 : throw(ILLARG, "url.getPort", "bad url");
613 5 : if (p == NULL) {
614 2 : *retval = GDKstrdup(str_nil);
615 : } else {
616 3 : size_t l = s - p;
617 :
618 3 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
619 3 : strcpy_len(*retval, p, l + 1);
620 : }
621 : }
622 : }
623 :
624 6 : if (*retval == NULL)
625 0 : throw(MAL, "url.getPort", SQLSTATE(HY013) MAL_MALLOC_FAIL);
626 : return MAL_SUCCEED;
627 : }
628 :
629 : /* COMMAND "getProtocol": Extract the protocol from the URL
630 : * SIGNATURE: getProtocol(str) : str; */
631 : static str
632 3 : URLgetProtocol(str *retval, url *val)
633 : {
634 3 : const char *s;
635 :
636 3 : if (val == NULL || *val == NULL)
637 0 : throw(ILLARG, "url.getProtocol", "url missing");
638 :
639 3 : if (strNil(*val)) {
640 1 : *retval = GDKstrdup(str_nil);
641 : } else {
642 2 : if ((s = skip_scheme(*val)) == NULL)
643 0 : throw(ILLARG, "url.getProtocol", "bad url");
644 2 : size_t l = s - *val;
645 :
646 2 : if ((*retval = GDKmalloc(l)) != NULL) {
647 2 : strcpy_len(*retval, *val, l);
648 : }
649 : }
650 :
651 3 : if (*retval == NULL)
652 0 : throw(MAL, "url.getProtocol", SQLSTATE(HY013) MAL_MALLOC_FAIL);
653 : return MAL_SUCCEED;
654 : }
655 :
656 : /* COMMAND "getQuery": Extract the query part from the URL
657 : * SIGNATURE: getQuery(str) : str; */
658 : static str
659 6 : URLgetQuery(str *retval, url *val)
660 : {
661 6 : const char *s;
662 6 : const char *q;
663 :
664 6 : if (val == NULL || *val == NULL)
665 0 : throw(ILLARG, "url.getQuery", "url missing");
666 :
667 6 : if (strNil(*val)) {
668 1 : *retval = GDKstrdup(str_nil);
669 : } else {
670 5 : if ((s = skip_scheme(*val)) == NULL
671 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
672 5 : || (q = skip_path(s, NULL, NULL)) == NULL
673 5 : || (s = skip_search(q)) == NULL)
674 0 : throw(ILLARG, "url.getQuery", "bad url");
675 5 : if (*q == '?') {
676 3 : size_t l;
677 :
678 3 : q++;
679 3 : l = s - q;
680 3 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
681 3 : strcpy_len(*retval, q, l + 1);
682 : }
683 : } else {
684 2 : *retval = GDKstrdup(str_nil);
685 : }
686 : }
687 :
688 6 : if (*retval == NULL)
689 0 : throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
690 : return MAL_SUCCEED;
691 : }
692 :
693 : /* COMMAND "getRobotURL": Extract the location of the robot control file
694 : * SIGNATURE: getRobotURL(str) : str; */
695 : static str
696 6 : URLgetRobotURL(str *retval, url *val)
697 : {
698 6 : const char *s;
699 6 : size_t l;
700 :
701 6 : if (val == NULL || *val == NULL)
702 0 : throw(ILLARG, "url.getQuery", "url missing");
703 :
704 6 : if (strNil(*val)) {
705 1 : *retval = GDKstrdup(str_nil);
706 : } else {
707 5 : if ((s = skip_scheme(*val)) == NULL
708 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL)
709 0 : throw(ILLARG, "url.getQuery", "bad url");
710 5 : l = s - *val;
711 :
712 5 : if ((*retval = GDKmalloc(l + sizeof("/robots.txt"))) != NULL) {
713 5 : sprintf(*retval, "%.*s/robots.txt", (int) l, *val);
714 : }
715 : }
716 :
717 6 : if (*retval == NULL)
718 0 : throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
719 : return MAL_SUCCEED;
720 : }
721 :
722 : /* COMMAND "getUser": Extract the user identity from the URL
723 : * SIGNATURE: getUser(str) : str; */
724 : static str
725 9 : URLgetUser(str *retval, url *val)
726 : {
727 9 : const char *s, *h, *u, *p;
728 :
729 9 : if (val == NULL || *val == NULL)
730 0 : throw(ILLARG, "url.getUser", "url missing");
731 :
732 9 : if (strNil(*val)) {
733 1 : *retval = GDKstrdup(str_nil);
734 : } else {
735 8 : if ((s = skip_scheme(*val)) == NULL
736 8 : || (s = skip_authority(s, &u, &p, &h, NULL)) == NULL)
737 0 : throw(ILLARG, "url.getHost", "bad url");
738 8 : if (u == NULL || h == NULL) {
739 4 : *retval = GDKstrdup(str_nil);
740 : } else {
741 4 : size_t l;
742 :
743 4 : if (p) {
744 1 : l = p - u - 1;
745 : } else {
746 3 : l = h - u - 1;
747 : }
748 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
749 4 : strcpy_len(*retval, u, l + 1);
750 : }
751 : }
752 : }
753 :
754 9 : if (*retval == NULL)
755 0 : throw(MAL, "url.getUser", SQLSTATE(HY013) MAL_MALLOC_FAIL);
756 : return MAL_SUCCEED;
757 : }
758 :
759 : /* COMMAND "isaURL": Check conformity of the URL syntax
760 : * SIGNATURE: isaURL(str) : bit; */
761 : static str
762 7 : URLisaURL(bit *retval, str *val)
763 : {
764 7 : if (val == NULL || *val == NULL)
765 0 : throw(ILLARG, "url.isaURL", "url missing");
766 7 : if (strNil(*val))
767 0 : *retval = bit_nil;
768 : else
769 7 : *retval = skip_scheme(*val) != NULL;
770 : return MAL_SUCCEED;
771 : }
772 :
773 : static str
774 49 : URLnew(url *u, str *val)
775 : {
776 49 : *u = GDKstrdup(*val);
777 49 : if (*u == NULL)
778 0 : throw(MAL, "url.new", SQLSTATE(HY013) MAL_MALLOC_FAIL);
779 : return MAL_SUCCEED;
780 : }
781 :
782 : static str
783 9 : URLnew3(url *u, str *protocol, str *server, str *file)
784 : {
785 9 : str Protocol = *protocol;
786 9 : str Server = *server;
787 9 : str File = *file;
788 9 : size_t l;
789 :
790 9 : if (strNil(File))
791 : File = "";
792 2 : else if (*File == '/')
793 0 : File++;
794 9 : if (strNil(Server))
795 : Server = "";
796 9 : if (strNil(Protocol))
797 : Protocol = "";
798 9 : l = strlen(File) + strlen(Server) + strlen(Protocol) + 10;
799 9 : *u = GDKmalloc(l);
800 9 : if (*u == NULL)
801 0 : throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
802 9 : snprintf(*u, l, "%s://%s/%s", Protocol, Server, File);
803 9 : return MAL_SUCCEED;
804 : }
805 :
806 : static str
807 3 : URLnew4(url *u, str *protocol, str *server, int *port, str *file)
808 : {
809 3 : str Protocol = *protocol;
810 3 : str Server = *server;
811 3 : int Port = *port;
812 3 : str File = *file;
813 3 : size_t l;
814 :
815 3 : if (strNil(File))
816 : File = "";
817 2 : else if (*File == '/')
818 0 : File++;
819 3 : if (strNil(Server))
820 : Server = "";
821 3 : if (is_int_nil(Port))
822 1 : Port = 0;
823 3 : if (strNil(Protocol))
824 : Protocol = "";
825 3 : l = strlen(File) + strlen(Server) + strlen(Protocol) + 20;
826 3 : *u = GDKmalloc(l);
827 3 : if (*u == NULL)
828 0 : throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
829 3 : snprintf(*u, l, "%s://%s:%d/%s", Protocol, Server, Port, File);
830 3 : return MAL_SUCCEED;
831 : }
832 :
833 : static str
834 0 : URLnoop(url *u, url *val)
835 : {
836 0 : *u = GDKstrdup(*val);
837 0 : if (*u == NULL)
838 0 : throw(MAL, "url.noop", SQLSTATE(HY013) MAL_MALLOC_FAIL);
839 : return MAL_SUCCEED;
840 : }
841 :
842 :
843 : /* Extract host identity from URL. This is a relaxed version,
844 : * where no exceptions is thrown when the input URL is not valid,
845 : * and empty string is returned instead.
846 : * */
847 : static str
848 1 : extractURLHost(str *retval, str *url, bit *no_www)
849 : {
850 1 : const char *s;
851 1 : const char *h = NULL;
852 1 : const char *p = NULL;
853 :
854 2 : if (url != NULL && *url != NULL && !strNil(*url)) {
855 1 : if ((s = skip_scheme(*url)) != NULL
856 0 : && (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL
857 0 : && h != NULL) {
858 : ssize_t l;
859 : const char *pos = s;
860 0 : const char *domain = NULL;
861 0 : while (pos > h) {
862 0 : if (*pos == '.') {
863 : domain = pos;
864 : break;
865 : }
866 0 : pos--;
867 : }
868 :
869 0 : if (p != NULL) {
870 0 : l = p - h - 1;
871 : } else {
872 0 : l = s - h;
873 : }
874 0 : if (*no_www && !strncmp(h, "www.", 4)) {
875 0 : h += 4;
876 0 : l -= 4;
877 : }
878 0 : if (domain && l > 3) {
879 0 : if ((*retval = GDKmalloc(l + 1)) != NULL)
880 0 : strcpy_len(*retval, h, l + 1);
881 : } else {
882 0 : *retval = GDKstrdup(str_nil);
883 : }
884 : } else {
885 1 : *retval = GDKstrdup(str_nil);
886 : }
887 : } else {
888 0 : *retval = GDKstrdup(str_nil);
889 : }
890 1 : if (!*retval)
891 0 : throw(MAL, "url.getURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
892 :
893 : return MAL_SUCCEED;
894 : }
895 :
896 :
/* Copy s into *buf with strcpy_len(), passing l as the size (the
 * caller passes the host length plus one).
 * NOTE(review): CHECK_STR_BUFFER_LENGTH is defined outside this file;
 * presumably it grows *buf / *buflen to at least l bytes and returns
 * an exception from this function on failure -- confirm in str.h.
 * Returns MAL_SUCCEED when the copy was made. */
897 : static inline str
898 2 : str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l)
899 : {
900 2 : CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "url.str_buf_copy");
901 2 : strcpy_len(*buf, s, l);
902 2 : return MAL_SUCCEED;
903 : }
904 :
905 :
906 : // bulk version
907 : static str
908 2 : BATextractURLHost(bat *res, const bat *bid, bit *no_www)
909 : {
910 2 : const char *s;
911 2 : const char *host = NULL;
912 2 : const char *port = NULL;
913 2 : BAT *bn = NULL, *b = NULL;
914 2 : BUN p, q;
915 2 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
916 2 : str buf = GDKmalloc(buflen);
917 2 : str msg = MAL_SUCCEED;
918 2 : bool nils = false;
919 :
920 2 : if (buf == NULL)
921 0 : throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
922 :
923 2 : if (!(b = BATdescriptor(*bid))) {
924 0 : GDKfree(buf);
925 0 : throw(MAL, "baturl.extractURLHost",
926 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
927 : }
928 2 : if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) {
929 0 : GDKfree(buf);
930 0 : BBPunfix(b->batCacheid);
931 0 : throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
932 : }
933 :
934 2 : BATiter bi = bat_iterator(b);
935 4 : BATloop(b, p, q) {
936 2 : const char *url = (const char *) BUNtvar(bi, p);
937 2 : if (strNil(url)) {
938 0 : if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
939 0 : msg = createException(MAL, "baturl.extractURLHost",
940 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
941 0 : break;
942 : }
943 : nils = true;
944 : } else {
945 2 : if ((s = skip_scheme(url)) != NULL
946 2 : && (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL
947 2 : && host != NULL) {
948 : ssize_t l;
949 : const char *pos = s;
950 18 : const char *domain = NULL;
951 18 : while (pos > host) {
952 18 : if (*pos == '.') {
953 : domain = pos;
954 : break;
955 : }
956 16 : pos--;
957 : }
958 :
959 2 : if (port != NULL) {
960 2 : l = port - host - 1;
961 : } else {
962 0 : l = s - host;
963 : }
964 2 : if (domain && l > 3) {
965 2 : if (*no_www && !strncmp(host, "www.", 4)) {
966 1 : host += 4;
967 1 : l -= 4;
968 : }
969 2 : if (l > 0) {
970 : // if ((msg = str_Sub_String(&buf, &buflen, host, 0, l)) != MAL_SUCCEED)
971 : // break;
972 2 : if ((msg = str_buf_copy(&buf, &buflen, host,
973 2 : (size_t) (l + 1))) != MAL_SUCCEED)
974 : break;
975 2 : if (bunfastapp_nocheckVAR(bn, buf) != GDK_SUCCEED) {
976 0 : msg = createException(MAL, "baturl.extractURLHost",
977 : SQLSTATE(HY013)
978 : MAL_MALLOC_FAIL);
979 0 : break;
980 : }
981 2 : continue;
982 : }
983 : }
984 : }
985 : // fall back insert nil str if no valid host
986 0 : if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
987 0 : msg = createException(MAL, "baturl.extractURLHost",
988 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
989 0 : break;
990 : }
991 : nils = true;
992 : }
993 : }
994 2 : bat_iterator_end(&bi);
995 :
996 2 : GDKfree(buf);
997 2 : if (msg == MAL_SUCCEED) {
998 2 : BATsetcount(bn, q);
999 2 : bn->tnil = nils;
1000 2 : bn->tnonil = !nils;
1001 2 : bn->tkey = BATcount(bn) <= 1;
1002 2 : bn->tsorted = BATcount(bn) <= 1;
1003 2 : bn->trevsorted = BATcount(bn) <= 1;
1004 2 : *res = bn->batCacheid;
1005 2 : BBPkeepref(bn);
1006 : }
1007 2 : BBPunfix(b->batCacheid);
1008 2 : return msg;
1009 : }
1010 :
1011 :
1012 : #include "mel.h"
/* Atom table of the module: a single atom "url" with base type "str"
 * that uses URLfromString / URLtoString for conversion from and to
 * text.  The entry with .cmp=NULL ends the table. */
1013 : mel_atom url_init_atoms[] = {
1014 : { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, }, { .cmp=NULL }
1015 : };
/* Function table of the module: each command() entry gives the MAL
 * module and function name, the C function implementing it, a help
 * text and the result/argument types.  The entry with .imp=NULL ends
 * the table. */
1016 : mel_func url_init_funcs[] = {
1017 : command("url", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
1018 : command("url", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
1019 : command("calc", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
1020 : command("calc", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
1021 : command("url", "getAnchor", URLgetAnchor, false, "Extract the URL anchor (reference)", args(1,2, arg("",str),arg("u",url))),
1022 : command("url", "getBasename", URLgetBasename, false, "Extract the URL base file name", args(1,2, arg("",str),arg("u",url))),
1023 : command("url", "getContext", URLgetContext, false, "Get the path context of a URL", args(1,2, arg("",str),arg("u",url))),
1024 : command("url", "getDomain", URLgetDomain, false, "Extract Internet domain from the URL", args(1,2, arg("",str),arg("u",url))),
1025 : command("url", "getExtension", URLgetExtension, false, "Extract the file extension of the URL", args(1,2, arg("",str),arg("u",url))),
1026 : command("url", "getFile", URLgetFile, false, "Extract the last file name of the URL", args(1,2, arg("",str),arg("u",url))),
1027 : command("url", "getHost", URLgetHost, false, "Extract the server name from the URL strict version", args(1,2, arg("",str),arg("u",url))),
1028 : command("url", "getPort", URLgetPort, false, "Extract the port id from the URL", args(1,2, arg("",str),arg("u",url))),
1029 : command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol from the URL", args(1,2, arg("",str),arg("u",url))),
1030 : command("url", "getQuery", URLgetQuery, false, "Extract the query string from the URL", args(1,2, arg("",str),arg("u",url))),
1031 : command("url", "getUser", URLgetUser, false, "Extract the user identity from the URL", args(1,2, arg("",str),arg("u",url))),
1032 : command("url", "getRobotURL", URLgetRobotURL, false, "Extract the location of the robot control file", args(1,2, arg("",str),arg("u",url))),
1033 : command("url", "isaURL", URLisaURL, false, "Check conformity of the URL syntax", args(1,2, arg("",bit),arg("u",str))),
1034 : command("url", "new", URLnew4, false, "Construct URL from protocol, host, port, and file", args(1,5, arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))),
1035 : command("url", "new", URLnew3, false, "Construct URL from protocol, host,and file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))),
1036 : command("url", "extractURLHost", extractURLHost, false, "Extract host from a URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))),
1037 : command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", bit))),
1038 : { .imp=NULL }
1039 : };
1040 : #include "mal_import.h"
1041 : #ifdef _MSC_VER
1042 : #undef read
1043 : #pragma section(".CRT$XCU",read)
1044 : #endif
/* Module start-up hook: registers the "url" module together with its
 * atom and function tables through mal_module().
 * NOTE(review): LIB_STARTUP_FUNC is defined outside this file;
 * presumably it makes this run when the library is loaded -- confirm
 * in mal_import.h. */
1045 329 : LIB_STARTUP_FUNC(init_url_mal)
1046 329 : { mal_module("url", url_init_atoms, url_init_funcs); }
|