Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * M. Kersten
15 : * Y. Zhang
16 : * The URL module
17 : * The URL module contains a collection of commands to manipulate
18 : * Uniform Resource Locators - a resource on the World Wide Web-
19 : * represented as a string in Monet. The URL can represent
20 : * anything from a file, a directory or a complete movie.
21 : * This module is geared towards manipulation of their name only.
22 : * A complementary module can be used to gain access.[IOgate]
23 : *
24 : * The URL syntax is specified in RFC2396, Uniform Resource Identifiers
25 : * (URI): Generic Syntax. The URL syntax is dependent upon the scheme.
26 : * In general, a URL has the form <scheme>:<scheme-specific-part>.
27 : * Thus, accepting a valid URL is a simple process, unless the scheme
28 : * is known and scheme-specific syntax is checked (e.g., http or ftp
29 : * scheme). For the URL module implemented here, we assume some common
30 : * fields of the <scheme-specific-part> that are shared among different
31 : * schemes.
32 : *
33 : * The core of the extension involves several operators to extract
34 : * portions of the URLs for further manipulation. In particular,
35 : * the domain, the server, and the protocol, and the file extension
36 : * can be extracted without copying the complete URL from the heap
37 : * into a string variable first.
38 : *
39 : * The commands provided are based on the corresponding Java class.
40 : *
41 : * A future version should use a special atom, because this may save
42 : * considerable space. Alternatively, break the URL strings into
43 : * components and represent them with a bunch of BATs. An intermediate
44 : * step would be to refine the atom STR, then it would be possible to
45 : * redefine hashing.
46 : */
47 :
48 : #include "monetdb_config.h"
49 : #include "mal.h"
50 : #include "gdk.h"
51 : #include <ctype.h>
52 : #include "mal_exception.h"
53 : #include "str.h"
54 :
55 : typedef str url;
56 :
57 : /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ]
58 : * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */
59 :
/* Scan the scheme at the start of a URI.
 *
 * A scheme is an ASCII letter followed by any number of ASCII letters,
 * digits, '+', '-' or '.', and it must be terminated by ':'.
 *
 * uri: pointer to the start of the URI.
 * Returns a pointer to the character just after the colon, or NULL
 * when the string does not start with a scheme. */
static const char *
skip_scheme(const char *uri)
{
	const char *cur = uri;

	/* the first character has to be a letter */
	if (!(('a' <= *cur && *cur <= 'z') || ('A' <= *cur && *cur <= 'Z')))
		return NULL;

	for (cur++;; cur++) {
		char c = *cur;

		if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
			continue;
		if (isdigit((unsigned char) c) || c == '+' || c == '-' || c == '.')
			continue;
		break;
	}
	/* only a colon may end the scheme */
	return *cur == ':' ? cur + 1 : NULL;
}
76 :
/* Character classification helpers used by the URL scanners below.
 * Note that the macros evaluate their argument more than once, so the
 * argument must not have side effects. */

/* true if c is a hexadecimal digit */
#define ishex(c)		isxdigit((unsigned char) (c))

/* true if c is one of the "reserved" characters ; / ? : @ & = + $ , */
#define isreserved(c)							\
	((c) == ';' || (c) == '/' || (c) == '?' || (c) == ':' ||	\
	 (c) == '@' || (c) == '&' || (c) == '=' || (c) == '+' ||	\
	 (c) == '$' || (c) == ',')

/* true if c is an ASCII letter, a digit, or one of the "mark"
 * characters - _ . ! ~ * ' ( ) */
#define isunreserved(c)							\
	(('a' <= (c) && (c) <= 'z') ||					\
	 ('A' <= (c) && (c) <= 'Z') ||					\
	 isdigit((unsigned char) (c)) ||				\
	 (c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' ||	\
	 (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' ||	\
	 (c) == ')')
88 :
/* Scan the authority part ("//" [ USER [ ":" PASSWORD ] "@" ] HOST
 * [ ":" PORT ]) of a URI.
 *
 * uri: result of skip_scheme(), i.e. the text after "SCHEME:".
 * userp, passp, hostp, portp: optional (may be NULL) output pointers
 * that receive the start of the user, password, host and port inside
 * the input string; components that are absent are reported as NULL.
 *
 * Returns a pointer to the first character after the authority, or
 * NULL (leaving the outputs untouched) if the input does not start
 * with "//". */
static const char *
skip_authority(const char *uri, const char **userp, const char **passp,
			   const char **hostp, const char **portp)
{
	const char *first, *host;
	const char *pass = NULL, *port = NULL;

	if (uri[0] != '/' || uri[1] != '/')
		return NULL;

	uri += 2;
	first = host = uri;
	for (;;) {
		char c = *uri;
		size_t step = 1;

		if (c == '%') {
			/* only a complete %HH escape is accepted */
			if (!ishex(uri[1]) || !ishex(uri[2]))
				break;
			step = 3;
		} else if (c == ':') {
			/* before any '@' this may turn out to separate user
			 * and password; in any case it is a port candidate */
			port = uri + 1;
			if (first == host)
				pass = port;
		} else if (c == '@') {
			/* what came before was user info; the host starts here */
			host = uri + 1;
			port = NULL;
		} else if (!(isunreserved(c) || c == ';' || c == '=' || c == '+'
					 || c == '$' || c == ',')) {
			break;
		}
		uri += step;
	}

	/* without an '@' there is no user info at all */
	bool userinfo = first != host;

	if (userp)
		*userp = userinfo ? first : NULL;
	if (passp)
		*passp = userinfo ? pass : NULL;
	if (portp)
		*portp = port;
	if (hostp)
		*hostp = host;
	return uri;
}
136 :
/* Scan the path part of a URI.
 *
 * uri: result of skip_authority().
 * basep: optional output, start of the last path component (the text
 * after the last '/'), or NULL if there is no path.
 * extp: optional output, position of the first '.' inside the last
 * component that is not its first character, or NULL if there is none.
 *
 * Returns a pointer to the first character after the path (the input
 * pointer itself when the input does not start with '/'). */
static const char *
skip_path(const char *uri, const char **basep, const char **extp)
{
	const char *last = NULL, *dot = NULL;

	if (*uri == '/') {
		last = ++uri;
		for (;;) {
			char c = *uri;
			size_t step = 1;

			if (c == '/') {
				/* a new component starts: forget the extension */
				last = uri + 1;
				dot = NULL;
			} else if (c == '.') {
				/* a leading dot does not start an extension */
				if (dot == NULL && uri != last)
					dot = uri;
			} else if (c == '%') {
				if (!ishex(uri[1]) || !ishex(uri[2]))
					break;
				step = 3;
			} else if (!(isunreserved(c) || c == ':' || c == '@' || c == '&'
						 || c == '=' || c == '+' || c == '$' || c == ','
						 || c == ';')) {
				break;
			}
			uri += step;
		}
	}
	if (basep)
		*basep = last;
	if (extp)
		*extp = dot;
	return uri;
}
167 :
/* Scan the search (query) part of a URI.
 *
 * uri: result of skip_path().
 * Returns a pointer to the first character after the query; when the
 * input does not start with '?' the input pointer is returned. */
static const char *
skip_search(const char *uri)
{
	if (*uri != '?')
		return uri;

	uri++;
	for (;;) {
		char c = *uri;

		if (c == '%') {
			/* only a complete %HH escape belongs to the query */
			if (!ishex(uri[1]) || !ishex(uri[2]))
				break;
			uri += 3;
			continue;
		}
		if (!isreserved(c) && !isunreserved(c))
			break;
		uri++;
	}
	return uri;
}
182 :
183 : #if 0
184 : /*
185 : * Utilities
186 : */
187 :
/* Convert the two hexadecimal digits at what[0] and what[1] into the
 * character they encode.  The caller must make sure that both
 * characters are hexadecimal digits. */
static char
x2c(const char *what)
{
	int value = 0;

	for (int i = 0; i < 2; i++) {
		int c = what[i];
		/* clearing bit 0x20 folds lower case letters to upper case */
		int nibble = c >= 'A' ? ((c & 0xdf) - 'A') + 10 : c - '0';

		value = value * 16 + nibble;
	}
	return (char) value;
}
198 :
/* Return 0 if c may appear unescaped in the output of escape_str()
 * (alphanumeric characters and # - _ . ! ~ * ' ( )), and 1 if it has
 * to be escaped. */
static int
needEscape(char c)
{
	switch (c) {
	case '#':
	case '-':
	case '_':
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 0;
	default:
		return !isalnum((unsigned char) c);
	}
}
209 :
210 : /* COMMAND "escape": this function applies the URI escaping rules defined in
211 : * section 2 of [RFC 3986] to the string supplied as 's'.
212 : * The effect of the function is to escape a set of identified characters in
213 : * the string. Each such character is replaced in the string by an escape
214 : * sequence, which is formed by encoding the character as a sequence of octets
215 : * in UTF-8, and then reprensenting each of these octets in the form %HH.
216 : *
217 : * All characters are escaped other than:
218 : * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")"
219 : *
220 : * This function must always generate hexadecimal values using the upper-case
221 : * letters A-F.
222 : *
223 : * SIGNATURE: escape(str) : str; */
224 : static str
225 : escape_str(str *retval, const char *s)
226 : {
227 : int x, y;
228 : str res;
229 :
230 : if (!s)
231 : throw(ILLARG, "url.escape", "url missing");
232 :
233 : if (!(res = (str) GDKmalloc(strlen(s) * 3)))
234 : throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
235 : for (x = 0, y = 0; s[x]; ++x, ++y) {
236 : if (needEscape(s[x])) {
237 : if (s[x] == ' ') {
238 : res[y] = '+';
239 : } else {
240 : sprintf(res + y, "%%%2x", (uint8_t) s[x]);
241 : y += 2;
242 : }
243 : } else {
244 : res[y] = s[x];
245 : }
246 : }
247 : res[y] = '\0';
248 :
249 : if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
250 : GDKfree(res);
251 : throw(MAL, "url.escape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
252 : }
253 : return MAL_SUCCEED;
254 : }
255 :
256 : /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters.
257 : * All sequences of the form "% HEX HEX" are unescaped.
258 : * SIGNATURE: unescape(str) : str; */
259 : static str
260 : unescape_str(str *retval, const char *s)
261 : {
262 : int x, y;
263 : str res;
264 :
265 : if (!s)
266 : throw(ILLARG, "url.escape", "url missing");
267 :
268 : res = (str) GDKmalloc(strlen(s));
269 : if (!res)
270 : throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
271 :
272 : for (x = 0, y = 0; s[x]; ++x, ++y) {
273 : if (s[x] == '%') {
274 : res[y] = x2c(&s[x + 1]);
275 : x += 2;
276 : } else {
277 : res[y] = s[x];
278 : }
279 : }
280 : res[y] = '\0';
281 :
282 : if ((*retval = GDKrealloc(res, strlen(res) + 1)) == NULL) {
283 : GDKfree(res);
284 : throw(MAL, "url.unescape", SQLSTATE(HY013) MAL_MALLOC_FAIL);
285 : }
286 : return MAL_SUCCEED;
287 : }
288 : #endif
289 :
290 : /*
291 : * Wrapping
292 : * Here you find the wrappers around the V4 url library included above.
293 : */
294 :
295 : static ssize_t
296 2000013 : URLfromString(const char *src, size_t *len, void **U, bool external)
297 : {
298 2000013 : char **u = (char **) U;
299 2000013 : size_t l = strlen(src) + 1;
300 :
301 2000013 : if (*len < l || *u == NULL) {
302 25 : GDKfree(*u);
303 25 : *u = GDKmalloc(l);
304 25 : if (*u == NULL)
305 : return -1;
306 25 : *len = l;
307 : }
308 :
309 : /* actually parse the message for valid url */
310 :
311 2000013 : if (external && strcmp(src, "nil") == 0)
312 0 : strcpy(*u, str_nil);
313 : else
314 2000013 : memcpy(*u, src, l);
315 2000013 : return (ssize_t) l - 1;
316 : }
317 :
318 : static ssize_t
319 198 : URLtoString(str *s, size_t *len, const void *SRC, bool external)
320 : {
321 198 : const char *src = SRC;
322 198 : size_t l = strlen(src);
323 :
324 198 : if (external)
325 188 : l += 2;
326 198 : if (l >= *len || *s == NULL) {
327 18 : GDKfree(*s);
328 18 : *s = GDKmalloc(l + 1);
329 18 : if (*s == NULL)
330 : return -1;
331 18 : *len = l + 1;
332 : }
333 :
334 198 : if (external) {
335 188 : if (strNil(src)) {
336 0 : strcpy(*s, "nil");
337 0 : return 3;
338 : }
339 188 : snprintf(*s, l + 1, "\"%s\"", src);
340 : } else {
341 10 : strcpy(*s, src);
342 : }
343 198 : return (ssize_t) l;
344 : }
345 :
346 : /* COMMAND "getAnchor": Extract an anchor (reference) from the URL
347 : * SIGNATURE: getAnchor(url) : str; */
348 : static str
349 6 : URLgetAnchor(str *retval, const url *val)
350 : {
351 6 : const char *s;
352 :
353 6 : if (val == NULL || *val == NULL)
354 0 : throw(ILLARG, "url.getAnchor", "url missing");
355 :
356 6 : if (strNil(*val)) {
357 : s = str_nil;
358 : } else {
359 5 : if ((s = skip_scheme(*val)) == NULL
360 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
361 5 : || (s = skip_path(s, NULL, NULL)) == NULL
362 5 : || (s = skip_search(s)) == NULL)
363 0 : throw(ILLARG, "url.getAnchor", "bad url");
364 5 : if (*s == '#')
365 2 : s++;
366 : else
367 : s = str_nil;
368 : }
369 :
370 6 : if ((*retval = GDKstrdup(s)) == NULL)
371 0 : throw(MAL, "url.getAnchor", SQLSTATE(HY013) MAL_MALLOC_FAIL);
372 : return MAL_SUCCEED;
373 : }
374 :
375 : /* COMMAND "getBasename": Extract the base of the last file name of the URL,
376 : * thus, excluding the file extension.
377 : * SIGNATURE: getBasename(str) : str; */
378 : static str
379 6 : URLgetBasename(str *retval, const url *val)
380 : {
381 6 : const char *s;
382 6 : const char *b = NULL;
383 6 : const char *e = NULL;
384 :
385 6 : if (val == NULL || *val == NULL)
386 0 : throw(ILLARG, "url.getBasename", "url missing");
387 :
388 6 : if (strNil(*val)) {
389 1 : *retval = GDKstrdup(str_nil);
390 : } else {
391 5 : if ((s = skip_scheme(*val)) == NULL
392 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
393 5 : || (s = skip_path(s, &b, &e)) == NULL)
394 0 : throw(ILLARG, "url.getBasename", "bad url");
395 5 : if (b == NULL) {
396 1 : *retval = GDKstrdup(str_nil);
397 : } else {
398 4 : size_t l;
399 :
400 4 : if (e != NULL) {
401 3 : l = e - b;
402 : } else {
403 1 : l = s - b;
404 : }
405 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
406 4 : strcpy_len(*retval, b, l + 1);
407 : }
408 : }
409 : }
410 :
411 6 : if (*retval == NULL)
412 0 : throw(MAL, "url.getBasename", SQLSTATE(HY013) MAL_MALLOC_FAIL);
413 : return MAL_SUCCEED;
414 : }
415 :
416 : /* COMMAND "getContext": Extract the path context from the URL
417 : * SIGNATURE: getContext(str) : str; */
418 : static str
419 6 : URLgetContext(str *retval, const url *val)
420 : {
421 6 : const char *s;
422 6 : const char *p;
423 :
424 6 : if (val == NULL || *val == NULL)
425 0 : throw(ILLARG, "url.getContext", "url missing");
426 :
427 6 : if (strNil(*val)) {
428 1 : *retval = GDKstrdup(str_nil);
429 : } else {
430 5 : if ((s = skip_scheme(*val)) == NULL
431 5 : || (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
432 5 : || (s = skip_path(p, NULL, NULL)) == NULL)
433 0 : throw(ILLARG, "url.getContext", "bad url");
434 5 : if (p == s) {
435 1 : *retval = GDKstrdup(str_nil);
436 4 : } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) {
437 4 : strcpy_len(*retval, p, s - p + 1);
438 : }
439 : }
440 :
441 6 : if (*retval == NULL)
442 0 : throw(MAL, "url.getContext", SQLSTATE(HY013) MAL_MALLOC_FAIL);
443 : return MAL_SUCCEED;
444 : }
445 :
446 : /* COMMAND "getExtension": Extract the file extension of the URL
447 : * SIGNATURE: getExtension(str) : str; */
448 : static str
449 6 : URLgetExtension(str *retval, const url *val)
450 : {
451 6 : const char *s;
452 6 : const char *e = NULL;
453 :
454 6 : if (val == NULL || *val == NULL)
455 0 : throw(ILLARG, "url.getExtension", "url missing");
456 :
457 6 : if (strNil(*val)) {
458 1 : *retval = GDKstrdup(str_nil);
459 : } else {
460 5 : if ((s = skip_scheme(*val)) == NULL
461 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
462 5 : || (s = skip_path(s, NULL, &e)) == NULL)
463 0 : throw(ILLARG, "url.getExtension", "bad url");
464 5 : if (e == NULL) {
465 2 : *retval = GDKstrdup(str_nil);
466 : } else {
467 3 : size_t l = s - e;
468 :
469 3 : assert(*e == '.');
470 3 : if ((*retval = GDKmalloc(l)) != NULL) {
471 3 : strcpy_len(*retval, e + 1, l);
472 : }
473 : }
474 : }
475 :
476 6 : if (*retval == NULL)
477 0 : throw(MAL, "url.getExtension", SQLSTATE(HY013) MAL_MALLOC_FAIL);
478 : return MAL_SUCCEED;
479 : }
480 :
481 : /* COMMAND "getFile": Extract the last file name of the URL
482 : * SIGNATURE: getFile(str) : str; */
483 : static str
484 6 : URLgetFile(str *retval, const url *val)
485 : {
486 6 : const char *s;
487 6 : const char *b = NULL;
488 :
489 6 : if (val == NULL || *val == NULL)
490 0 : throw(ILLARG, "url.getFile", "url missing");
491 :
492 6 : if (strNil(*val)) {
493 1 : *retval = GDKstrdup(str_nil);
494 : } else {
495 5 : if ((s = skip_scheme(*val)) == NULL
496 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
497 5 : || (s = skip_path(s, &b, NULL)) == NULL)
498 0 : throw(ILLARG, "url.getFile", "bad url");
499 5 : if (b == NULL) {
500 1 : *retval = GDKstrdup(str_nil);
501 : } else {
502 4 : size_t l;
503 :
504 4 : l = s - b;
505 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
506 4 : strcpy_len(*retval, b, l + 1);
507 : }
508 : }
509 : }
510 :
511 6 : if (*retval == NULL)
512 0 : throw(MAL, "url.getFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
513 : return MAL_SUCCEED;
514 : }
515 :
516 : /* COMMAND "getHost": Extract the server identity from the URL */
517 : /* SIGNATURE: getHost(str) : str; */
518 : static str
519 9 : URLgetHost(str *retval, const url *val)
520 : {
521 9 : const char *s;
522 9 : const char *h = NULL;
523 9 : const char *p = NULL;
524 :
525 9 : if (val == NULL || *val == NULL)
526 0 : throw(ILLARG, "url.getHost", "url missing");
527 :
528 9 : if (strNil(*val)) {
529 1 : *retval = GDKstrdup(str_nil);
530 : } else {
531 8 : if ((s = skip_scheme(*val)) == NULL
532 8 : || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
533 0 : throw(ILLARG, "url.getHost", "bad url");
534 8 : if (h == NULL) {
535 0 : *retval = GDKstrdup(str_nil);
536 : } else {
537 8 : size_t l;
538 :
539 8 : if (p != NULL) {
540 3 : l = p - h - 1;
541 : } else {
542 5 : l = s - h;
543 : }
544 8 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
545 8 : strcpy_len(*retval, h, l + 1);
546 : }
547 : }
548 : }
549 :
550 9 : if (*retval == NULL)
551 0 : throw(MAL, "url.getHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
552 : return MAL_SUCCEED;
553 : }
554 :
555 : /* COMMAND "getDomain": Extract the Internet domain from the URL
556 : * SIGNATURE: getDomain(str) : str; */
557 : static str
558 6 : URLgetDomain(str *retval, const url *val)
559 : {
560 6 : const char *s;
561 6 : const char *h = NULL;
562 6 : const char *p = NULL;
563 :
564 6 : if (val == NULL || *val == NULL)
565 0 : throw(ILLARG, "url.getDomain", "url missing");
566 :
567 6 : if (strNil(*val)) {
568 1 : *retval = GDKstrdup(str_nil);
569 : } else {
570 5 : if ((s = skip_scheme(*val)) == NULL
571 5 : || (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL)
572 0 : throw(ILLARG, "url.getDomain", "bad url");
573 5 : if (h == NULL) {
574 0 : *retval = GDKstrdup(str_nil);
575 : } else {
576 5 : size_t l;
577 :
578 5 : if (p != NULL)
579 3 : p--;
580 : else
581 2 : p = s;
582 : l = 0;
583 19 : while (p > h && p[-1] != '.') {
584 14 : p--;
585 14 : l++;
586 : }
587 5 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
588 5 : strcpy_len(*retval, p, l + 1);
589 : }
590 : }
591 : }
592 :
593 6 : if (*retval == NULL)
594 0 : throw(MAL, "url.getDomain", SQLSTATE(HY013) MAL_MALLOC_FAIL);
595 : return MAL_SUCCEED;
596 : }
597 :
598 : /* COMMAND "getPort": Extract the port id from the URL
599 : * SIGNATURE: getPort(str) : str; */
600 : static str
601 6 : URLgetPort(str *retval, const url *val)
602 : {
603 6 : const char *s;
604 6 : const char *p = NULL;
605 :
606 6 : if (val == NULL || *val == NULL)
607 0 : throw(ILLARG, "url.getPort", "url missing");
608 :
609 6 : if (strNil(*val)) {
610 1 : *retval = GDKstrdup(str_nil);
611 : } else {
612 5 : if ((s = skip_scheme(*val)) == NULL
613 5 : || (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL)
614 0 : throw(ILLARG, "url.getPort", "bad url");
615 5 : if (p == NULL) {
616 2 : *retval = GDKstrdup(str_nil);
617 : } else {
618 3 : size_t l = s - p;
619 :
620 3 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
621 3 : strcpy_len(*retval, p, l + 1);
622 : }
623 : }
624 : }
625 :
626 6 : if (*retval == NULL)
627 0 : throw(MAL, "url.getPort", SQLSTATE(HY013) MAL_MALLOC_FAIL);
628 : return MAL_SUCCEED;
629 : }
630 :
631 : /* COMMAND "getProtocol": Extract the protocol from the URL
632 : * SIGNATURE: getProtocol(str) : str; */
633 : static str
634 3 : URLgetProtocol(str *retval, const url *val)
635 : {
636 3 : const char *s;
637 :
638 3 : if (val == NULL || *val == NULL)
639 0 : throw(ILLARG, "url.getProtocol", "url missing");
640 :
641 3 : if (strNil(*val)) {
642 1 : *retval = GDKstrdup(str_nil);
643 : } else {
644 2 : if ((s = skip_scheme(*val)) == NULL)
645 0 : throw(ILLARG, "url.getProtocol", "bad url");
646 2 : size_t l = s - *val;
647 :
648 2 : if ((*retval = GDKmalloc(l)) != NULL) {
649 2 : strcpy_len(*retval, *val, l);
650 : }
651 : }
652 :
653 3 : if (*retval == NULL)
654 0 : throw(MAL, "url.getProtocol", SQLSTATE(HY013) MAL_MALLOC_FAIL);
655 : return MAL_SUCCEED;
656 : }
657 :
658 : /* COMMAND "getQuery": Extract the query part from the URL
659 : * SIGNATURE: getQuery(str) : str; */
660 : static str
661 6 : URLgetQuery(str *retval, const url *val)
662 : {
663 6 : const char *s;
664 6 : const char *q;
665 :
666 6 : if (val == NULL || *val == NULL)
667 0 : throw(ILLARG, "url.getQuery", "url missing");
668 :
669 6 : if (strNil(*val)) {
670 1 : *retval = GDKstrdup(str_nil);
671 : } else {
672 5 : if ((s = skip_scheme(*val)) == NULL
673 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL
674 5 : || (q = skip_path(s, NULL, NULL)) == NULL
675 5 : || (s = skip_search(q)) == NULL)
676 0 : throw(ILLARG, "url.getQuery", "bad url");
677 5 : if (*q == '?') {
678 3 : size_t l;
679 :
680 3 : q++;
681 3 : l = s - q;
682 3 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
683 3 : strcpy_len(*retval, q, l + 1);
684 : }
685 : } else {
686 2 : *retval = GDKstrdup(str_nil);
687 : }
688 : }
689 :
690 6 : if (*retval == NULL)
691 0 : throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
692 : return MAL_SUCCEED;
693 : }
694 :
695 : /* COMMAND "getRobotURL": Extract the location of the robot control file
696 : * SIGNATURE: getRobotURL(str) : str; */
697 : static str
698 6 : URLgetRobotURL(str *retval, const url *val)
699 : {
700 6 : const char *s;
701 6 : size_t l;
702 :
703 6 : if (val == NULL || *val == NULL)
704 0 : throw(ILLARG, "url.getQuery", "url missing");
705 :
706 6 : if (strNil(*val)) {
707 1 : *retval = GDKstrdup(str_nil);
708 : } else {
709 5 : if ((s = skip_scheme(*val)) == NULL
710 5 : || (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL)
711 0 : throw(ILLARG, "url.getQuery", "bad url");
712 5 : l = s - *val;
713 :
714 5 : if ((*retval = GDKmalloc(l + sizeof("/robots.txt"))) != NULL) {
715 5 : sprintf(*retval, "%.*s/robots.txt", (int) l, *val);
716 : }
717 : }
718 :
719 6 : if (*retval == NULL)
720 0 : throw(MAL, "url.getQuery", SQLSTATE(HY013) MAL_MALLOC_FAIL);
721 : return MAL_SUCCEED;
722 : }
723 :
724 : /* COMMAND "getUser": Extract the user identity from the URL
725 : * SIGNATURE: getUser(str) : str; */
726 : static str
727 9 : URLgetUser(str *retval, const url *val)
728 : {
729 9 : const char *s, *h, *u, *p;
730 :
731 9 : if (val == NULL || *val == NULL)
732 0 : throw(ILLARG, "url.getUser", "url missing");
733 :
734 9 : if (strNil(*val)) {
735 1 : *retval = GDKstrdup(str_nil);
736 : } else {
737 8 : if ((s = skip_scheme(*val)) == NULL
738 8 : || (s = skip_authority(s, &u, &p, &h, NULL)) == NULL)
739 0 : throw(ILLARG, "url.getHost", "bad url");
740 8 : if (u == NULL || h == NULL) {
741 4 : *retval = GDKstrdup(str_nil);
742 : } else {
743 4 : size_t l;
744 :
745 4 : if (p) {
746 1 : l = p - u - 1;
747 : } else {
748 3 : l = h - u - 1;
749 : }
750 4 : if ((*retval = GDKmalloc(l + 1)) != NULL) {
751 4 : strcpy_len(*retval, u, l + 1);
752 : }
753 : }
754 : }
755 :
756 9 : if (*retval == NULL)
757 0 : throw(MAL, "url.getUser", SQLSTATE(HY013) MAL_MALLOC_FAIL);
758 : return MAL_SUCCEED;
759 : }
760 :
761 : /* COMMAND "isaURL": Check conformity of the URL syntax
762 : * SIGNATURE: isaURL(str) : bit; */
763 : static str
764 7 : URLisaURL(bit *retval, const char *const *val)
765 : {
766 7 : if (val == NULL || *val == NULL)
767 0 : throw(ILLARG, "url.isaURL", "url missing");
768 7 : if (strNil(*val))
769 0 : *retval = bit_nil;
770 : else
771 7 : *retval = skip_scheme(*val) != NULL;
772 : return MAL_SUCCEED;
773 : }
774 :
775 : static str
776 52 : URLnew(url *u, const char *const *val)
777 : {
778 52 : *u = GDKstrdup(*val);
779 52 : if (*u == NULL)
780 0 : throw(MAL, "url.new", SQLSTATE(HY013) MAL_MALLOC_FAIL);
781 : return MAL_SUCCEED;
782 : }
783 :
784 : static str
785 9 : URLnew3(url *u, const char *const *protocol, const char *const *server, const char *const *file)
786 : {
787 9 : const char *Protocol = *protocol;
788 9 : const char *Server = *server;
789 9 : const char *File = *file;
790 9 : size_t l;
791 :
792 9 : if (strNil(File))
793 : File = "";
794 2 : else if (*File == '/')
795 0 : File++;
796 9 : if (strNil(Server))
797 : Server = "";
798 9 : if (strNil(Protocol))
799 : Protocol = "";
800 9 : l = strlen(File) + strlen(Server) + strlen(Protocol) + 10;
801 9 : *u = GDKmalloc(l);
802 9 : if (*u == NULL)
803 0 : throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
804 9 : snprintf(*u, l, "%s://%s/%s", Protocol, Server, File);
805 9 : return MAL_SUCCEED;
806 : }
807 :
808 : static str
809 3 : URLnew4(url *u, const char *const *protocol, const char *const *server, const int *port, const char *const *file)
810 : {
811 3 : const char *Protocol = *protocol;
812 3 : const char *Server = *server;
813 3 : int Port = *port;
814 3 : const char *File = *file;
815 3 : size_t l;
816 :
817 3 : if (strNil(File))
818 : File = "";
819 2 : else if (*File == '/')
820 0 : File++;
821 3 : if (strNil(Server))
822 : Server = "";
823 3 : if (is_int_nil(Port))
824 1 : Port = 0;
825 3 : if (strNil(Protocol))
826 : Protocol = "";
827 3 : l = strlen(File) + strlen(Server) + strlen(Protocol) + 20;
828 3 : *u = GDKmalloc(l);
829 3 : if (*u == NULL)
830 0 : throw(MAL, "url.newurl", SQLSTATE(HY013) MAL_MALLOC_FAIL);
831 3 : snprintf(*u, l, "%s://%s:%d/%s", Protocol, Server, Port, File);
832 3 : return MAL_SUCCEED;
833 : }
834 :
835 : static str
836 0 : URLnoop(url *u, const url *val)
837 : {
838 0 : *u = GDKstrdup(*val);
839 0 : if (*u == NULL)
840 0 : throw(MAL, "url.noop", SQLSTATE(HY013) MAL_MALLOC_FAIL);
841 : return MAL_SUCCEED;
842 : }
843 :
844 :
845 : /* Extract host identity from URL. This is a relaxed version,
846 : * where no exceptions is thrown when the input URL is not valid,
847 : * and empty string is returned instead.
848 : * */
849 : static str
850 1 : extractURLHost(str *retval, const char *const *url, const bit *no_www)
851 : {
852 1 : const char *s;
853 1 : const char *h = NULL;
854 1 : const char *p = NULL;
855 :
856 2 : if (url != NULL && *url != NULL && !strNil(*url)) {
857 1 : if ((s = skip_scheme(*url)) != NULL
858 0 : && (s = skip_authority(s, NULL, NULL, &h, &p)) != NULL
859 0 : && h != NULL) {
860 : ssize_t l;
861 : const char *pos = s;
862 0 : const char *domain = NULL;
863 0 : while (pos > h) {
864 0 : if (*pos == '.') {
865 : domain = pos;
866 : break;
867 : }
868 0 : pos--;
869 : }
870 :
871 0 : if (p != NULL) {
872 0 : l = p - h - 1;
873 : } else {
874 0 : l = s - h;
875 : }
876 0 : if (*no_www && !strncmp(h, "www.", 4)) {
877 0 : h += 4;
878 0 : l -= 4;
879 : }
880 0 : if (domain && l > 3) {
881 0 : if ((*retval = GDKmalloc(l + 1)) != NULL)
882 0 : strcpy_len(*retval, h, l + 1);
883 : } else {
884 0 : *retval = GDKstrdup(str_nil);
885 : }
886 : } else {
887 1 : *retval = GDKstrdup(str_nil);
888 : }
889 : } else {
890 0 : *retval = GDKstrdup(str_nil);
891 : }
892 1 : if (!*retval)
893 0 : throw(MAL, "url.getURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
894 :
895 : return MAL_SUCCEED;
896 : }
897 :
898 :
899 : static inline str
900 2 : str_buf_copy(str *buf, size_t *buflen, const char *s, size_t l)
901 : {
902 2 : CHECK_STR_BUFFER_LENGTH(buf, buflen, l, "url.str_buf_copy");
903 2 : strcpy_len(*buf, s, l);
904 2 : return MAL_SUCCEED;
905 : }
906 :
907 :
908 : // bulk version
909 : static str
910 2 : BATextractURLHost(bat *res, const bat *bid, const bit *no_www)
911 : {
912 2 : const char *s;
913 2 : const char *host = NULL;
914 2 : const char *port = NULL;
915 2 : BAT *bn = NULL, *b = NULL;
916 2 : BUN p, q;
917 2 : size_t buflen = INITIAL_STR_BUFFER_LENGTH;
918 2 : str buf = GDKmalloc(buflen);
919 2 : str msg = MAL_SUCCEED;
920 2 : bool nils = false;
921 :
922 2 : if (buf == NULL)
923 0 : throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
924 :
925 2 : if (!(b = BATdescriptor(*bid))) {
926 0 : GDKfree(buf);
927 0 : throw(MAL, "baturl.extractURLHost",
928 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
929 : }
930 2 : if ((bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT)) == NULL) {
931 0 : GDKfree(buf);
932 0 : BBPunfix(b->batCacheid);
933 0 : throw(MAL, "baturl.extractURLHost", SQLSTATE(HY013) MAL_MALLOC_FAIL);
934 : }
935 :
936 2 : BATiter bi = bat_iterator(b);
937 4 : BATloop(b, p, q) {
938 2 : const char *url = (const char *) BUNtvar(bi, p);
939 2 : if (strNil(url)) {
940 0 : if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
941 0 : msg = createException(MAL, "baturl.extractURLHost",
942 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
943 0 : break;
944 : }
945 : nils = true;
946 : } else {
947 2 : if ((s = skip_scheme(url)) != NULL
948 2 : && (s = skip_authority(s, NULL, NULL, &host, &port)) != NULL
949 2 : && host != NULL) {
950 : ssize_t l;
951 : const char *pos = s;
952 18 : const char *domain = NULL;
953 18 : while (pos > host) {
954 18 : if (*pos == '.') {
955 : domain = pos;
956 : break;
957 : }
958 16 : pos--;
959 : }
960 :
961 2 : if (port != NULL) {
962 2 : l = port - host - 1;
963 : } else {
964 0 : l = s - host;
965 : }
966 2 : if (domain && l > 3) {
967 2 : if (*no_www && !strncmp(host, "www.", 4)) {
968 1 : host += 4;
969 1 : l -= 4;
970 : }
971 2 : if (l > 0) {
972 : // if ((msg = str_Sub_String(&buf, &buflen, host, 0, l)) != MAL_SUCCEED)
973 : // break;
974 2 : if ((msg = str_buf_copy(&buf, &buflen, host,
975 2 : (size_t) (l + 1))) != MAL_SUCCEED)
976 : break;
977 2 : if (bunfastapp_nocheckVAR(bn, buf) != GDK_SUCCEED) {
978 0 : msg = createException(MAL, "baturl.extractURLHost",
979 : SQLSTATE(HY013)
980 : MAL_MALLOC_FAIL);
981 0 : break;
982 : }
983 2 : continue;
984 : }
985 : }
986 : }
987 : // fall back insert nil str if no valid host
988 0 : if (bunfastapp_nocheckVAR(bn, str_nil) != GDK_SUCCEED) {
989 0 : msg = createException(MAL, "baturl.extractURLHost",
990 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
991 0 : break;
992 : }
993 : nils = true;
994 : }
995 : }
996 2 : bat_iterator_end(&bi);
997 :
998 2 : GDKfree(buf);
999 2 : if (msg == MAL_SUCCEED) {
1000 2 : BATsetcount(bn, q);
1001 2 : bn->tnil = nils;
1002 2 : bn->tnonil = !nils;
1003 2 : bn->tkey = BATcount(bn) <= 1;
1004 2 : bn->tsorted = BATcount(bn) <= 1;
1005 2 : bn->trevsorted = BATcount(bn) <= 1;
1006 2 : *res = bn->batCacheid;
1007 2 : BBPkeepref(bn);
1008 : }
1009 2 : BBPunfix(b->batCacheid);
1010 2 : return msg;
1011 : }
1012 :
1013 :
1014 : #include "mel.h"
1015 : mel_atom url_init_atoms[] = {
1016 : { .name="url", .basetype="str", .fromstr=URLfromString, .tostr=URLtoString, }, { .cmp=NULL }
1017 : };
1018 : mel_func url_init_funcs[] = {
1019 : command("url", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
1020 : command("url", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
1021 : command("calc", "url", URLnew, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",str))),
1022 : command("calc", "url", URLnoop, false, "Create an URL from a string literal", args(1,2, arg("",url),arg("s",url))),
1023 : command("url", "getAnchor", URLgetAnchor, false, "Extract the URL anchor (reference)", args(1,2, arg("",str),arg("u",url))),
1024 : command("url", "getBasename", URLgetBasename, false, "Extract the URL base file name", args(1,2, arg("",str),arg("u",url))),
1025 : command("url", "getContext", URLgetContext, false, "Get the path context of a URL", args(1,2, arg("",str),arg("u",url))),
1026 : command("url", "getDomain", URLgetDomain, false, "Extract Internet domain from the URL", args(1,2, arg("",str),arg("u",url))),
1027 : command("url", "getExtension", URLgetExtension, false, "Extract the file extension of the URL", args(1,2, arg("",str),arg("u",url))),
1028 : command("url", "getFile", URLgetFile, false, "Extract the last file name of the URL", args(1,2, arg("",str),arg("u",url))),
1029 : command("url", "getHost", URLgetHost, false, "Extract the server name from the URL strict version", args(1,2, arg("",str),arg("u",url))),
1030 : command("url", "getPort", URLgetPort, false, "Extract the port id from the URL", args(1,2, arg("",str),arg("u",url))),
1031 : command("url", "getProtocol", URLgetProtocol, false, "Extract the protocol from the URL", args(1,2, arg("",str),arg("u",url))),
1032 : command("url", "getQuery", URLgetQuery, false, "Extract the query string from the URL", args(1,2, arg("",str),arg("u",url))),
1033 : command("url", "getUser", URLgetUser, false, "Extract the user identity from the URL", args(1,2, arg("",str),arg("u",url))),
1034 : command("url", "getRobotURL", URLgetRobotURL, false, "Extract the location of the robot control file", args(1,2, arg("",str),arg("u",url))),
1035 : command("url", "isaURL", URLisaURL, false, "Check conformity of the URL syntax", args(1,2, arg("",bit),arg("u",str))),
1036 : command("url", "new", URLnew4, false, "Construct URL from protocol, host, port, and file", args(1,5, arg("",url),arg("p",str),arg("h",str),arg("prt",int),arg("f",str))),
1037 : command("url", "new", URLnew3, false, "Construct URL from protocol, host,and file", args(1,4, arg("",url),arg("prot",str),arg("host",str),arg("fnme",str))),
1038 : command("url", "extractURLHost", extractURLHost, false, "Extract host from a URL relaxed version", args(1,3, arg("",str),arg("u",str), arg("no_www", bit))),
1039 : command("baturl", "extractURLHost", BATextractURLHost, false, "Extract host from BAT of URLs", args(1,3, batarg("",str), batarg("s",str), arg("no_www", bit))),
1040 : { .imp=NULL }
1041 : };
1042 : #include "mal_import.h"
1043 : #ifdef _MSC_VER
1044 : #undef read
1045 : #pragma section(".CRT$XCU",read)
1046 : #endif
1047 323 : LIB_STARTUP_FUNC(init_url_mal)
1048 323 : { mal_module("url", url_init_atoms, url_init_funcs); }
|