Line data Source code
/*
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2024 MonetDB Foundation;
 * Copyright August 2008 - 2023 MonetDB B.V.;
 * Copyright 1997 - July 2008 CWI.
 */

/* include guard: this header defines static inline functions and
 * macros, so including it twice in one translation unit would otherwise
 * be a redefinition error */
#ifndef MUTF8_H
#define MUTF8_H

/* size_t and the fixed-width integer types used below; included here so
 * that this header does not depend on what the includer pulled in first */
#include <stddef.h>
#include <stdint.h>

/* return display width of Unicode codepoint c */
extern int charwidth(uint32_t c);

/* decode UTF-8 string byte by byte into *state and *codep, returns
 * state; UTF-8 sequence is complete (and value is in *codep) when state
 * is UTF8_ACCEPT, incorrect when state is UTF8_REJECT, and incomplete
 * for any other value of state */

/* this function and the table are copyright Bjoern Hoehrmann per the
 * below notice. The layout was changed. */

// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

/* lookup table, defined elsewhere: entries 0..255 are indexed by byte
 * value, entries from 256 onwards are indexed by state + type */
extern const uint8_t utf8d[364];
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/* feed one byte into the decoder
 *
 * state - in/out: decoder state; the callers in this file start a new
 *         character with 0 (UTF8_ACCEPT)
 * codep - in/out: codepoint accumulated so far
 * byte  - the next input byte; the callers in this file pass a value
 *         that was cast to uint8_t, and the function itself does not
 *         range check it before using it as a table index
 *
 * returns the new value of *state */
static inline uint32_t
decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
	uint32_t type = utf8d[byte];

	/* in the middle of a sequence: shift in the low six bits of this
	 * byte; at the start of a sequence: keep only the bits of the
	 * byte that remain after masking with 0xff >> type */
	*codep = (*state != UTF8_ACCEPT) ?
		(byte & 0x3fu) | (*codep << 6) :
		(0xff >> type) & (byte);

	*state = utf8d[256 + *state + type];
	return *state;
}
/* end copyright Bjoern Hoehrmann */

/* return in *c the codepoint of the next character in string s, return
 * a pointer to the start of the following character
 *
 * if the decoder rejects the input, or if the terminating NUL byte is
 * reached before a character is complete (this includes s being the
 * empty string), *c is set to 0 and NULL is returned
 *
 * note that the returned pointer points into s but is not
 * const-qualified */
static inline char *
nextchar(const char *s, uint32_t *c)
{
	uint32_t codepoint = 0, state = 0;

	while (*s) {
		switch (decode(&state, &codepoint, (uint8_t) *s++)) {
		case UTF8_ACCEPT:
			*c = codepoint;
			return (char *) s;
		case UTF8_REJECT:
			*c = 0;
			return NULL;
		default:
			/* sequence not yet complete: need more bytes */
			break;
		}
	}
	/* ran into the end of the string without a complete character */
	*c = 0;
	return NULL;
}

/* like the above, but s is at most n bytes long
 *
 * decoding stops at a NUL byte or after n bytes, whichever comes
 * first; if no complete character was decoded by then (this includes
 * n being 0), *c is set to 0 and NULL is returned */
static inline char *
nextcharn(const char *s, size_t n, uint32_t *c)
{
	uint32_t codepoint = 0, state = 0;

	/* n is tested before it is decremented, so n == 0 ends the loop
	 * without *s being read */
	while (n-- > 0 && *s) {
		switch (decode(&state, &codepoint, (uint8_t) *s++)) {
		case UTF8_ACCEPT:
			*c = codepoint;
			return (char *) s;
		case UTF8_REJECT:
			*c = 0;
			return NULL;
		default:
			/* sequence not yet complete: need more bytes */
			break;
		}
	}
	/* ran out of input without a complete character */
	*c = 0;
	return NULL;
}

#endif /* MUTF8_H */