LCOV - code coverage report
Current view: top level - common/utils - mutf8.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 14 31 45.2 %
Date: 2024-12-19 20:05:57 Functions: 2 3 66.7 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /* return display width of Unicode codepoint c */
      14             : extern int charwidth(uint32_t c);
      15             : 
      16             : /* decode UTF-8 string byte by byte into *state and *codep, returns
      17             : *  state; UTF-8 sequence is complete (and value is in *codep) when state
      18             : *  is UTF8_ACCEPT, incorrect when state is UTF8_REJECT, and incomplete
      19             : *  for any other value of state */
      20             : 
      21             : /* this function and the table are copyright Bjoern Hoehrmann per the
      22             :  * below notice.  The layout was changed. */
      23             : 
      24             : // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
      25             : // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
      26             : 
      27             : extern const uint8_t utf8d[364];
      28             : #define UTF8_ACCEPT 0
      29             : #define UTF8_REJECT 12
      30             : 
      31             : static inline uint32_t
      32   138351083 : decode(uint32_t *state, uint32_t *codep, uint32_t byte)
      33             : {
      34   138351083 :         uint32_t type = utf8d[byte];
      35             : 
      36   276702166 :         *codep = (*state != UTF8_ACCEPT) ?
      37   138351083 :                 (byte & 0x3fu) | (*codep << 6) :
      38   138341738 :                 (0xff >> type) & (byte);
      39             : 
      40   138351083 :         *state = utf8d[256 + *state + type];
      41   138351083 :         return *state;
      42             : }
      43             : /* end copyright Bjoern Hoehrmann */
      44             : 
      45             : /* return in *c the codepoint of the next character in string s, return
      46             :  * a pointer to the start of the following character */
      47             : static inline char *
      48         518 : nextchar(const char *s, uint32_t *c)
      49             : {
      50         518 :         uint32_t codepoint = 0, state = 0;
      51         518 :         while (*s) {
      52         518 :                 switch (decode(&state, &codepoint, (uint8_t) *s++)) {
      53         518 :                 case UTF8_ACCEPT:
      54         518 :                         *c = codepoint;
      55         518 :                         return (char *) s;
      56           0 :                 case UTF8_REJECT:
      57           0 :                         *c = 0;
      58           0 :                         return NULL;
      59             :                 default:
      60             :                         break;
      61             :                 }
      62             :         }
      63           0 :         *c = 0;
      64           0 :         return NULL;
      65             : }
      66             : 
      67             : /* like the above, but s is at most n bytes long */
      68             : static inline char *
      69           0 : nextcharn(const char *s, size_t n, uint32_t *c)
      70             : {
      71           0 :         uint32_t codepoint = 0, state = 0;
      72           0 :         while (n-- > 0 && *s) {
      73           0 :                 switch (decode(&state, &codepoint, (uint8_t) *s++)) {
      74           0 :                 case UTF8_ACCEPT:
      75           0 :                         *c = codepoint;
      76           0 :                         return (char *) s;
      77           0 :                 case UTF8_REJECT:
      78           0 :                         *c = 0;
      79           0 :                         return NULL;
      80             :                 default:
      81             :                         break;
      82             :                 }
      83             :         }
      84           0 :         *c = 0;
      85           0 :         return NULL;
      86             : }

Generated by: LCOV version 1.14