LCOV - code coverage report
Current view: top level - sql/backends/monet5/UDF/pyapi3 - unicode3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 26 132 19.7 %
Date: 2024-11-14 20:04:02 Functions: 2 10 20.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include "unicode.h"
      15             : 
      16             : #include <string.h>
      17             : 
      18           0 : int utf8_strlen(const char *utf8_str, bool *ascii)
      19             : {
      20           0 :         int utf8_char_count = 0;
      21           0 :         int i = 0;
      22             :         // we traverse the string and simply count the amount of utf8 characters in
      23             :         // the string
      24           0 :         while (true) {
      25           0 :                 int offset;
      26           0 :                 if (utf8_str[i] == '\0')
      27             :                         break;
      28           0 :                 offset = utf8_length(utf8_str[i]);
      29           0 :                 if (offset < 0)
      30             :                         return -1; // invalid utf8 character
      31           0 :                 i += offset;
      32           0 :                 utf8_char_count++;
      33             :         }
      34           0 :         if (ascii != NULL)
      35           0 :                 *ascii = i == utf8_char_count;
      36             :         return utf8_char_count;
      37             : }
      38             : 
      39           0 : size_t utf32_strlen(const wchar_t *utf32_str)
      40             : {
      41           0 :         size_t i = 0;
      42           0 :         while (utf32_str[i] != 0)
      43           0 :                 i++;
      44           0 :         return i;
      45             : }
      46             : 
      47           0 : int utf8_length(unsigned char utf8_char)
      48             : {
      49             :         // the first byte tells us how many bytes the utf8 character uses
      50           0 :         if (utf8_char < 0x80)
      51             :                 return 1;
      52           0 :         else if (utf8_char < 0xe0)
      53             :                 return 2;
      54           0 :         else if (utf8_char < 0xf0)
      55             :                 return 3;
      56           0 :         else if (utf8_char < 0xf8)
      57             :                 return 4;
      58             :         else
      59           0 :                 return -1; // invalid utf8 character, the maximum value of the first
      60             :                                    // byte is 0xf7
      61             : }
      62             : 
      63     5000116 : int utf32_char_to_utf8_char(size_t position, char *utf8_storage,
      64             :                                                         unsigned int utf32_char)
      65             : {
      66     5000116 :         int utf8_size = 4;
      67     5000116 :         if (utf32_char < 0x80)
      68             :                 utf8_size = 1;
      69           2 :         else if (utf32_char < 0x800)
      70             :                 utf8_size = 2;
      71           0 :         else if (utf32_char < 0x10000)
      72             :                 utf8_size = 3;
      73           0 :         else if (utf32_char > 0x0010FFFF)
      74             :                 return -1; // utf32 character is out of legal range
      75             : 
      76     5000116 :         switch (utf8_size) {
      77             :                 case 4:
      78           0 :                         utf8_storage[position + 3] = ((utf32_char | 0x80) & 0xbf);
      79           0 :                         utf32_char >>= 6;
      80           0 :                         utf8_storage[position + 2] = ((utf32_char | 0x80) & 0xbf);
      81           0 :                         utf32_char >>= 6;
      82           0 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      83           0 :                         utf32_char >>= 6;
      84           0 :                         utf8_storage[position] = (utf32_char | 0xf0);
      85           0 :                         return utf8_size;
      86             :                 case 3:
      87           0 :                         utf8_storage[position + 2] = ((utf32_char | 0x80) & 0xbf);
      88           0 :                         utf32_char >>= 6;
      89           0 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      90           0 :                         utf32_char >>= 6;
      91           0 :                         utf8_storage[position] = (utf32_char | 0xe0);
      92           0 :                         return utf8_size;
      93             :                 case 2:
      94           2 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      95           2 :                         utf32_char >>= 6;
      96           2 :                         utf8_storage[position] = (utf32_char | 0xc0);
      97           2 :                         return utf8_size;
      98             :                 default:
      99     5000114 :                         utf8_storage[position] = (char)utf32_char;
     100     5000114 :                         return utf8_size;
     101             :         }
     102             : }
     103             : 
     104           0 : bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage,
     105             :                                   const wchar_t *ucs2)
     106             : {
     107           0 :         size_t i = 0;
     108           0 :         int position = 0;
     109           0 :         int shift;
     110           0 :         for (i = 0; i < size; i++) {
     111           0 :                 if (ucs2[offset + i] == 0) {
     112           0 :                         utf8_storage[position] = '\0';
     113           0 :                         return true;
     114             :                 }
     115           0 :                 shift =
     116           0 :                         utf32_char_to_utf8_char(position, utf8_storage, ucs2[offset + i]);
     117           0 :                 if (shift < 0)
     118             :                         return false;
     119           0 :                 position += shift;
     120             :         }
     121           0 :         utf8_storage[position] = '\0';
     122           0 :         return true;
     123             : }
     124             : 
     125     1000022 : bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage,
     126             :                                    const wchar_t *utf32_input)
     127             : {
     128     1000022 :         size_t i = 0;
     129     1000022 :         int position = 0;
     130     1000022 :         int shift;
     131     1000022 :         unsigned int *utf32 = (unsigned int *)utf32_input;
     132             : 
     133     6000138 :         for (i = 0; i < size; i++) {
     134     5000126 :                 if (utf32[offset + i] == 0) {
     135          10 :                         utf8_storage[position] = '\0';
     136          10 :                         return true;
     137             :                 }
     138             : 
     139     5000116 :                 shift =
     140     5000116 :                         utf32_char_to_utf8_char(position, utf8_storage, utf32[offset + i]);
     141     5000116 :                 if (shift < 0)
     142             :                         return false;
     143     5000116 :                 position += shift;
     144             :         }
     145     1000012 :         utf8_storage[position] = '\0';
     146     1000012 :         return true;
     147             : }
     148             : 
     149           0 : bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage,
     150             :                                          const wchar_t *unicode)
     151             : {
     152             : #if SIZEOF_WCHAR_T == 2
     153             :         return ucs2_to_utf8(offset, size, utf8_storage, unicode);
     154             : #else
     155           0 :         return utf32_to_utf8(offset, size, utf8_storage, unicode);
     156             : #endif
     157             : }
     158             : 
     159           0 : int utf8_char_to_utf32_char(size_t position, wchar_t *utf32_storage,
     160             :                                                         int offset, const unsigned char *utf8_char)
     161             : {
     162           0 :         unsigned char bytes[4];
     163           0 :         int utf8_size = 4;
     164           0 :         bytes[0] = utf8_char[offset];
     165           0 :         bytes[1] = 0xFF;
     166           0 :         bytes[2] = 0xFF;
     167           0 :         bytes[3] = 0xFF;
     168             :         // the first byte tells us how many bytes the utf8 character uses
     169           0 :         if (bytes[0] < 0x80)
     170             :                 utf8_size = 1;
     171           0 :         else if (bytes[0] < 0xe0)
     172             :                 utf8_size = 2;
     173           0 :         else if (bytes[0] < 0xf0)
     174             :                 utf8_size = 3;
     175           0 :         else if (bytes[0] < 0xf8)
     176             :                 utf8_size = 4;
     177             :         else
     178             :                 return -1; // invalid utf8 character, the maximum value of the first
     179             :                                    // byte is 0xf7
     180             : 
     181             : #if SIZEOF_WCHAR_T == 2
     182             :         if (utf8_size > 2) {
     183             :                 // utf-8 character out of range on a UCS2 python compilation
     184             :                 return -1;
     185             :         }
     186             : #endif
     187             : 
     188           0 :         switch (utf8_size) {
     189             :                 case 4:
     190           0 :                         bytes[3] = utf8_char[offset + 3];
     191           0 :                         if (bytes[3] > 0xc0)
     192             :                                 return -1; // invalid utf8 character, the maximum value of the
     193             :                                                    // second, third and fourth bytes is 0xbf
     194             :                         /* fall through */
     195             :                 case 3:
     196           0 :                         bytes[2] = utf8_char[offset + 2];
     197           0 :                         if (bytes[2] > 0xc0)
     198             :                                 return -1;
     199             :                         /* fall through */
     200             :                 case 2:
     201           0 :                         bytes[1] = utf8_char[offset + 1];
     202           0 :                         if (bytes[1] > 0xc0)
     203             :                                 return -1;
     204             :         }
     205             : 
     206           0 :         utf32_storage[position] = 0;
     207             : 
     208           0 :         switch (utf8_size) {
     209           0 :                 case 4:
     210           0 :                         utf32_storage[position] |= (0x3f & bytes[3]);
     211           0 :                         utf32_storage[position] |= (0x3f & bytes[2]) << 6;
     212           0 :                         utf32_storage[position] |= (0x3f & bytes[1]) << 12;
     213           0 :                         utf32_storage[position] |= (0x7 & bytes[0]) << 18;
     214           0 :                         return utf8_size;
     215           0 :                 case 3:
     216           0 :                         utf32_storage[position] |= (0x3f & bytes[2]);
     217           0 :                         utf32_storage[position] |= (0x3f & bytes[1]) << 6;
     218           0 :                         utf32_storage[position] |= (0xf & bytes[0]) << 12;
     219           0 :                         return utf8_size;
     220           0 :                 case 2:
     221           0 :                         utf32_storage[position] |= (0x3f & bytes[1]);
     222           0 :                         utf32_storage[position] |= (0x1f & bytes[0]) << 6;
     223           0 :                         return utf8_size;
     224           0 :                 default:
     225           0 :                         utf32_storage[position] |= 0x7f & bytes[0];
     226           0 :                         return utf8_size;
     227             :         }
     228             : }
     229             : 
     230           0 : bool utf8_to_utf32(size_t offset, size_t size, wchar_t *utf32_storage,
     231             :                                    const unsigned char *utf8)
     232             : {
     233           0 :         size_t i = 0;
     234           0 :         int position = 0;
     235           0 :         int shift;
     236           0 :         for (i = 0; i < size; i++) {
     237           0 :                 if (utf8[offset + position] == 0) {
     238           0 :                         utf32_storage[i] = '\0';
     239           0 :                         return true;
     240             :                 }
     241             : 
     242           0 :                 shift = utf8_char_to_utf32_char((int)i, utf32_storage,
     243           0 :                                                                                 (int)(offset + position), utf8);
     244           0 :                 if (shift < 0)
     245             :                         return false;
     246           0 :                 position += shift;
     247             :         }
     248             :         return true;
     249             : }
     250             : 
     251           0 : void _unicode_init(void) { _import_array(); }

Generated by: LCOV version 1.14