// cangjie_stdx/src/stdx/fuzz/native/utf8_fix.c - code preview - cangjie_stdx: extended standard library project based on the Cangjie programming language - AtomGit

/*
 * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
 * This source file is part of the Cangjie project, licensed under Apache-2.0
 * with Runtime Library Exception.
 *
 * See https://cangjie-lang.cn/pages/LICENSE for license information.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "utf8_fix.h"

/*
 * Returns how many consecutive 1 bits `byte` has, counted from the most significant bit
 * (0 for 0xxxxxxx, 8 for 0xFF).
 */
static inline __attribute__((always_inline)) uint8_t countl_one(uint8_t byte)
{
    // Leading ones of `byte` are leading zeros of its complement.
    const uint8_t flipped = (uint8_t)~byte;

    // An all-ones input flips to 0, for which __builtin_clz has no defined result,
    // so answer that case directly.
    if (flipped == 0) {
        return 8;
    }

    // __builtin_clz works on a 32-bit unsigned value; the 24 bits above our 8-bit
    // operand are always zero and must not be counted.
    return (uint8_t)(__builtin_clz(flipped) - 24);
}

// Rewrites *byte in place into the UTF-8 continuation form 10xxxxxx:
// the low six bits are kept, bit 7 is set and bit 6 is cleared.
static inline __attribute__((always_inline)) void ForceContinuationByte(uint8_t* byte)
{
    const uint8_t payload = (uint8_t)(*byte & 0x3Fu);
    *byte = (uint8_t)(payload | 0x80u);
}

/*
 * Repairs a byte buffer in place so that its processed prefix is well-formed UTF-8.
 *
 * The buffer is consumed one sequence at a time. The number of leading 1 bits of the
 * first byte selects the sequence length; the bytes that follow are forced into the
 * 10xxxxxx continuation form, and individual bits are adjusted whenever the sequence
 * would otherwise be an overlong encoding, a surrogate (U+D800..U+DFFF) or a code point
 * above U+10FFFF.
 *
 * data:     buffer to repair in place; NULL is accepted and yields 0.
 * data_len: number of bytes available in data.
 *
 * Returns the number of leading bytes that were processed. Bytes in [0, result) have been
 * repaired. If the buffer ends in the middle of a multi-byte sequence, processing stops
 * there and the bytes from `result` onwards are left unchanged.
 */
int64_t CJ_fix_to_utf8(uint8_t* data, int64_t data_len)
{
    if (data == NULL) {
        return 0;
    }

    int64_t pos = 0;
    while (pos < data_len) {
        uint8_t* const seq = data + pos;
        const int64_t remaining = data_len - pos;
        const uint8_t lead_ones = countl_one(seq[0]);
        uint8_t lead = seq[0];

        switch (lead_ones) {
            case 0: {
                // 0xxxxxxx: single-byte (ASCII) sequence, already valid.
                pos += 1;
                break;
            }
            case 1:
            case 2: {
                // Two-byte sequence. A byte of the form 10xxxxxx cannot start a sequence,
                // so it is promoted to a two-byte lead and then handled like 110xxxxx.
                if (remaining < 2) {
                    return pos; // truncated sequence: leave the tail untouched
                }
                ForceContinuationByte(seq + 1);
                if (lead_ones == 1) {
                    // 10xxxxxx -> 110xxxxx
                    lead = (uint8_t)((lead & 0x1Fu) | 0xC0u);
                }
                // A lead of 1100000x would encode a code point below 2^7 (overlong);
                // setting bit 1 moves it out of that range.
                if ((lead & 0x1Eu) == 0) {
                    lead |= 0x02u;
                }
                seq[0] = lead;
                pos += 2;
                break;
            }
            case 3: {
                // Three-byte sequence: 1110xxxx 10yyyyyy 10zzzzzz.
                if (remaining < 3) {
                    return pos; // truncated sequence: leave the tail untouched
                }
                ForceContinuationByte(seq + 1);
                ForceContinuationByte(seq + 2);

                const uint8_t top = lead & 0x0Fu;
                if (top == 0x00u) {
                    // xxxx == 0000 with the top y bit clear would encode a code point
                    // below 2^11 (overlong); make sure that bit is set.
                    seq[1] |= 0x20u;
                } else if (top == 0x0Du) {
                    // xxxx == 1101 with the top y bit set would land in the surrogate
                    // range [0xD800, 0xE000); make sure that bit is clear.
                    seq[1] &= 0xDFu;
                }
                pos += 3;
                break;
            }
            case 4: {
                // Four-byte sequence: 11110xxx 10yyyyyy 10zzzzzz 10wwwwww.
                if (remaining < 4) {
                    return pos; // truncated sequence: leave the tail untouched
                }
                ForceContinuationByte(seq + 1);
                ForceContinuationByte(seq + 2);
                ForceContinuationByte(seq + 3);

                const uint8_t plane = lead & 0x07u;
                if (plane > 0x04u) {
                    // xxx may only be 000..100; clearing bit 2 maps 101/110/111 to
                    // 001/010/011, which need no further adjustment.
                    seq[0] = (uint8_t)(lead & 0xFBu);
                } else if (plane == 0x00u) {
                    // xxx == 000 with the two top y bits clear would encode a code point
                    // below 2^16 (overlong); set bit 4 of the second byte in that case.
                    if ((seq[1] & 0x30u) == 0) {
                        seq[1] |= 0x10u;
                    }
                } else if (plane == 0x04u) {
                    // xxx == 100 is only valid up to U+10FFFF, i.e. while the two top
                    // y bits are clear; clear them to stay below 0x110000.
                    seq[1] &= 0xCFu;
                }
                pos += 4;
                break;
            }
            default: {
                // Five or more leading 1 bits (111110xx, 1111110x, 11111110, 11111111)
                // is never a valid lead byte: drop the top bit to turn it into ASCII.
                seq[0] = (uint8_t)(lead & 0x7Fu);
                pos += 1;
                break;
            }
        }
    }

    // Everything in [0, pos) has been repaired; nothing past pos was modified.
    return pos;
}