feat(utf-8 string): implemented validation of utf-8 string

This commit is contained in:
Mineplay 2025-05-11 15:24:45 -05:00
parent e9d8cdd8a3
commit 13a95d9027
4 changed files with 77 additions and 6 deletions

View file

@ -23,7 +23,6 @@
* Author: Mineplay
* -----------------------------------------------------------------------------
*/
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@ -38,7 +37,7 @@ typedef struct {
FledastyError fledasty_utf8_string_initialize(FledastyUtf8String *new_string, unsigned char *character_string, size_t character_string_length);
FledastyError fledasty_utf8_string_destroy(FledastyUtf8String *current_string);
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, size_t size);
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, const size_t size);
uint32_t *fledasty_utf8_string_decode(FledastyUtf8String *current_string, size_t *unicode_string_length);
FledastyError fledasty_utf8_string_append(FledastyUtf8String *current_string, unsigned char *character_string);
bool fledasty_utf8_string_validate(unsigned char *character_string, const size_t character_string_length);

View file

@ -30,6 +30,7 @@ typedef enum {
FLEDASTY_ERROR_INDEX_OUT_OF_RANGE = 3,
FLEDASTY_ERROR_VALUE_NOT_FOUND = 4,
FLEDASTY_ERROR_KEY_NOT_FOUND = 5,
FLEDASTY_ERROR_INVALID_VALUE = 6,
} FledastyError;
#endif

View file

@ -35,12 +35,16 @@ FledastyError fledasty_utf8_string_initialize(FledastyUtf8String *new_string, un
return FLEDASTY_ERROR_INVALID_POINTER;
}
if (character_string == NULL) {
if (character_string == NULL || character_string_length == 0) {
new_string->size = 0;
new_string->capacity = 10;
new_string->character_string = hallocy_malloc(new_string->capacity);
} else {
if (!fledasty_utf8_string_validate(character_string, character_string_length)) {
return FLEDASTY_ERROR_INVALID_VALUE;
}
new_string->size = character_string_length;
new_string->capacity = new_string->size + new_string->size;
@ -68,7 +72,7 @@ FledastyError fledasty_utf8_string_destroy(FledastyUtf8String *current_string) {
return FLEDASTY_ERROR_NONE;
}
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, size_t size) {
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, const size_t size) {
FledastyUtf8String utf8_string;
fledasty_utf8_string_initialize(&utf8_string, NULL, 0);
@ -130,7 +134,7 @@ FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, size_t size) {
}
uint32_t *fledasty_utf8_string_decode(FledastyUtf8String *current_string, size_t *unicode_string_length) {
if (current_string == NULL) {
if (current_string == NULL || unicode_string_length == NULL) {
return NULL;
}
@ -157,3 +161,54 @@ uint32_t *fledasty_utf8_string_decode(FledastyUtf8String *current_string, size_t
return unicode_string;
}
/**
 * Checks whether a byte buffer holds well-formed UTF-8.
 *
 * A sequence is accepted only when:
 *   - its lead byte announces a 1, 2, 3 or 4 byte sequence,
 *   - every announced continuation byte is present and has the form 10xxxxxx,
 *   - the decoded code point uses the shortest possible encoding,
 *   - the decoded code point is not a UTF-16 surrogate (U+D800..U+DFFF),
 *   - the decoded code point does not exceed U+10FFFF.
 *
 * @param character_string        Bytes to inspect; they are only read.
 * @param character_string_length Number of bytes in character_string.
 * @return true when every sequence in the buffer is well-formed (an empty
 *         buffer is valid); false when character_string is NULL or any
 *         sequence is malformed or truncated.
 */
bool fledasty_utf8_string_validate(unsigned char *character_string, const size_t character_string_length) {
    if (character_string == NULL) {
        return false;
    }

    size_t index = 0;
    while (index < character_string_length) {
        const unsigned char lead_byte = character_string[index];

        /* Plain ASCII: a complete code point on its own. */
        if (lead_byte < 0x80) {
            index += 1;
            continue;
        }

        size_t continuation_count = 0;
        uint32_t code_point = 0;
        uint32_t minimum_code_point = 0;
        if ((lead_byte & 0xE0) == 0xC0) {
            /* 110xxxxx: two byte sequence. */
            continuation_count = 1;
            code_point = lead_byte & 0x1F;
            minimum_code_point = 0x80;
        } else if ((lead_byte & 0xF0) == 0xE0) {
            /* 1110xxxx: three byte sequence. */
            continuation_count = 2;
            code_point = lead_byte & 0x0F;
            minimum_code_point = 0x800;
        } else if ((lead_byte & 0xF8) == 0xF0) {
            /* 11110xxx: four byte sequence. */
            continuation_count = 3;
            code_point = lead_byte & 0x07;
            minimum_code_point = 0x10000;
        } else {
            /* Stray continuation byte (10xxxxxx) or invalid lead 0xF8..0xFF. */
            return false;
        }

        /* index < length holds here, so the subtraction cannot wrap around. */
        if (continuation_count > character_string_length - index - 1) {
            return false;
        }

        for (size_t offset = 1; offset <= continuation_count; offset++) {
            const unsigned char continuation_byte = character_string[index + offset];
            if ((continuation_byte & 0xC0) != 0x80) {
                return false;
            }

            code_point = (code_point << 6) | (uint32_t)(continuation_byte & 0x3F);
        }

        /* Reject overlong forms: the value would have fit in a shorter sequence. */
        if (code_point < minimum_code_point) {
            return false;
        }

        /* Reject surrogates and values beyond the Unicode code space. */
        if ((code_point >= 0xD800 && code_point <= 0xDFFF) || code_point > 0x10FFFF) {
            return false;
        }

        index += continuation_count + 1;
    }

    return true;
}

View file

@ -235,6 +235,22 @@ int main() {
printf("%s\n", encoded_string.character_string);
if (fledasty_utf8_string_validate(test_utf8_string.character_string, encoded_string.size)) {
printf("UTF-8 test string is valid!\n");
}
if (fledasty_utf8_string_validate(encoded_string.character_string, encoded_string.size)) {
printf("UTF-8 encoded string is valid!\n");
}
unsigned char *invalid_utf8 = (unsigned char*)hallocy_malloc(2 * sizeof(unsigned char));
invalid_utf8[0] = 0xDF;
invalid_utf8[1] = 0xFF;
if (!fledasty_utf8_string_validate(invalid_utf8, 2)) {
printf("UTF-8 invalid string is invalid!\n");
}
hallocy_free(invalid_utf8);
hallocy_free(unicode);
fledasty_utf8_string_destroy(&encoded_string);