feat(utf-8 string): implement encoding and decoding between UTF-8 strings and Unicode code points

This commit is contained in:
Mineplay 2025-05-07 04:33:25 -05:00
parent 2353358199
commit e9d8cdd8a3
3 changed files with 107 additions and 2 deletions

View file

@ -25,6 +25,7 @@
*/
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "../Utils/Error.h"
@ -37,4 +38,7 @@ typedef struct {
/*
 * Sets up new_string. Returns FLEDASTY_ERROR_INVALID_POINTER when new_string
 * is NULL. When character_string is NULL the string starts with size 0 and
 * capacity 10.
 * NOTE(review): the handling of a non-NULL character_string is not visible in
 * this chunk; presumably character_string_length bytes are copied - confirm.
 */
FledastyError fledasty_utf8_string_initialize(FledastyUtf8String *new_string, unsigned char *character_string, size_t character_string_length);
/*
 * Tears down current_string: its character_string member is reset to NULL and
 * FLEDASTY_ERROR_NONE is returned on the visible success path.
 * NOTE(review): presumably also releases the buffer and validates the pointer;
 * that part of the body is not visible in this chunk - confirm.
 */
FledastyError fledasty_utf8_string_destroy(FledastyUtf8String *current_string);

/*
 * Builds a new UTF-8 string from `size` Unicode code points read from
 * `unicode` and returns it by value. The result is NUL-terminated and its
 * `size` member counts the encoded bytes without the terminator. Code points
 * above 0x10FFFF produce no output. When `unicode` is NULL the string is
 * returned exactly as fledasty_utf8_string_initialize(&s, NULL, 0) left it.
 * The caller in the test program releases the result with
 * fledasty_utf8_string_destroy().
 */
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, size_t size);
/*
 * Decodes the bytes of current_string into an array of Unicode code points
 * obtained from hallocy_malloc(); the number of code points is stored in
 * *unicode_string_length, which must point to valid storage. Returns NULL when
 * current_string is NULL. The test program releases the returned array with
 * hallocy_free().
 */
uint32_t *fledasty_utf8_string_decode(FledastyUtf8String *current_string, size_t *unicode_string_length);

/*
 * NOTE(review): implementation not visible in this chunk. Presumably appends
 * the NUL-terminated bytes in character_string to current_string - confirm.
 */
FledastyError fledasty_utf8_string_append(FledastyUtf8String *current_string, unsigned char *character_string);

View file

@ -28,13 +28,14 @@
#include <Hallocy/Core/Allocator.h>
#include <Hallocy/Core/Memory.h>
#include <Hallocy/Utils/Error.h>
#include <stdint.h>
FledastyError fledasty_utf8_string_initialize(FledastyUtf8String *new_string, unsigned char *character_string, size_t character_string_length) {
if (new_string == NULL) {
return FLEDASTY_ERROR_INVALID_POINTER;
}
if (character_string == NULL || character_string_length == 0) {
if (character_string == NULL) {
new_string->size = 0;
new_string->capacity = 10;
@ -66,3 +67,93 @@ FledastyError fledasty_utf8_string_destroy(FledastyUtf8String *current_string) {
current_string->character_string = NULL;
return FLEDASTY_ERROR_NONE;
}
FledastyUtf8String fledasty_utf8_string_encode(uint32_t *unicode, size_t size) {
FledastyUtf8String utf8_string;
fledasty_utf8_string_initialize(&utf8_string, NULL, 0);
if (unicode == NULL) {
return utf8_string;
}
size_t string_index = 0;
for (size_t index = 0; index < size; index += 1) {
if (unicode[index] <= 0x00007F) {
if (utf8_string.capacity <= string_index) {
utf8_string.capacity += utf8_string.capacity;
utf8_string.character_string = hallocy_realloc(utf8_string.character_string, utf8_string.capacity);
}
utf8_string.character_string[string_index] = unicode[index];
string_index += 1;
} else if (unicode[index] <= 0x0007FF) {
if (utf8_string.capacity <= string_index + 2) {
utf8_string.capacity += utf8_string.capacity;
utf8_string.character_string = hallocy_realloc(utf8_string.character_string, utf8_string.capacity);
}
utf8_string.character_string[string_index] = 0xC0 | ((unicode[index] >> 6) & 0x07);
utf8_string.character_string[string_index + 1] = 0x80 | (unicode[index] & 0x3F);
string_index += 2;
} else if (unicode[index] <= 0x00FFFF) {
if (utf8_string.capacity <= string_index + 3) {
utf8_string.capacity += utf8_string.capacity;
utf8_string.character_string = hallocy_realloc(utf8_string.character_string, utf8_string.capacity);
}
utf8_string.character_string[string_index] = 0xE0 | ((unicode[index] >> 12) & 0x07);
utf8_string.character_string[string_index + 1] = 0x80 | ((unicode[index] >> 6) & 0x3F);
utf8_string.character_string[string_index + 2] = 0x80 | (unicode[index] & 0x3F);
string_index += 3;
} else if (unicode[index] <= 0x10FFFF) {
if (utf8_string.capacity <= string_index + 4) {
utf8_string.capacity += utf8_string.capacity;
utf8_string.character_string = hallocy_realloc(utf8_string.character_string, utf8_string.capacity);
}
utf8_string.character_string[string_index] = 0xF0 | ((unicode[index] >> 18) & 0x07);
utf8_string.character_string[string_index + 1] = 0x80 | ((unicode[index] >> 12) & 0x3F);
utf8_string.character_string[string_index + 2] = 0x80 | ((unicode[index] >> 6) & 0x3F);
utf8_string.character_string[string_index + 3] = 0x80 | (unicode[index] & 0x3F);
string_index += 4;
}
}
utf8_string.size = string_index;
if (utf8_string.capacity <= utf8_string.size + 1) {
utf8_string.capacity += utf8_string.capacity;
utf8_string.character_string = hallocy_realloc(utf8_string.character_string, utf8_string.capacity);
}
utf8_string.character_string[utf8_string.size] = '\0';
return utf8_string;
}
/*
 * Decodes the UTF-8 bytes stored in current_string into Unicode code points.
 *
 * Returns an array obtained from hallocy_malloc() that the caller must release
 * (the test program uses hallocy_free()), and stores the number of decoded
 * code points in *unicode_string_length.
 *
 * Returns NULL, with *unicode_string_length set to 0 when that pointer is
 * usable, if any argument is NULL, the string has no buffer, or the allocation
 * fails.
 *
 * The length of each sequence is taken from its lead byte. Continuation bytes
 * are not validated, and bytes that are not a lead byte are copied through
 * unchanged. A multi-byte sequence cut off by the end of the string is decoded
 * as a single U+FFFD replacement character instead of reading past the buffer.
 */
uint32_t *fledasty_utf8_string_decode(FledastyUtf8String *current_string, size_t *unicode_string_length) {
    if (unicode_string_length == NULL) {
        return NULL;
    }
    *unicode_string_length = 0;

    if (current_string == NULL || current_string->character_string == NULL) {
        return NULL;
    }

    const size_t byte_count = current_string->size;
    if (byte_count > SIZE_MAX / sizeof(uint32_t)) {
        return NULL;
    }

    /* Every code point consumes at least one byte, so byte_count slots always
     * suffice. Ask for one slot even for an empty string so that a NULL result
     * unambiguously means failure. */
    const size_t slot_count = byte_count > 0 ? byte_count : 1;
    uint32_t *unicode_string = hallocy_malloc(slot_count * sizeof *unicode_string);
    if (unicode_string == NULL) {
        return NULL;
    }

    size_t decoded_count = 0;
    size_t index = 0;
    while (index < byte_count) {
        const uint32_t lead = (unsigned char)current_string->character_string[index];

        size_t sequence_length;
        if (lead >= 0xF0) {
            sequence_length = 4;
        } else if (lead >= 0xE0) {
            sequence_length = 3;
        } else if (lead >= 0xC0) {
            sequence_length = 2;
        } else {
            sequence_length = 1;
        }

        if (sequence_length > byte_count - index) {
            /* Truncated trailing sequence: never read beyond the buffer. */
            unicode_string[decoded_count] = 0xFFFD;
            decoded_count += 1;
            break;
        }

        uint32_t code_point;
        switch (sequence_length) {
            case 4:
                code_point = ((lead & 0x07) << 18)
                    | ((uint32_t)(current_string->character_string[index + 1] & 0x3F) << 12)
                    | ((uint32_t)(current_string->character_string[index + 2] & 0x3F) << 6)
                    | (uint32_t)(current_string->character_string[index + 3] & 0x3F);
                break;
            case 3:
                code_point = ((lead & 0x0F) << 12)
                    | ((uint32_t)(current_string->character_string[index + 1] & 0x3F) << 6)
                    | (uint32_t)(current_string->character_string[index + 2] & 0x3F);
                break;
            case 2:
                code_point = ((lead & 0x1F) << 6)
                    | (uint32_t)(current_string->character_string[index + 1] & 0x3F);
                break;
            default:
                code_point = lead;
                break;
        }

        unicode_string[decoded_count] = code_point;
        decoded_count += 1;
        index += sequence_length;
    }

    *unicode_string_length = decoded_count;
    return unicode_string;
}

View file

@ -20,6 +20,7 @@
* -----------------------------------------------------------------------------
*/
#include <stdio.h>
#include <Hallocy/Core/Allocator.h>
#include <Fledasty/Core/Queue.h>
#include <Fledasty/Core/Stack.h>
#include <Fledasty/Core/DynamicArray.h>
@ -224,10 +225,19 @@ int main() {
fledasty_hash_table_destroy(&test_hash_table);
FledastyUtf8String test_utf8_string;
unsigned char *test_string = (unsigned char*)"😀€Testing";
fledasty_utf8_string_initialize(&test_utf8_string, test_string, 11);
fledasty_utf8_string_initialize(&test_utf8_string, test_string, 15);
printf("%s\n", test_string);
printf("%s\n", test_utf8_string.character_string);
size_t unicode_length = 0;
uint32_t *unicode = fledasty_utf8_string_decode(&test_utf8_string, &unicode_length);
FledastyUtf8String encoded_string = fledasty_utf8_string_encode(unicode, unicode_length);
printf("%s\n", encoded_string.character_string);
hallocy_free(unicode);
fledasty_utf8_string_destroy(&encoded_string);
fledasty_utf8_string_destroy(&test_utf8_string);
printf("Done\n");
return 0;