Hallocy/Src/Core/Memory.c

643 lines
25 KiB
C

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* -----------------------------------------------------------------------------
* File: Memory.c
* Description:
* This file implements the functions for managing memory. It includes functions
* to copy, move, compare and set memory.
*
* Author: Mineplay
* -----------------------------------------------------------------------------
*/
#include "../../Include/Hallocy/Core/Memory.h"
#include "../../Include/Hallocy/Utils/Simd.h"
/*
 * Fills a memory region with a single byte value.
 *
 * destination: start of the region to fill; must not be NULL.
 * value:       fill value; it is converted to unsigned char before use.
 * size:        number of bytes to fill.
 *
 * Returns HALLOCY_ERROR_INVALID_POINTER when destination is NULL and
 * HALLOCY_ERROR_NONE otherwise.
 *
 * The switch selects a bulk-store strategy from hallocy_is_simd_supported().
 * Each strategy writes single bytes until the destination is aligned to its
 * store width and then writes whole vectors (or machine words). Whatever is
 * left is finished by the byte loop after the switch, so a strategy is free
 * to stop early.
 */
HallocyError hallocy_set_memory(void *destination, int value, const size_t size) {
    if (destination == NULL) {
        return HALLOCY_ERROR_INVALID_POINTER;
    }

    unsigned char *destination_bytes = (unsigned char*)destination;
    unsigned char *end_address = destination_bytes + size;
    unsigned char value_bytes = (unsigned char)value;

    switch (hallocy_is_simd_supported()) {
        #if defined(LIN_NEON) || defined(WIN_NEON)
        case HALLOCY_SIMD_NEON: {
            while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                *destination_bytes = value_bytes;
                destination_bytes += 1;
            }

            uint8x16_t simd_value = vdupq_n_u8(value_bytes);
            /* Remaining byte count is end - current (never negative here). */
            while ((size_t)(end_address - destination_bytes) >= SIMD_128_WIDTH) {
                vst1q_u8(destination_bytes, simd_value);
                destination_bytes += SIMD_128_WIDTH;
            }
            break;
        }
        #elif defined(LIN_SIMD) || defined(WIN_SIMD)
        case HALLOCY_SIMD_AVX512: {
            while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
                *destination_bytes = value_bytes;
                destination_bytes += 1;
            }

            __m512i simd_value = _mm512_set1_epi8((char)value_bytes);
            /* The aligned store is safe: the loop above either aligned the
             * pointer or consumed the whole region (remaining == 0). */
            while ((size_t)(end_address - destination_bytes) >= SIMD_512_WIDTH) {
                _mm512_store_si512((__m512i*)destination_bytes, simd_value);
                destination_bytes += SIMD_512_WIDTH;
            }
            break;
        }
        case HALLOCY_SIMD_AVX2: {
            while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
                *destination_bytes = value_bytes;
                destination_bytes += 1;
            }

            __m256i simd_value = _mm256_set1_epi8((char)value_bytes);
            while ((size_t)(end_address - destination_bytes) >= SIMD_256_WIDTH) {
                _mm256_store_si256((__m256i*)destination_bytes, simd_value);
                destination_bytes += SIMD_256_WIDTH;
            }
            break;
        }
        case HALLOCY_SIMD_AVX:
        case HALLOCY_SIMD_SSE2:
        case HALLOCY_SIMD_SSE: {
            while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                *destination_bytes = value_bytes;
                destination_bytes += 1;
            }

            __m128i simd_value = _mm_set1_epi8((char)value_bytes);
            while ((size_t)(end_address - destination_bytes) >= SIMD_128_WIDTH) {
                _mm_store_si128((__m128i*)destination_bytes, simd_value);
                destination_bytes += SIMD_128_WIDTH;
            }
            break;
        }
        #endif
        default: {
            /* Scalar fallback: store one machine word at a time. */
            size_t word_size = sizeof(size_t);
            while (((size_t)destination_bytes % word_size) != 0 && destination_bytes != end_address) {
                *destination_bytes = value_bytes;
                destination_bytes += 1;
            }

            /* (size_t)-1 / 0xFF is 0x0101...01 for 8-bit bytes; multiplying
             * by the fill byte replicates it into every byte of the word. */
            size_t value_word = (size_t)value_bytes * ((size_t)-1 / 0xFF);

            size_t *destination_word = (size_t*)destination_bytes;
            while ((size_t)(end_address - (unsigned char*)destination_word) >= word_size) {
                *destination_word = value_word;
                destination_word += 1;
            }

            destination_bytes = (unsigned char*)destination_word;
            break;
        }
    }

    /* Tail: bytes that did not fill a whole vector/word. */
    while (destination_bytes != end_address) {
        *destination_bytes = value_bytes;
        destination_bytes += 1;
    }

    return HALLOCY_ERROR_NONE;
}
/*
 * Copies `size` bytes from `source` to `destination`.
 *
 * destination: start of the region to write; must not be NULL.
 * source:      start of the region to read; must not be NULL.
 * size:        number of bytes to copy.
 *
 * Returns HALLOCY_ERROR_INVALID_POINTER when either pointer is NULL and
 * HALLOCY_ERROR_NONE otherwise.
 *
 * Bytes are copied in ascending address order. The switch selects a bulk-copy
 * strategy from hallocy_is_simd_supported(): when source and destination share
 * the same misalignment, single bytes are copied until both are aligned and
 * aligned vector/word accesses are used; otherwise the x86 paths use unaligned
 * vector accesses and the scalar fallback copies byte by byte. The byte loop
 * after the switch finishes whatever a strategy left over.
 */
HallocyError hallocy_copy_memory(void *destination, void *source, const size_t size) {
    if (destination == NULL || source == NULL) {
        return HALLOCY_ERROR_INVALID_POINTER;
    }

    unsigned char *destination_bytes = (unsigned char*)destination;
    const unsigned char *source_bytes = (const unsigned char*)source;
    unsigned char *end_address = destination_bytes + size;

    switch (hallocy_is_simd_supported()) {
        #if defined(LIN_NEON) || defined(WIN_NEON)
        case HALLOCY_SIMD_NEON: {
            if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
                while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                    *destination_bytes = *source_bytes;
                    destination_bytes += 1;
                    source_bytes += 1;
                }
            }

            while ((size_t)(end_address - destination_bytes) >= SIMD_128_WIDTH) {
                /* Load 16 source bytes (not a broadcast of the first one). */
                uint8x16_t simd_value = vld1q_u8(source_bytes);
                vst1q_u8(destination_bytes, simd_value);
                destination_bytes += SIMD_128_WIDTH;
                source_bytes += SIMD_128_WIDTH;
            }
            break;
        }
        #elif defined(LIN_SIMD) || defined(WIN_SIMD)
        case HALLOCY_SIMD_AVX512: {
            if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
                while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
                    *destination_bytes = *source_bytes;
                    destination_bytes += 1;
                    source_bytes += 1;
                }

                /* Both pointers are now aligned (or nothing remains). */
                while ((size_t)(end_address - destination_bytes) >= SIMD_512_WIDTH) {
                    __m512i simd_value = _mm512_load_si512((const __m512i*)source_bytes);
                    _mm512_store_si512((__m512i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_512_WIDTH;
                    source_bytes += SIMD_512_WIDTH;
                }
            } else {
                while ((size_t)(end_address - destination_bytes) >= SIMD_512_WIDTH) {
                    __m512i simd_value = _mm512_loadu_si512((const __m512i*)source_bytes);
                    _mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_512_WIDTH;
                    source_bytes += SIMD_512_WIDTH;
                }
            }
            break;
        }
        case HALLOCY_SIMD_AVX2:
        case HALLOCY_SIMD_AVX: {
            if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_256_WIDTH) {
                while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
                    *destination_bytes = *source_bytes;
                    destination_bytes += 1;
                    source_bytes += 1;
                }

                while ((size_t)(end_address - destination_bytes) >= SIMD_256_WIDTH) {
                    __m256i simd_value = _mm256_load_si256((const __m256i*)source_bytes);
                    _mm256_store_si256((__m256i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_256_WIDTH;
                    source_bytes += SIMD_256_WIDTH;
                }
            } else {
                while ((size_t)(end_address - destination_bytes) >= SIMD_256_WIDTH) {
                    __m256i simd_value = _mm256_loadu_si256((const __m256i*)source_bytes);
                    _mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_256_WIDTH;
                    source_bytes += SIMD_256_WIDTH;
                }
            }
            break;
        }
        case HALLOCY_SIMD_SSE2:
        case HALLOCY_SIMD_SSE: {
            if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
                while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                    *destination_bytes = *source_bytes;
                    destination_bytes += 1;
                    source_bytes += 1;
                }

                while ((size_t)(end_address - destination_bytes) >= SIMD_128_WIDTH) {
                    __m128i simd_value = _mm_load_si128((const __m128i*)source_bytes);
                    _mm_store_si128((__m128i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_128_WIDTH;
                    source_bytes += SIMD_128_WIDTH;
                }
            } else {
                while ((size_t)(end_address - destination_bytes) >= SIMD_128_WIDTH) {
                    __m128i simd_value = _mm_loadu_si128((const __m128i*)source_bytes);
                    _mm_storeu_si128((__m128i*)destination_bytes, simd_value);
                    destination_bytes += SIMD_128_WIDTH;
                    source_bytes += SIMD_128_WIDTH;
                }
            }
            break;
        }
        #endif
        default: {
            /* Scalar fallback: word copies are only attempted when both
             * pointers can be word-aligned at the same time. */
            size_t word_size = sizeof(size_t);
            if ((size_t)destination_bytes % word_size == (size_t)source_bytes % word_size) {
                while (((size_t)destination_bytes % word_size) != 0 && destination_bytes != end_address) {
                    *destination_bytes = *source_bytes;
                    destination_bytes += 1;
                    source_bytes += 1;
                }

                size_t *destination_word = (size_t*)destination_bytes;
                const size_t *source_word = (const size_t*)source_bytes;
                while ((size_t)(end_address - (unsigned char*)destination_word) >= word_size) {
                    *destination_word = *source_word;
                    destination_word += 1;
                    source_word += 1;
                }

                source_bytes = (const unsigned char*)source_word;
                destination_bytes = (unsigned char*)destination_word;
            }
            break;
        }
    }

    /* Tail: bytes that did not fill a whole vector/word. */
    while (destination_bytes != end_address) {
        *destination_bytes = *source_bytes;
        destination_bytes += 1;
        source_bytes += 1;
    }

    return HALLOCY_ERROR_NONE;
}
/*
 * Copies `size` bytes from `source` to `destination`, allowing the two
 * regions to overlap.
 *
 * destination: start of the region to write; must not be NULL.
 * source:      start of the region to read; must not be NULL.
 * size:        number of bytes to move.
 *
 * Returns HALLOCY_ERROR_INVALID_POINTER when either pointer is NULL and
 * otherwise HALLOCY_ERROR_NONE (or whatever hallocy_copy_memory returns when
 * the work is delegated to it).
 *
 * Only the case source < destination < source + size needs special care,
 * because an ascending copy would overwrite source bytes before reading them.
 * That case is handled here by copying from the last byte towards the first;
 * every other layout is forwarded to hallocy_copy_memory. Inside this
 * function `destination_bytes`/`source_bytes` start one past the end of their
 * regions and are decremented before each access, and `end_address` is the
 * first byte of the destination.
 */
HallocyError hallocy_move_memory(void *destination, void *source, const size_t size) {
    if (destination == NULL || source == NULL) {
        return HALLOCY_ERROR_INVALID_POINTER;
    }

    size_t destination_address = (size_t)destination;
    size_t source_address = (size_t)source;
    /* Written as a distance check so it cannot wrap around. */
    if (!(source_address < destination_address && destination_address - source_address < size)) {
        return hallocy_copy_memory(destination, source, size);
    }

    unsigned char *end_address = (unsigned char*)destination;
    unsigned char *destination_bytes = (unsigned char*)destination + size;
    const unsigned char *source_bytes = (const unsigned char*)source + size;

    switch (hallocy_is_simd_supported()) {
        #if defined(LIN_NEON) || defined(WIN_NEON)
        case HALLOCY_SIMD_NEON: {
            if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
                while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                    destination_bytes -= 1;
                    source_bytes -= 1;
                    *destination_bytes = *source_bytes;
                }
            }

            /* Remaining byte count is current - start when walking backwards. */
            while ((size_t)(destination_bytes - end_address) >= SIMD_128_WIDTH) {
                destination_bytes -= SIMD_128_WIDTH;
                source_bytes -= SIMD_128_WIDTH;
                /* Load 16 source bytes (not a broadcast of the first one). */
                uint8x16_t simd_value = vld1q_u8(source_bytes);
                vst1q_u8(destination_bytes, simd_value);
            }
            break;
        }
        #elif defined(LIN_SIMD) || defined(WIN_SIMD)
        case HALLOCY_SIMD_AVX512: {
            if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
                while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
                    destination_bytes -= 1;
                    source_bytes -= 1;
                    *destination_bytes = *source_bytes;
                }

                while ((size_t)(destination_bytes - end_address) >= SIMD_512_WIDTH) {
                    destination_bytes -= SIMD_512_WIDTH;
                    source_bytes -= SIMD_512_WIDTH;
                    /* The whole vector is read before it is written, so an
                     * overlap inside one chunk is harmless. */
                    __m512i simd_value = _mm512_load_si512((const __m512i*)source_bytes);
                    _mm512_store_si512((__m512i*)destination_bytes, simd_value);
                }
            } else {
                while ((size_t)(destination_bytes - end_address) >= SIMD_512_WIDTH) {
                    destination_bytes -= SIMD_512_WIDTH;
                    source_bytes -= SIMD_512_WIDTH;
                    __m512i simd_value = _mm512_loadu_si512((const __m512i*)source_bytes);
                    _mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
                }
            }
            break;
        }
        case HALLOCY_SIMD_AVX2:
        case HALLOCY_SIMD_AVX: {
            if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_256_WIDTH) {
                while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
                    destination_bytes -= 1;
                    source_bytes -= 1;
                    *destination_bytes = *source_bytes;
                }

                while ((size_t)(destination_bytes - end_address) >= SIMD_256_WIDTH) {
                    destination_bytes -= SIMD_256_WIDTH;
                    source_bytes -= SIMD_256_WIDTH;
                    __m256i simd_value = _mm256_load_si256((const __m256i*)source_bytes);
                    _mm256_store_si256((__m256i*)destination_bytes, simd_value);
                }
            } else {
                while ((size_t)(destination_bytes - end_address) >= SIMD_256_WIDTH) {
                    destination_bytes -= SIMD_256_WIDTH;
                    source_bytes -= SIMD_256_WIDTH;
                    __m256i simd_value = _mm256_loadu_si256((const __m256i*)source_bytes);
                    _mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
                }
            }
            break;
        }
        case HALLOCY_SIMD_SSE2:
        case HALLOCY_SIMD_SSE: {
            if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
                while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
                    destination_bytes -= 1;
                    source_bytes -= 1;
                    *destination_bytes = *source_bytes;
                }

                while ((size_t)(destination_bytes - end_address) >= SIMD_128_WIDTH) {
                    destination_bytes -= SIMD_128_WIDTH;
                    source_bytes -= SIMD_128_WIDTH;
                    __m128i simd_value = _mm_load_si128((const __m128i*)source_bytes);
                    _mm_store_si128((__m128i*)destination_bytes, simd_value);
                }
            } else {
                while ((size_t)(destination_bytes - end_address) >= SIMD_128_WIDTH) {
                    destination_bytes -= SIMD_128_WIDTH;
                    source_bytes -= SIMD_128_WIDTH;
                    __m128i simd_value = _mm_loadu_si128((const __m128i*)source_bytes);
                    _mm_storeu_si128((__m128i*)destination_bytes, simd_value);
                }
            }
            break;
        }
        #endif
        default: {
            /* Scalar fallback: word copies are only attempted when both
             * pointers can be word-aligned at the same time. */
            size_t word_size = sizeof(size_t);
            if ((size_t)destination_bytes % word_size == (size_t)source_bytes % word_size) {
                while (((size_t)destination_bytes % word_size) != 0 && destination_bytes != end_address) {
                    destination_bytes -= 1;
                    source_bytes -= 1;
                    *destination_bytes = *source_bytes;
                }

                size_t *destination_word = (size_t*)destination_bytes;
                const size_t *source_word = (const size_t*)source_bytes;
                /* Count remaining bytes instead of forming a pointer that
                 * could lie before the start of the destination. */
                while ((size_t)((unsigned char*)destination_word - end_address) >= word_size) {
                    destination_word -= 1;
                    source_word -= 1;
                    *destination_word = *source_word;
                }

                source_bytes = (const unsigned char*)source_word;
                destination_bytes = (unsigned char*)destination_word;
            }
            break;
        }
    }

    /* Head: bytes that did not fill a whole vector/word. */
    while (destination_bytes != end_address) {
        destination_bytes -= 1;
        source_bytes -= 1;
        *destination_bytes = *source_bytes;
    }

    return HALLOCY_ERROR_NONE;
}
/*
 * Checks whether two memory regions hold identical bytes.
 *
 * left_side:  start of the first region; must not be NULL.
 * right_side: start of the second region; must not be NULL.
 * size:       number of bytes to compare.
 *
 * Returns true when the first `size` bytes of both regions are equal (which
 * includes size == 0), and false when any byte differs or either pointer is
 * NULL. Neither region is modified.
 *
 * The switch selects a bulk-compare strategy from hallocy_is_simd_supported():
 * when both pointers share the same misalignment, single bytes are compared
 * until both are aligned and aligned vector/word loads are used; otherwise the
 * x86 paths use unaligned vector loads and the scalar fallback compares byte
 * by byte. The byte loop after the switch checks whatever a strategy left
 * over.
 */
bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size) {
    if (left_side == NULL || right_side == NULL) {
        return false;
    }

    /* const views: a comparison must never write through these pointers. */
    const unsigned char *left_side_bytes = (const unsigned char*)left_side;
    const unsigned char *right_side_bytes = (const unsigned char*)right_side;
    const unsigned char *end_address = left_side_bytes + size;

    switch (hallocy_is_simd_supported()) {
        #if defined(LIN_NEON) || defined(WIN_NEON)
        case HALLOCY_SIMD_NEON: {
            if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
                while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
                    if (*left_side_bytes != *right_side_bytes) {
                        return false;
                    }

                    left_side_bytes += 1;
                    right_side_bytes += 1;
                }
            }

            while ((size_t)(end_address - left_side_bytes) >= SIMD_128_WIDTH) {
                /* Load 16 bytes from each side (not a broadcast of one). */
                uint8x16_t simd_left_side_value = vld1q_u8(left_side_bytes);
                uint8x16_t simd_right_side_value = vld1q_u8(right_side_bytes);
                /* Equal lanes become 0xFF; the minimum is 0xFF only when
                 * every lane matched. */
                uint8x16_t result = vceqq_u8(simd_left_side_value, simd_right_side_value);
                if (vminvq_u8(result) != 0xFF) {
                    return false;
                }

                left_side_bytes += SIMD_128_WIDTH;
                right_side_bytes += SIMD_128_WIDTH;
            }
            break;
        }
        #elif defined(LIN_SIMD) || defined(WIN_SIMD)
        case HALLOCY_SIMD_AVX512: {
            if ((size_t)left_side_bytes % SIMD_512_WIDTH == (size_t)right_side_bytes % SIMD_512_WIDTH) {
                while (((size_t)left_side_bytes % SIMD_512_WIDTH) != 0 && left_side_bytes != end_address) {
                    if (*left_side_bytes != *right_side_bytes) {
                        return false;
                    }

                    left_side_bytes += 1;
                    right_side_bytes += 1;
                }

                while ((size_t)(end_address - left_side_bytes) >= SIMD_512_WIDTH) {
                    __m512i simd_left_side_value = _mm512_load_si512((const __m512i*)left_side_bytes);
                    __m512i simd_right_side_value = _mm512_load_si512((const __m512i*)right_side_bytes);
                    /* XOR is all-zero exactly when both vectors are equal. */
                    __m512i result = _mm512_xor_si512(simd_left_side_value, simd_right_side_value);
                    if (_mm512_test_epi64_mask(result, result) != 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_512_WIDTH;
                    right_side_bytes += SIMD_512_WIDTH;
                }
            } else {
                while ((size_t)(end_address - left_side_bytes) >= SIMD_512_WIDTH) {
                    __m512i simd_left_side_value = _mm512_loadu_si512((const __m512i*)left_side_bytes);
                    __m512i simd_right_side_value = _mm512_loadu_si512((const __m512i*)right_side_bytes);
                    __m512i result = _mm512_xor_si512(simd_left_side_value, simd_right_side_value);
                    if (_mm512_test_epi64_mask(result, result) != 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_512_WIDTH;
                    right_side_bytes += SIMD_512_WIDTH;
                }
            }
            break;
        }
        case HALLOCY_SIMD_AVX2:
        case HALLOCY_SIMD_AVX: {
            if ((size_t)left_side_bytes % SIMD_256_WIDTH == (size_t)right_side_bytes % SIMD_256_WIDTH) {
                while (((size_t)left_side_bytes % SIMD_256_WIDTH) != 0 && left_side_bytes != end_address) {
                    if (*left_side_bytes != *right_side_bytes) {
                        return false;
                    }

                    left_side_bytes += 1;
                    right_side_bytes += 1;
                }

                while ((size_t)(end_address - left_side_bytes) >= SIMD_256_WIDTH) {
                    __m256i simd_left_side_value = _mm256_load_si256((const __m256i*)left_side_bytes);
                    __m256i simd_right_side_value = _mm256_load_si256((const __m256i*)right_side_bytes);
                    __m256i result = _mm256_xor_si256(simd_left_side_value, simd_right_side_value);
                    /* testz yields 1 when the vector is all zero. */
                    if (_mm256_testz_si256(result, result) == 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_256_WIDTH;
                    right_side_bytes += SIMD_256_WIDTH;
                }
            } else {
                while ((size_t)(end_address - left_side_bytes) >= SIMD_256_WIDTH) {
                    __m256i simd_left_side_value = _mm256_loadu_si256((const __m256i*)left_side_bytes);
                    __m256i simd_right_side_value = _mm256_loadu_si256((const __m256i*)right_side_bytes);
                    __m256i result = _mm256_xor_si256(simd_left_side_value, simd_right_side_value);
                    if (_mm256_testz_si256(result, result) == 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_256_WIDTH;
                    right_side_bytes += SIMD_256_WIDTH;
                }
            }
            break;
        }
        case HALLOCY_SIMD_SSE2:
        case HALLOCY_SIMD_SSE: {
            if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
                while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
                    if (*left_side_bytes != *right_side_bytes) {
                        return false;
                    }

                    left_side_bytes += 1;
                    right_side_bytes += 1;
                }

                while ((size_t)(end_address - left_side_bytes) >= SIMD_128_WIDTH) {
                    __m128i simd_left_side_value = _mm_load_si128((const __m128i*)left_side_bytes);
                    __m128i simd_right_side_value = _mm_load_si128((const __m128i*)right_side_bytes);
                    __m128i result = _mm_xor_si128(simd_left_side_value, simd_right_side_value);
                    if (_mm_testz_si128(result, result) == 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_128_WIDTH;
                    right_side_bytes += SIMD_128_WIDTH;
                }
            } else {
                while ((size_t)(end_address - left_side_bytes) >= SIMD_128_WIDTH) {
                    __m128i simd_left_side_value = _mm_loadu_si128((const __m128i*)left_side_bytes);
                    __m128i simd_right_side_value = _mm_loadu_si128((const __m128i*)right_side_bytes);
                    __m128i result = _mm_xor_si128(simd_left_side_value, simd_right_side_value);
                    if (_mm_testz_si128(result, result) == 0) {
                        return false;
                    }

                    left_side_bytes += SIMD_128_WIDTH;
                    right_side_bytes += SIMD_128_WIDTH;
                }
            }
            break;
        }
        #endif
        default: {
            /* Scalar fallback: word compares are only attempted when both
             * pointers can be word-aligned at the same time. */
            size_t word_size = sizeof(size_t);
            if ((size_t)left_side_bytes % word_size == (size_t)right_side_bytes % word_size) {
                while (((size_t)left_side_bytes % word_size) != 0 && left_side_bytes != end_address) {
                    /* Compare the unaligned leading bytes; they must not be
                     * skipped or overwritten. */
                    if (*left_side_bytes != *right_side_bytes) {
                        return false;
                    }

                    left_side_bytes += 1;
                    right_side_bytes += 1;
                }

                const size_t *left_side_word = (const size_t*)left_side_bytes;
                const size_t *right_side_word = (const size_t*)right_side_bytes;
                while ((size_t)(end_address - (const unsigned char*)left_side_word) >= word_size) {
                    if (*left_side_word != *right_side_word) {
                        return false;
                    }

                    left_side_word += 1;
                    right_side_word += 1;
                }

                right_side_bytes = (const unsigned char*)right_side_word;
                left_side_bytes = (const unsigned char*)left_side_word;
            }
            break;
        }
    }

    /* Tail: bytes that did not fill a whole vector/word. */
    while (left_side_bytes != end_address) {
        if (*left_side_bytes != *right_side_bytes) {
            return false;
        }

        left_side_bytes += 1;
        right_side_bytes += 1;
    }

    return true;
}