Compare commits
No commits in common. "main" and "V1.0.0" have entirely different histories.
10 changed files with 240 additions and 239 deletions
11
.clang-tidy
11
.clang-tidy
|
|
@ -1,11 +0,0 @@
|
|||
Checks: >
|
||||
clang-analyzer-*,
|
||||
bugprone-*,
|
||||
performance-*,
|
||||
readability-*,
|
||||
portability-*,
|
||||
cppcoreguidelines-*,
|
||||
-clang-analyzer-osx*,
|
||||
-cppcoreguidelines-pro-type-vararg
|
||||
WarningsAsErrors: ''
|
||||
FormatStyle: file
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1,3 +1 @@
|
|||
Build
|
||||
compile_commands.json
|
||||
.cache
|
||||
|
|
@ -16,13 +16,11 @@ target_link_libraries(HallocyTest Hallocy)
|
|||
if (MSVC)
|
||||
target_compile_options(Hallocy PRIVATE /W4 /Zl)
|
||||
else()
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
||||
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl -march=native)
|
||||
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl -march=native)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
target_compile_options(Hallocy PRIVATE -mfpu=neon)
|
||||
target_compile_options(HallocyTest PRIVATE -mfpu=neon)
|
||||
endif()
|
||||
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl)
|
||||
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl)
|
||||
|
||||
target_compile_options(Hallocy PRIVATE -march=native)
|
||||
target_compile_options(HallocyTest PRIVATE -march=native)
|
||||
|
||||
target_compile_options(Hallocy PRIVATE -Wall -Wextra -pedantic)
|
||||
endif()
|
||||
|
|
@ -28,11 +28,11 @@
|
|||
|
||||
#include "../Utils/Error.h"
|
||||
|
||||
void *hallocy_allocate(size_t size, bool zero_memory);
|
||||
void *hallocy_allocate(const size_t size, const bool zero_memory);
|
||||
|
||||
static inline void *hallocy_malloc(const size_t size) { return hallocy_allocate(size, false); }
|
||||
static inline void *hallocy_calloc(const size_t size, size_t count) { return hallocy_allocate(size * count, true); }
|
||||
void *hallocy_realloc(void *memory_pointer, size_t size);
|
||||
void *hallocy_realloc(void *memory_pointer, const size_t size);
|
||||
HallocyError hallocy_free(void *pointer);
|
||||
|
||||
#endif
|
||||
|
|
@ -28,9 +28,9 @@
|
|||
|
||||
#include "../Utils/Error.h"
|
||||
|
||||
HallocyError hallocy_set_memory(void *destination, int value, size_t size);
|
||||
HallocyError hallocy_copy_memory(void *destination, void *source, size_t size);
|
||||
HallocyError hallocy_move_memory(void *destination, void *source, size_t size);
|
||||
bool hallocy_compare_memory(void *left_side, void *right_side, size_t size);
|
||||
HallocyError hallocy_set_memory(void *destination, int value, const size_t size);
|
||||
HallocyError hallocy_copy_memory(void *destination, void *source, const size_t size);
|
||||
HallocyError hallocy_move_memory(void *destination, void *source, const size_t size);
|
||||
bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size);
|
||||
|
||||
#endif
|
||||
|
|
@ -23,27 +23,22 @@
|
|||
#ifndef HALLOCY_SIMD
|
||||
#define HALLOCY_SIMD
|
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
|
||||
#define WIN_NEON
|
||||
#include <arm_neon.h>
|
||||
#elif defined(_MSC_VER)
|
||||
#define WIN_SIMD
|
||||
#if defined(_MSC_VER)
|
||||
#if defined(_M_ARM64)
|
||||
#include <arm64intr.h>
|
||||
#else
|
||||
#include <intrin.h>
|
||||
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||
#define LIN_SIMD
|
||||
#include <immintrin.h>
|
||||
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
||||
#define LIN_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#else
|
||||
#warning "SIMD is unsupported by this architecture or compiler (only x86/x64/ARM/ARM64 supported)."
|
||||
#if defined(__aarch64__)
|
||||
#include <arm64intr.h>
|
||||
#elif defined(__arm__)
|
||||
#include <arm_neon.h>
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define SIMD_64_WIDTH 8
|
||||
#define SIMD_128_WIDTH 16
|
||||
#define SIMD_256_WIDTH 32
|
||||
#define SIMD_512_WIDTH 64
|
||||
|
||||
typedef enum {
|
||||
HALLOCY_SIMD_UNDEFINED = 0,
|
||||
HALLOCY_SIMD_NONE = 1,
|
||||
|
|
@ -55,6 +50,6 @@ typedef enum {
|
|||
HALLOCY_SIMD_NEON = 7
|
||||
} HallocySimdType;
|
||||
|
||||
HallocySimdType hallocy_is_simd_supported(void);
|
||||
HallocySimdType hallocy_is_simd_supported();
|
||||
|
||||
#endif
|
||||
|
|
@ -55,6 +55,7 @@ static _Thread_local size_t hallocy_small_memory_freed = 0;
|
|||
static _Thread_local size_t hallocy_small_memory_allocated = 0;
|
||||
static _Thread_local HallocyMemoryHeader *hallocy_small_memory_bin = NULL;
|
||||
|
||||
static size_t page_size = 0;
|
||||
static size_t hallocy_small_allocation_size = 0;
|
||||
static size_t hallocy_medium_allocation_size = 0;
|
||||
|
||||
|
|
@ -69,7 +70,6 @@ static BOOL CALLBACK hallocy_initialize_mutex(PINIT_ONCE init_once, PVOID parame
|
|||
#endif
|
||||
|
||||
void *hallocy_allocate(const size_t size, const bool zero_memory) {
|
||||
static size_t page_size = 0;
|
||||
if (page_size == 0) {
|
||||
#if defined(_WIN32)
|
||||
SYSTEM_INFO system_info;
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@
|
|||
#include "../../Include/Hallocy/Core/Memory.h"
|
||||
#include "../../Include/Hallocy/Utils/Simd.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stddef.h>
|
||||
|
||||
HallocyError hallocy_set_memory(void *destination, int value, const size_t size) {
|
||||
if (destination == NULL) {
|
||||
return HALLOCY_ERROR_INVALID_POINTER;
|
||||
|
|
@ -34,46 +37,46 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
|
|||
unsigned char value_bytes = (unsigned char)value;
|
||||
|
||||
switch (hallocy_is_simd_supported()) {
|
||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
||||
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||
case HALLOCY_SIMD_NEON: {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = value_bytes;
|
||||
destination_bytes += 1;
|
||||
}
|
||||
|
||||
uint8x16_t simd_value = vdupq_n_u8(value_bytes);
|
||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
while (destination_bytes - end_address >= 16) {
|
||||
vst1q_u8(destination_bytes, simd_value);
|
||||
destination_bytes += SIMD_128_WIDTH;
|
||||
destination_bytes += 16;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
||||
#else
|
||||
case HALLOCY_SIMD_AVX512: {
|
||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = value_bytes;
|
||||
destination_bytes += 1;
|
||||
}
|
||||
|
||||
__m512i simd_value = _mm512_set1_epi8((char)value_bytes);
|
||||
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
|
||||
__m512i simd_value = _mm512_set1_epi8(value_bytes);
|
||||
while (destination_bytes - end_address >= 64) {
|
||||
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
|
||||
destination_bytes += SIMD_512_WIDTH;
|
||||
destination_bytes += 64;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case HALLOCY_SIMD_AVX2: {
|
||||
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = value_bytes;
|
||||
destination_bytes += 1;
|
||||
}
|
||||
|
||||
__m256i simd_value = _mm256_set1_epi8((char)value_bytes);
|
||||
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
|
||||
__m256i simd_value = _mm256_set1_epi8(value_bytes);
|
||||
while (destination_bytes - end_address >= 32) {
|
||||
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
|
||||
destination_bytes += SIMD_256_WIDTH;
|
||||
destination_bytes += 32;
|
||||
}
|
||||
|
||||
break;
|
||||
|
|
@ -82,15 +85,15 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
|
|||
case HALLOCY_SIMD_AVX:
|
||||
case HALLOCY_SIMD_SSE2:
|
||||
case HALLOCY_SIMD_SSE: {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = value_bytes;
|
||||
destination_bytes += 1;
|
||||
}
|
||||
|
||||
__m128i simd_value = _mm_set1_epi8((char)value_bytes);
|
||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
__m128i simd_value = _mm_set1_epi8(value_bytes);
|
||||
while (destination_bytes - end_address >= 16) {
|
||||
_mm_store_si128((__m128i*)destination_bytes, simd_value);
|
||||
destination_bytes += SIMD_128_WIDTH;
|
||||
destination_bytes += 16;
|
||||
}
|
||||
|
||||
break;
|
||||
|
|
@ -105,7 +108,7 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
|
|||
|
||||
size_t value_word = 0;
|
||||
for (size_t i = 0; i < word_size; i++) {
|
||||
value_word |= (size_t)value_bytes << (i * SIMD_64_WIDTH);
|
||||
value_word |= (size_t)value_bytes << (i * 8);
|
||||
}
|
||||
|
||||
size_t *destination_word = (size_t*)destination_bytes;
|
||||
|
|
@ -138,10 +141,10 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
|||
unsigned char *end_address = destination_bytes + size;
|
||||
|
||||
switch (hallocy_is_simd_supported()) {
|
||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
||||
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||
case HALLOCY_SIMD_NEON: {
|
||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 16) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = *source_bytes;
|
||||
destination_bytes += 1;
|
||||
source_bytes += 1;
|
||||
|
|
@ -149,39 +152,39 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
|||
}
|
||||
|
||||
uint8x16_t simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
simd_value = vdupq_n_u8(*source_bytes);
|
||||
while (destination_bytes - end_address >= 16) {
|
||||
simd_value = vdupq_n_u8(source_bytes);
|
||||
vst1q_u8(destination_bytes, simd_value);
|
||||
destination_bytes += SIMD_128_WIDTH;
|
||||
source_bytes += SIMD_128_WIDTH;
|
||||
destination_bytes += 16;
|
||||
source_bytes += 16;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
||||
#else
|
||||
case HALLOCY_SIMD_AVX512: {
|
||||
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 64 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = *source_bytes;
|
||||
destination_bytes += 1;
|
||||
source_bytes += 1;
|
||||
}
|
||||
|
||||
__m512i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
|
||||
while (destination_bytes - end_address >= 64) {
|
||||
simd_value = _mm512_load_si512((__m512i*)source_bytes);
|
||||
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_512_WIDTH;
|
||||
source_bytes += SIMD_512_WIDTH;
|
||||
destination_bytes += 64;
|
||||
source_bytes += 64;
|
||||
}
|
||||
} else {
|
||||
__m512i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
|
||||
while (destination_bytes - end_address >= 64) {
|
||||
simd_value = _mm512_loadu_si512((__m512i*)source_bytes);
|
||||
_mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_512_WIDTH;
|
||||
source_bytes += SIMD_512_WIDTH;
|
||||
destination_bytes += 64;
|
||||
source_bytes += 64;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
@ -189,29 +192,29 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
|||
|
||||
case HALLOCY_SIMD_AVX2:
|
||||
case HALLOCY_SIMD_AVX: {
|
||||
if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_256_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 32 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = *source_bytes;
|
||||
destination_bytes += 1;
|
||||
source_bytes += 1;
|
||||
}
|
||||
|
||||
__m256i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
|
||||
while (destination_bytes - end_address >= 32) {
|
||||
simd_value = _mm256_load_si256((__m256i*)source_bytes);
|
||||
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_256_WIDTH;
|
||||
source_bytes += SIMD_256_WIDTH;
|
||||
destination_bytes += 32;
|
||||
source_bytes += 32;
|
||||
}
|
||||
} else {
|
||||
__m256i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
|
||||
while (destination_bytes - end_address >= 32) {
|
||||
simd_value = _mm256_loadu_si256((__m256i*)source_bytes);
|
||||
_mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_256_WIDTH;
|
||||
source_bytes += SIMD_256_WIDTH;
|
||||
destination_bytes += 32;
|
||||
source_bytes += 32;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
@ -219,29 +222,29 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
|||
|
||||
case HALLOCY_SIMD_SSE2:
|
||||
case HALLOCY_SIMD_SSE: {
|
||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
*destination_bytes = *source_bytes;
|
||||
destination_bytes += 1;
|
||||
source_bytes += 1;
|
||||
}
|
||||
|
||||
__m128i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
while (destination_bytes - end_address >= 16) {
|
||||
simd_value = _mm_load_si128((__m128i*)source_bytes);
|
||||
_mm_store_si128((__m128i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_128_WIDTH;
|
||||
source_bytes += SIMD_128_WIDTH;
|
||||
destination_bytes += 16;
|
||||
source_bytes += 16;
|
||||
}
|
||||
} else {
|
||||
__m128i simd_value;
|
||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
while (destination_bytes - end_address >= 16) {
|
||||
simd_value = _mm_loadu_si128((__m128i*)source_bytes);
|
||||
_mm_storeu_si128((__m128i*)destination_bytes, simd_value);
|
||||
|
||||
destination_bytes += SIMD_128_WIDTH;
|
||||
source_bytes += SIMD_128_WIDTH;
|
||||
destination_bytes += 16;
|
||||
source_bytes += 16;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
@ -296,10 +299,10 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
|||
unsigned char *source_bytes = (unsigned char*)source + size;
|
||||
|
||||
switch (hallocy_is_simd_supported()) {
|
||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
||||
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||
case HALLOCY_SIMD_NEON: {
|
||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 16) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
destination_bytes -= 1;
|
||||
source_bytes -= 1;
|
||||
*destination_bytes = *source_bytes;
|
||||
|
|
@ -307,37 +310,37 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
|||
}
|
||||
|
||||
uint8x16_t simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
|
||||
destination_bytes -= SIMD_128_WIDTH;
|
||||
source_bytes -= SIMD_128_WIDTH;
|
||||
while (end_address - destination_bytes >= 16) {
|
||||
destination_bytes -= 16;
|
||||
source_bytes -= 16;
|
||||
|
||||
simd_value = vdupq_n_u8(*source_bytes);
|
||||
simd_value = vdupq_n_u8(source_bytes);
|
||||
vst1q_u8(destination_bytes, simd_value);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
||||
#else
|
||||
case HALLOCY_SIMD_AVX512: {
|
||||
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 64 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
|
||||
destination_bytes -= 1;
|
||||
source_bytes -= 1;
|
||||
*destination_bytes = *source_bytes;
|
||||
}
|
||||
|
||||
__m512i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_512_WIDTH) {
|
||||
destination_bytes -= SIMD_512_WIDTH;
|
||||
source_bytes -= SIMD_512_WIDTH;
|
||||
while (end_address - destination_bytes >= 64) {
|
||||
destination_bytes -= 64;
|
||||
source_bytes -= 64;
|
||||
|
||||
simd_value = _mm512_load_si512((__m512i*)source_bytes);
|
||||
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
|
||||
}
|
||||
} else {
|
||||
__m512i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_512_WIDTH) {
|
||||
destination_bytes -= SIMD_512_WIDTH;
|
||||
source_bytes -= SIMD_512_WIDTH;
|
||||
while (end_address - destination_bytes >= 64) {
|
||||
destination_bytes -= 64;
|
||||
source_bytes -= 64;
|
||||
|
||||
simd_value = _mm512_loadu_si512((__m512i*)source_bytes);
|
||||
_mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
|
||||
|
|
@ -348,8 +351,8 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
|||
|
||||
case HALLOCY_SIMD_AVX2:
|
||||
case HALLOCY_SIMD_AVX: {
|
||||
if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 32 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
|
||||
destination_bytes -= 1;
|
||||
source_bytes -= 1;
|
||||
|
||||
|
|
@ -357,18 +360,18 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
|||
}
|
||||
|
||||
__m256i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_256_WIDTH) {
|
||||
destination_bytes -= SIMD_256_WIDTH;
|
||||
source_bytes -= SIMD_256_WIDTH;
|
||||
while (end_address - destination_bytes >= 32) {
|
||||
destination_bytes -= 32;
|
||||
source_bytes -= 32;
|
||||
|
||||
simd_value = _mm256_load_si256((__m256i*)source_bytes);
|
||||
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
|
||||
}
|
||||
} else {
|
||||
__m256i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_256_WIDTH) {
|
||||
destination_bytes -= SIMD_256_WIDTH;
|
||||
source_bytes -= SIMD_256_WIDTH;
|
||||
while (end_address - destination_bytes >= 32) {
|
||||
destination_bytes -= 32;
|
||||
source_bytes -= 32;
|
||||
|
||||
simd_value = _mm256_loadu_si256((__m256i*)source_bytes);
|
||||
_mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
|
||||
|
|
@ -379,26 +382,26 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
|||
|
||||
case HALLOCY_SIMD_SSE2:
|
||||
case HALLOCY_SIMD_SSE: {
|
||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 64) {
|
||||
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
|
||||
destination_bytes -= 1;
|
||||
source_bytes -= 1;
|
||||
*destination_bytes = *source_bytes;
|
||||
}
|
||||
|
||||
__m128i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
|
||||
destination_bytes -= SIMD_128_WIDTH;
|
||||
source_bytes -= SIMD_128_WIDTH;
|
||||
while (end_address - destination_bytes >= 16) {
|
||||
destination_bytes -= 16;
|
||||
source_bytes -= 16;
|
||||
|
||||
simd_value = _mm_load_si128((__m128i*)source_bytes);
|
||||
_mm_store_si128((__m128i*)destination_bytes, simd_value);
|
||||
}
|
||||
} else {
|
||||
__m128i simd_value;
|
||||
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
|
||||
destination_bytes -= SIMD_128_WIDTH;
|
||||
source_bytes -= SIMD_128_WIDTH;
|
||||
while (end_address - destination_bytes >= 16) {
|
||||
destination_bytes -= 16;
|
||||
source_bytes -= 16;
|
||||
|
||||
simd_value = _mm_loadu_si128((__m128i*)source_bytes);
|
||||
_mm_storeu_si128((__m128i*)destination_bytes, simd_value);
|
||||
|
|
@ -452,10 +455,10 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
unsigned char *end_address = left_side_bytes + size;
|
||||
|
||||
switch (hallocy_is_simd_supported()) {
|
||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
||||
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||
case HALLOCY_SIMD_NEON: {
|
||||
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
|
||||
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||
if ((size_t)left_side_bytes % 16 == (size_t)right_side_bytes % 16) {
|
||||
while (((size_t)left_side_bytes % 16) != 0 && left_side_bytes != end_address) {
|
||||
if (*left_side_bytes != *right_side_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -465,24 +468,24 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
}
|
||||
}
|
||||
|
||||
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
uint8x16_t simd_left_side_value = vdupq_n_u8(*left_side_bytes);
|
||||
uint8x16_t simd_right_side_value = vdupq_n_u8(*right_side_bytes);
|
||||
while (end_address - destination_bytes >= 16) {
|
||||
uint8x16_t simd_left_side_value = vdupq_n_u8(left_side_bytes);
|
||||
uint8x16_t simd_right_side_value = vdupq_n_u8(right_side_bytes);
|
||||
|
||||
uint8x16_t result = vceqq_u8(simd_left_side_value, simd_right_side_value);
|
||||
if (vmaxvq_u8(result) != 0xFF) {
|
||||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_128_WIDTH;
|
||||
right_side_bytes += SIMD_128_WIDTH;
|
||||
left_side_bytes += 16;
|
||||
right_side_bytes += 16
|
||||
}
|
||||
break;
|
||||
}
|
||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
||||
#else
|
||||
case HALLOCY_SIMD_AVX512: {
|
||||
if ((size_t)left_side_bytes % SIMD_512_WIDTH == (size_t)right_side_bytes % SIMD_512_WIDTH) {
|
||||
while (((size_t)left_side_bytes % SIMD_512_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||
if ((size_t)left_side_bytes % 64 == (size_t)right_side_bytes % 64) {
|
||||
while (((size_t)left_side_bytes % 64) != 0 && left_side_bytes != end_address) {
|
||||
if (*left_side_bytes != *right_side_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -491,7 +494,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
right_side_bytes += 1;
|
||||
}
|
||||
|
||||
while (left_side_bytes - end_address >= SIMD_512_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 64) {
|
||||
__m512i simd_left_side_value = _mm512_load_si512((__m512i*)left_side_bytes);
|
||||
__m512i simd_right_side_value = _mm512_load_si512((__m512i*)right_side_bytes);
|
||||
|
||||
|
|
@ -500,11 +503,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_512_WIDTH;
|
||||
right_side_bytes += SIMD_512_WIDTH;
|
||||
left_side_bytes += 64;
|
||||
right_side_bytes += 64;
|
||||
}
|
||||
} else {
|
||||
while (left_side_bytes - end_address >= SIMD_512_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 64) {
|
||||
__m512i simd_left_side_value = _mm512_loadu_si512((__m512i*)left_side_bytes);
|
||||
__m512i simd_right_side_value = _mm512_loadu_si512((__m512i*)right_side_bytes);
|
||||
|
||||
|
|
@ -513,8 +516,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_512_WIDTH;
|
||||
right_side_bytes += SIMD_512_WIDTH;
|
||||
left_side_bytes += 64;
|
||||
right_side_bytes += 64;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
@ -522,8 +525,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
|
||||
case HALLOCY_SIMD_AVX2:
|
||||
case HALLOCY_SIMD_AVX: {
|
||||
if ((size_t)left_side_bytes % SIMD_256_WIDTH == (size_t)right_side_bytes % SIMD_256_WIDTH) {
|
||||
while (((size_t)left_side_bytes % SIMD_256_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||
if ((size_t)left_side_bytes % 32 == (size_t)right_side_bytes % 32) {
|
||||
while (((size_t)left_side_bytes % 32) != 0 && left_side_bytes != end_address) {
|
||||
if (*left_side_bytes != *right_side_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -532,7 +535,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
right_side_bytes += 1;
|
||||
}
|
||||
|
||||
while (left_side_bytes - end_address >= SIMD_256_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 32) {
|
||||
__m256i simd_left_side_value = _mm256_load_si256((__m256i*)left_side_bytes);
|
||||
__m256i simd_right_side_value = _mm256_load_si256((__m256i*)right_side_bytes);
|
||||
|
||||
|
|
@ -541,11 +544,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_256_WIDTH;
|
||||
right_side_bytes += SIMD_256_WIDTH;
|
||||
left_side_bytes += 32;
|
||||
right_side_bytes += 32;
|
||||
}
|
||||
} else {
|
||||
while (left_side_bytes - end_address >= SIMD_256_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 32) {
|
||||
__m256i simd_left_side_value = _mm256_loadu_si256((__m256i*)left_side_bytes);
|
||||
__m256i simd_right_side_value = _mm256_loadu_si256((__m256i*)right_side_bytes);
|
||||
|
||||
|
|
@ -554,8 +557,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_256_WIDTH;
|
||||
right_side_bytes += SIMD_256_WIDTH;
|
||||
left_side_bytes += 32;
|
||||
right_side_bytes += 32;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
@ -563,8 +566,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
|
||||
case HALLOCY_SIMD_SSE2:
|
||||
case HALLOCY_SIMD_SSE: {
|
||||
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
|
||||
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||
if ((size_t)left_side_bytes % 16 == (size_t)right_side_bytes % 16) {
|
||||
while (((size_t)left_side_bytes % 16) != 0 && left_side_bytes != end_address) {
|
||||
if (*left_side_bytes != *right_side_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -573,7 +576,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
right_side_bytes += 1;
|
||||
}
|
||||
|
||||
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 16) {
|
||||
__m128i simd_left_side_value = _mm_load_si128((__m128i*)left_side_bytes);
|
||||
__m128i simd_right_side_value = _mm_load_si128((__m128i*)right_side_bytes);
|
||||
|
||||
|
|
@ -582,11 +585,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_128_WIDTH;
|
||||
right_side_bytes += SIMD_128_WIDTH;
|
||||
left_side_bytes += 16;
|
||||
right_side_bytes += 16;
|
||||
}
|
||||
} else {
|
||||
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
|
||||
while (left_side_bytes - end_address >= 16) {
|
||||
__m128i simd_left_side_value = _mm_loadu_si128((__m128i*)left_side_bytes);
|
||||
__m128i simd_right_side_value = _mm_loadu_si128((__m128i*)right_side_bytes);
|
||||
|
||||
|
|
@ -595,8 +598,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
|||
return false;
|
||||
}
|
||||
|
||||
left_side_bytes += SIMD_128_WIDTH;
|
||||
right_side_bytes += SIMD_128_WIDTH;
|
||||
left_side_bytes += 16;
|
||||
right_side_bytes += 16;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -21,22 +21,20 @@
|
|||
*/
|
||||
#include "../../Include/Hallocy/Utils/Simd.h"
|
||||
|
||||
#ifdef LIN_NEON
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
|
||||
|
||||
HallocySimdType hallocy_is_simd_supported(void) {
|
||||
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
|
||||
HallocySimdType hallocy_is_simd_supported() {
|
||||
if (hallocy_supported_simd != HALLOCY_SIMD_UNDEFINED) {
|
||||
return hallocy_supported_simd;
|
||||
}
|
||||
|
||||
#if defined(WIN_NEON)
|
||||
#if defined(_MSC_VER)
|
||||
#if defined(_M_ARM64)
|
||||
if (isProcessorFeaturePresent(PF_ARM64_SVE)) {
|
||||
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
||||
return hallocy_supported_simd;
|
||||
}
|
||||
#elif defined(WIN_SIMD)
|
||||
#else
|
||||
int cpu_info[4] = { 0 };
|
||||
__cpuid(cpu_info, 7);
|
||||
if ((cpu_info[1] & (1 << 16)) != 0) {
|
||||
|
|
@ -65,12 +63,31 @@ HallocySimdType hallocy_is_simd_supported(void) {
|
|||
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
|
||||
return hallocy_supported_simd;
|
||||
}
|
||||
#elif defined(LIN_NEON)
|
||||
if (getauxval(16) & (1 << 12)) {
|
||||
#endif
|
||||
#else
|
||||
#if defined(__aarch64__) || defined(__arm__)
|
||||
int file_descriptor = open("/proc/cpuinfo", O_READONLY);
|
||||
if (file_descriptor == -1) {
|
||||
return hallocy_supported_simd;
|
||||
}
|
||||
|
||||
char buffer[256];
|
||||
int bytes_read = read(file_descriptor, buffer, sizeof(buffer));
|
||||
while (bytes_read > 0) {
|
||||
for (size_t i = 0; i < bytes_read - 4; i++) {
|
||||
if (buffer[i] == 'n' && buffer[i + 1] == 'e' && buffer[i + 2] == 'o' && buffer[i + 3] == 'n') {
|
||||
close(file_descriptor);
|
||||
|
||||
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
||||
return hallocy_supported_simd;
|
||||
}
|
||||
#elif defined(LIN_SIMD)
|
||||
}
|
||||
|
||||
bytes_read = read(file_descriptor, buffer, sizeof(buffer));
|
||||
}
|
||||
|
||||
close(file_descriptor);
|
||||
#else
|
||||
unsigned int a, b, c, d;
|
||||
__asm__ __volatile__ (
|
||||
"cpuid"
|
||||
|
|
@ -109,6 +126,7 @@ HallocySimdType hallocy_is_simd_supported(void) {
|
|||
return hallocy_supported_simd;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
hallocy_supported_simd = HALLOCY_SIMD_NONE;
|
||||
return hallocy_supported_simd;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue