Compare commits

..

19 commits
V1.0.0 ... main

Author SHA1 Message Date
9cef673a7e Merge pull request 'h7-fix-android-build' (#14) from h7-fix-android-build into main
Reviewed-on: #14
2025-10-16 17:25:34 -05:00
a5f43bce5a fix(android build): fixed checking for simd version 2025-10-16 12:07:15 -05:00
4b87134390 fix(android build): fixed mistake in while comparison for compare memory function 2025-10-16 11:55:41 -05:00
c0f40ecddf fix(android build): fixed typing of neon memory implementation 2025-10-16 11:53:49 -05:00
a53db34b9b fix(android build): added missing flag check for neon 2025-10-16 11:48:44 -05:00
19544228f2 fix(android build): made cmake file work with neon processors 2025-10-16 11:43:29 -05:00
5ab70241de fix(android build): made cmake file work with none simd processors 2025-10-16 11:17:38 -05:00
58ba6ee289 fix(android build): changed way that simd is detected at build and run time 2025-10-16 11:08:54 -05:00
061cb514fe fix(android build): fixed unsused includes 2025-10-16 10:33:50 -05:00
db9d974f5e Merge pull request 'h6-linting' (#12) from h6-linting into main
Reviewed-on: #12
2025-10-05 10:32:53 -05:00
945137ee09 refactor(linting): removed const in header files for function definitions 2025-10-05 10:28:59 -05:00
afcdff4c17 refactor(linting): moved page_size into function where it is used 2025-10-05 10:27:54 -05:00
c642e75f1d chore(linting): removed .vscode from gitignore 2025-10-05 10:18:48 -05:00
4e545e649a refactor(linting): added type casting for safety 2025-10-05 10:06:29 -05:00
73eeb4ef70 refactor(linting): removed magical numbers from memory functions 2025-10-05 10:02:50 -05:00
7e2b9e5045 chore(linting): added vscode settings folder to gitignore 2025-10-05 09:45:54 -05:00
b985a99618 feat(linting): added clang tidy 2025-10-05 09:42:31 -05:00
d497356835 refactor(linting): fixed warnings given by clang 2025-10-05 09:40:33 -05:00
8359efa2bc chore(gitignore): added cache and compile commands to ignore 2025-10-05 09:38:38 -05:00
10 changed files with 239 additions and 240 deletions

11
.clang-tidy Normal file
View file

@ -0,0 +1,11 @@
Checks: >
clang-analyzer-*,
bugprone-*,
performance-*,
readability-*,
portability-*,
cppcoreguidelines-*,
-clang-analyzer-osx*,
-cppcoreguidelines-pro-type-vararg
WarningsAsErrors: ''
FormatStyle: file

4
.gitignore vendored
View file

@ -1 +1,3 @@
Build
Build
compile_commands.json
.cache

View file

@ -16,11 +16,13 @@ target_link_libraries(HallocyTest Hallocy)
if (MSVC)
target_compile_options(Hallocy PRIVATE /W4 /Zl)
else()
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl)
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl)
target_compile_options(Hallocy PRIVATE -march=native)
target_compile_options(HallocyTest PRIVATE -march=native)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl -march=native)
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl -march=native)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
target_compile_options(Hallocy PRIVATE -mfpu=neon)
target_compile_options(HallocyTest PRIVATE -mfpu=neon)
endif()
target_compile_options(Hallocy PRIVATE -Wall -Wextra -pedantic)
endif()

View file

@ -28,11 +28,11 @@
#include "../Utils/Error.h"
void *hallocy_allocate(const size_t size, const bool zero_memory);
void *hallocy_allocate(size_t size, bool zero_memory);
static inline void *hallocy_malloc(const size_t size) { return hallocy_allocate(size, false); }
static inline void *hallocy_calloc(const size_t size, size_t count) { return hallocy_allocate(size * count, true); }
void *hallocy_realloc(void *memory_pointer, const size_t size);
void *hallocy_realloc(void *memory_pointer, size_t size);
HallocyError hallocy_free(void *pointer);
#endif
#endif

View file

@ -28,9 +28,9 @@
#include "../Utils/Error.h"
HallocyError hallocy_set_memory(void *destination, int value, const size_t size);
HallocyError hallocy_copy_memory(void *destination, void *source, const size_t size);
HallocyError hallocy_move_memory(void *destination, void *source, const size_t size);
bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size);
HallocyError hallocy_set_memory(void *destination, int value, size_t size);
HallocyError hallocy_copy_memory(void *destination, void *source, size_t size);
HallocyError hallocy_move_memory(void *destination, void *source, size_t size);
bool hallocy_compare_memory(void *left_side, void *right_side, size_t size);
#endif
#endif

View file

@ -31,4 +31,4 @@ typedef enum {
HALLOCY_ERROR_INVALID_PARAM = 4
} HallocyError;
#endif
#endif

View file

@ -23,22 +23,27 @@
#ifndef HALLOCY_SIMD
#define HALLOCY_SIMD
#if defined(_MSC_VER)
#if defined(_M_ARM64)
#include <arm64intr.h>
#else
#include <intrin.h>
#endif
#else
#if defined(__aarch64__)
#include <arm64intr.h>
#elif defined(__arm__)
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
#define WIN_NEON
#include <arm_neon.h>
#else
#elif defined(_MSC_VER)
#define WIN_SIMD
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#define LIN_SIMD
#include <immintrin.h>
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
#define LIN_NEON
#include <arm_neon.h>
#else
#warning "SIMD is unsupported by this architecture or compiler (only x86/x64/ARM/ARM64 supported)."
#endif
#define SIMD_64_WIDTH 8
#define SIMD_128_WIDTH 16
#define SIMD_256_WIDTH 32
#define SIMD_512_WIDTH 64
typedef enum {
HALLOCY_SIMD_UNDEFINED = 0,
HALLOCY_SIMD_NONE = 1,
@ -50,6 +55,6 @@ typedef enum {
HALLOCY_SIMD_NEON = 7
} HallocySimdType;
HallocySimdType hallocy_is_simd_supported();
HallocySimdType hallocy_is_simd_supported(void);
#endif
#endif

View file

@ -55,7 +55,6 @@ static _Thread_local size_t hallocy_small_memory_freed = 0;
static _Thread_local size_t hallocy_small_memory_allocated = 0;
static _Thread_local HallocyMemoryHeader *hallocy_small_memory_bin = NULL;
static size_t page_size = 0;
static size_t hallocy_small_allocation_size = 0;
static size_t hallocy_medium_allocation_size = 0;
@ -70,6 +69,7 @@ static BOOL CALLBACK hallocy_initialize_mutex(PINIT_ONCE init_once, PVOID parame
#endif
void *hallocy_allocate(const size_t size, const bool zero_memory) {
static size_t page_size = 0;
if (page_size == 0) {
#if defined(_WIN32)
SYSTEM_INFO system_info;
@ -431,4 +431,4 @@ HallocyError hallocy_free(void *pointer) {
}
return HALLOCY_ERROR_NONE;
}
}

View file

@ -23,9 +23,6 @@
#include "../../Include/Hallocy/Core/Memory.h"
#include "../../Include/Hallocy/Utils/Simd.h"
#include <immintrin.h>
#include <stddef.h>
HallocyError hallocy_set_memory(void *destination, int value, const size_t size) {
if (destination == NULL) {
return HALLOCY_ERROR_INVALID_POINTER;
@ -37,46 +34,46 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
unsigned char value_bytes = (unsigned char)value;
switch (hallocy_is_simd_supported()) {
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
#if defined(LIN_NEON) || defined(WIN_NEON)
case HALLOCY_SIMD_NEON: {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = value_bytes;
destination_bytes += 1;
}
uint8x16_t simd_value = vdupq_n_u8(value_bytes);
while (destination_bytes - end_address >= 16) {
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
vst1q_u8(destination_bytes, simd_value);
destination_bytes += 16;
destination_bytes += SIMD_128_WIDTH;
}
break;
}
#else
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
case HALLOCY_SIMD_AVX512: {
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = value_bytes;
destination_bytes += 1;
}
__m512i simd_value = _mm512_set1_epi8(value_bytes);
while (destination_bytes - end_address >= 64) {
__m512i simd_value = _mm512_set1_epi8((char)value_bytes);
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
destination_bytes += 64;
destination_bytes += SIMD_512_WIDTH;
}
break;
}
case HALLOCY_SIMD_AVX2: {
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = value_bytes;
destination_bytes += 1;
}
__m256i simd_value = _mm256_set1_epi8(value_bytes);
while (destination_bytes - end_address >= 32) {
__m256i simd_value = _mm256_set1_epi8((char)value_bytes);
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
destination_bytes += 32;
destination_bytes += SIMD_256_WIDTH;
}
break;
@ -85,15 +82,15 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
case HALLOCY_SIMD_AVX:
case HALLOCY_SIMD_SSE2:
case HALLOCY_SIMD_SSE: {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = value_bytes;
destination_bytes += 1;
}
__m128i simd_value = _mm_set1_epi8(value_bytes);
while (destination_bytes - end_address >= 16) {
__m128i simd_value = _mm_set1_epi8((char)value_bytes);
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
_mm_store_si128((__m128i*)destination_bytes, simd_value);
destination_bytes += 16;
destination_bytes += SIMD_128_WIDTH;
}
break;
@ -108,7 +105,7 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
size_t value_word = 0;
for (size_t i = 0; i < word_size; i++) {
value_word |= (size_t)value_bytes << (i * 8);
value_word |= (size_t)value_bytes << (i * SIMD_64_WIDTH);
}
size_t *destination_word = (size_t*)destination_bytes;
@ -141,10 +138,10 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
unsigned char *end_address = destination_bytes + size;
switch (hallocy_is_simd_supported()) {
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
#if defined(LIN_NEON) || defined(WIN_NEON)
case HALLOCY_SIMD_NEON: {
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 16) {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = *source_bytes;
destination_bytes += 1;
source_bytes += 1;
@ -152,39 +149,39 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
}
uint8x16_t simd_value;
while (destination_bytes - end_address >= 16) {
simd_value = vdupq_n_u8(source_bytes);
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
simd_value = vdupq_n_u8(*source_bytes);
vst1q_u8(destination_bytes, simd_value);
destination_bytes += 16;
source_bytes += 16;
destination_bytes += SIMD_128_WIDTH;
source_bytes += SIMD_128_WIDTH;
}
break;
}
#else
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
case HALLOCY_SIMD_AVX512: {
if ((size_t)destination_bytes % 64 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = *source_bytes;
destination_bytes += 1;
source_bytes += 1;
}
__m512i simd_value;
while (destination_bytes - end_address >= 64) {
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
simd_value = _mm512_load_si512((__m512i*)source_bytes);
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
destination_bytes += 64;
source_bytes += 64;
destination_bytes += SIMD_512_WIDTH;
source_bytes += SIMD_512_WIDTH;
}
} else {
__m512i simd_value;
while (destination_bytes - end_address >= 64) {
while (destination_bytes - end_address >= SIMD_512_WIDTH) {
simd_value = _mm512_loadu_si512((__m512i*)source_bytes);
_mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
destination_bytes += 64;
source_bytes += 64;
destination_bytes += SIMD_512_WIDTH;
source_bytes += SIMD_512_WIDTH;
}
}
break;
@ -192,29 +189,29 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
case HALLOCY_SIMD_AVX2:
case HALLOCY_SIMD_AVX: {
if ((size_t)destination_bytes % 32 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_256_WIDTH) {
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = *source_bytes;
destination_bytes += 1;
source_bytes += 1;
}
__m256i simd_value;
while (destination_bytes - end_address >= 32) {
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
simd_value = _mm256_load_si256((__m256i*)source_bytes);
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
destination_bytes += 32;
source_bytes += 32;
destination_bytes += SIMD_256_WIDTH;
source_bytes += SIMD_256_WIDTH;
}
} else {
__m256i simd_value;
while (destination_bytes - end_address >= 32) {
while (destination_bytes - end_address >= SIMD_256_WIDTH) {
simd_value = _mm256_loadu_si256((__m256i*)source_bytes);
_mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
destination_bytes += 32;
source_bytes += 32;
destination_bytes += SIMD_256_WIDTH;
source_bytes += SIMD_256_WIDTH;
}
}
break;
@ -222,29 +219,29 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
case HALLOCY_SIMD_SSE2:
case HALLOCY_SIMD_SSE: {
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
*destination_bytes = *source_bytes;
destination_bytes += 1;
source_bytes += 1;
}
__m128i simd_value;
while (destination_bytes - end_address >= 16) {
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
simd_value = _mm_load_si128((__m128i*)source_bytes);
_mm_store_si128((__m128i*)destination_bytes, simd_value);
destination_bytes += 16;
source_bytes += 16;
destination_bytes += SIMD_128_WIDTH;
source_bytes += SIMD_128_WIDTH;
}
} else {
__m128i simd_value;
while (destination_bytes - end_address >= 16) {
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
simd_value = _mm_loadu_si128((__m128i*)source_bytes);
_mm_storeu_si128((__m128i*)destination_bytes, simd_value);
destination_bytes += 16;
source_bytes += 16;
destination_bytes += SIMD_128_WIDTH;
source_bytes += SIMD_128_WIDTH;
}
}
break;
@ -299,10 +296,10 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
unsigned char *source_bytes = (unsigned char*)source + size;
switch (hallocy_is_simd_supported()) {
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
#if defined(LIN_NEON) || defined(WIN_NEON)
case HALLOCY_SIMD_NEON: {
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 16) {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
destination_bytes -= 1;
source_bytes -= 1;
*destination_bytes = *source_bytes;
@ -310,37 +307,37 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
}
uint8x16_t simd_value;
while (end_address - destination_bytes >= 16) {
destination_bytes -= 16;
source_bytes -= 16;
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
destination_bytes -= SIMD_128_WIDTH;
source_bytes -= SIMD_128_WIDTH;
simd_value = vdupq_n_u8(source_bytes);
simd_value = vdupq_n_u8(*source_bytes);
vst1q_u8(destination_bytes, simd_value);
}
break;
}
#else
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
case HALLOCY_SIMD_AVX512: {
if ((size_t)destination_bytes % 64 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 64) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
destination_bytes -= 1;
source_bytes -= 1;
*destination_bytes = *source_bytes;
}
__m512i simd_value;
while (end_address - destination_bytes >= 64) {
destination_bytes -= 64;
source_bytes -= 64;
while (end_address - destination_bytes >= SIMD_512_WIDTH) {
destination_bytes -= SIMD_512_WIDTH;
source_bytes -= SIMD_512_WIDTH;
simd_value = _mm512_load_si512((__m512i*)source_bytes);
_mm512_store_si512((__m512i*)destination_bytes, simd_value);
}
} else {
__m512i simd_value;
while (end_address - destination_bytes >= 64) {
destination_bytes -= 64;
source_bytes -= 64;
while (end_address - destination_bytes >= SIMD_512_WIDTH) {
destination_bytes -= SIMD_512_WIDTH;
source_bytes -= SIMD_512_WIDTH;
simd_value = _mm512_loadu_si512((__m512i*)source_bytes);
_mm512_storeu_si512((__m512i*)destination_bytes, simd_value);
@ -351,8 +348,8 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
case HALLOCY_SIMD_AVX2:
case HALLOCY_SIMD_AVX: {
if ((size_t)destination_bytes % 32 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 32) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_256_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
while (((size_t)destination_bytes % SIMD_256_WIDTH) != 0 && destination_bytes != end_address) {
destination_bytes -= 1;
source_bytes -= 1;
@ -360,18 +357,18 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
}
__m256i simd_value;
while (end_address - destination_bytes >= 32) {
destination_bytes -= 32;
source_bytes -= 32;
while (end_address - destination_bytes >= SIMD_256_WIDTH) {
destination_bytes -= SIMD_256_WIDTH;
source_bytes -= SIMD_256_WIDTH;
simd_value = _mm256_load_si256((__m256i*)source_bytes);
_mm256_store_si256((__m256i*)destination_bytes, simd_value);
}
} else {
__m256i simd_value;
while (end_address - destination_bytes >= 32) {
destination_bytes -= 32;
source_bytes -= 32;
while (end_address - destination_bytes >= SIMD_256_WIDTH) {
destination_bytes -= SIMD_256_WIDTH;
source_bytes -= SIMD_256_WIDTH;
simd_value = _mm256_loadu_si256((__m256i*)source_bytes);
_mm256_storeu_si256((__m256i*)destination_bytes, simd_value);
@ -382,26 +379,26 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
case HALLOCY_SIMD_SSE2:
case HALLOCY_SIMD_SSE: {
if ((size_t)destination_bytes % 16 == (size_t)source_bytes % 64) {
while (((size_t)destination_bytes % 16) != 0 && destination_bytes != end_address) {
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
destination_bytes -= 1;
source_bytes -= 1;
*destination_bytes = *source_bytes;
}
__m128i simd_value;
while (end_address - destination_bytes >= 16) {
destination_bytes -= 16;
source_bytes -= 16;
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
destination_bytes -= SIMD_128_WIDTH;
source_bytes -= SIMD_128_WIDTH;
simd_value = _mm_load_si128((__m128i*)source_bytes);
_mm_store_si128((__m128i*)destination_bytes, simd_value);
}
} else {
__m128i simd_value;
while (end_address - destination_bytes >= 16) {
destination_bytes -= 16;
source_bytes -= 16;
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
destination_bytes -= SIMD_128_WIDTH;
source_bytes -= SIMD_128_WIDTH;
simd_value = _mm_loadu_si128((__m128i*)source_bytes);
_mm_storeu_si128((__m128i*)destination_bytes, simd_value);
@ -455,10 +452,10 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
unsigned char *end_address = left_side_bytes + size;
switch (hallocy_is_simd_supported()) {
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
#if defined(LIN_NEON) || defined(WIN_NEON)
case HALLOCY_SIMD_NEON: {
if ((size_t)left_side_bytes % 16 == (size_t)right_side_bytes % 16) {
while (((size_t)left_side_bytes % 16) != 0 && left_side_bytes != end_address) {
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
if (*left_side_bytes != *right_side_bytes) {
return false;
}
@ -468,24 +465,24 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
}
}
while (end_address - destination_bytes >= 16) {
uint8x16_t simd_left_side_value = vdupq_n_u8(left_side_bytes);
uint8x16_t simd_right_side_value = vdupq_n_u8(right_side_bytes);
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
uint8x16_t simd_left_side_value = vdupq_n_u8(*left_side_bytes);
uint8x16_t simd_right_side_value = vdupq_n_u8(*right_side_bytes);
uint8x16_t result = vceqq_u8(simd_left_side_value, simd_right_side_value);
if (vmaxvq_u8(result) != 0xFF) {
return false;
}
left_side_bytes += 16;
right_side_bytes += 16
left_side_bytes += SIMD_128_WIDTH;
right_side_bytes += SIMD_128_WIDTH;
}
break;
}
#else
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
case HALLOCY_SIMD_AVX512: {
if ((size_t)left_side_bytes % 64 == (size_t)right_side_bytes % 64) {
while (((size_t)left_side_bytes % 64) != 0 && left_side_bytes != end_address) {
if ((size_t)left_side_bytes % SIMD_512_WIDTH == (size_t)right_side_bytes % SIMD_512_WIDTH) {
while (((size_t)left_side_bytes % SIMD_512_WIDTH) != 0 && left_side_bytes != end_address) {
if (*left_side_bytes != *right_side_bytes) {
return false;
}
@ -494,7 +491,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
right_side_bytes += 1;
}
while (left_side_bytes - end_address >= 64) {
while (left_side_bytes - end_address >= SIMD_512_WIDTH) {
__m512i simd_left_side_value = _mm512_load_si512((__m512i*)left_side_bytes);
__m512i simd_right_side_value = _mm512_load_si512((__m512i*)right_side_bytes);
@ -503,11 +500,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 64;
right_side_bytes += 64;
left_side_bytes += SIMD_512_WIDTH;
right_side_bytes += SIMD_512_WIDTH;
}
} else {
while (left_side_bytes - end_address >= 64) {
while (left_side_bytes - end_address >= SIMD_512_WIDTH) {
__m512i simd_left_side_value = _mm512_loadu_si512((__m512i*)left_side_bytes);
__m512i simd_right_side_value = _mm512_loadu_si512((__m512i*)right_side_bytes);
@ -516,8 +513,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 64;
right_side_bytes += 64;
left_side_bytes += SIMD_512_WIDTH;
right_side_bytes += SIMD_512_WIDTH;
}
}
break;
@ -525,8 +522,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
case HALLOCY_SIMD_AVX2:
case HALLOCY_SIMD_AVX: {
if ((size_t)left_side_bytes % 32 == (size_t)right_side_bytes % 32) {
while (((size_t)left_side_bytes % 32) != 0 && left_side_bytes != end_address) {
if ((size_t)left_side_bytes % SIMD_256_WIDTH == (size_t)right_side_bytes % SIMD_256_WIDTH) {
while (((size_t)left_side_bytes % SIMD_256_WIDTH) != 0 && left_side_bytes != end_address) {
if (*left_side_bytes != *right_side_bytes) {
return false;
}
@ -535,7 +532,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
right_side_bytes += 1;
}
while (left_side_bytes - end_address >= 32) {
while (left_side_bytes - end_address >= SIMD_256_WIDTH) {
__m256i simd_left_side_value = _mm256_load_si256((__m256i*)left_side_bytes);
__m256i simd_right_side_value = _mm256_load_si256((__m256i*)right_side_bytes);
@ -544,11 +541,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 32;
right_side_bytes += 32;
left_side_bytes += SIMD_256_WIDTH;
right_side_bytes += SIMD_256_WIDTH;
}
} else {
while (left_side_bytes - end_address >= 32) {
while (left_side_bytes - end_address >= SIMD_256_WIDTH) {
__m256i simd_left_side_value = _mm256_loadu_si256((__m256i*)left_side_bytes);
__m256i simd_right_side_value = _mm256_loadu_si256((__m256i*)right_side_bytes);
@ -557,8 +554,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 32;
right_side_bytes += 32;
left_side_bytes += SIMD_256_WIDTH;
right_side_bytes += SIMD_256_WIDTH;
}
}
break;
@ -566,8 +563,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
case HALLOCY_SIMD_SSE2:
case HALLOCY_SIMD_SSE: {
if ((size_t)left_side_bytes % 16 == (size_t)right_side_bytes % 16) {
while (((size_t)left_side_bytes % 16) != 0 && left_side_bytes != end_address) {
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
if (*left_side_bytes != *right_side_bytes) {
return false;
}
@ -576,7 +573,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
right_side_bytes += 1;
}
while (left_side_bytes - end_address >= 16) {
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
__m128i simd_left_side_value = _mm_load_si128((__m128i*)left_side_bytes);
__m128i simd_right_side_value = _mm_load_si128((__m128i*)right_side_bytes);
@ -585,11 +582,11 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 16;
right_side_bytes += 16;
left_side_bytes += SIMD_128_WIDTH;
right_side_bytes += SIMD_128_WIDTH;
}
} else {
while (left_side_bytes - end_address >= 16) {
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
__m128i simd_left_side_value = _mm_loadu_si128((__m128i*)left_side_bytes);
__m128i simd_right_side_value = _mm_loadu_si128((__m128i*)right_side_bytes);
@ -598,8 +595,8 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
return false;
}
left_side_bytes += 16;
right_side_bytes += 16;
left_side_bytes += SIMD_128_WIDTH;
right_side_bytes += SIMD_128_WIDTH;
}
}
break;
@ -643,4 +640,4 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
}
return true;
}
}

View file

@ -21,113 +21,95 @@
*/
#include "../../Include/Hallocy/Utils/Simd.h"
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
#ifdef LIN_NEON
#include <sys/auxv.h>
#endif
HallocySimdType hallocy_is_simd_supported() {
HallocySimdType hallocy_is_simd_supported(void) {
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
if (hallocy_supported_simd != HALLOCY_SIMD_UNDEFINED) {
return hallocy_supported_simd;
}
#if defined(_MSC_VER)
#if defined(_M_ARM64)
if (isProcessorFeaturePresent(PF_ARM64_SVE)) {
hallocy_supported_simd = HALLOCY_SIMD_NEON;
return hallocy_supported_simd;
}
#else
int cpu_info[4] = { 0 };
__cpuid(cpu_info, 7);
if ((cpu_info[1] & (1 << 16)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX512;
return hallocy_supported_simd;
}
#if defined(WIN_NEON)
if (isProcessorFeaturePresent(PF_ARM64_SVE)) {
hallocy_supported_simd = HALLOCY_SIMD_NEON;
return hallocy_supported_simd;
}
#elif defined(WIN_SIMD)
int cpu_info[4] = { 0 };
__cpuid(cpu_info, 7);
if ((cpu_info[1] & (1 << 16)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX512;
return hallocy_supported_simd;
}
if ((cpu_info[1] & (1 << 5)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX2;
return hallocy_supported_simd;
}
if ((cpu_info[1] & (1 << 5)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX2;
return hallocy_supported_simd;
}
__cpuid(cpu_info, 1);
__cpuid(cpu_info, 1);
if ((cpu_info[2] & (1 << 28)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX;
return hallocy_supported_simd;
}
if ((cpu_info[2] & (1 << 28)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX;
return hallocy_supported_simd;
}
if ((cpu_info[3] & (1 << 26)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
if ((cpu_info[3] & (1 << 26)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
if ((cpu_info[3] & (1 << 25)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
#endif
#else
#if defined(__aarch64__) || defined(__arm__)
int file_descriptor = open("/proc/cpuinfo", O_READONLY);
if (file_descriptor == -1) {
return hallocy_supported_simd;
}
if ((cpu_info[3] & (1 << 25)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
#elif defined(LIN_NEON)
if (getauxval(16) & (1 << 12)) {
hallocy_supported_simd = HALLOCY_SIMD_NEON;
return hallocy_supported_simd;
}
#elif defined(LIN_SIMD)
unsigned int a, b, c, d;
__asm__ __volatile__ (
"cpuid"
: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
: "a" (7)
);
char buffer[256];
int bytes_read = read(file_descriptor, buffer, sizeof(buffer));
while (bytes_read > 0) {
for (size_t i = 0; i < bytes_read - 4; i++) {
if (buffer[i] == 'n' && buffer[i + 1] == 'e' && buffer[i + 2] == 'o' && buffer[i + 3] == 'n') {
close(file_descriptor);
if ((b & (1 << 16)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX512;
return hallocy_supported_simd;
}
hallocy_supported_simd = HALLOCY_SIMD_NEON;
return hallocy_supported_simd;
}
}
if ((b & (1 << 5)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX2;
return hallocy_supported_simd;
}
bytes_read = read(file_descriptor, buffer, sizeof(buffer));
}
__asm__ __volatile__ (
"cpuid"
: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
: "a" (1)
);
close(file_descriptor);
#else
unsigned int a, b, c, d;
__asm__ __volatile__ (
"cpuid"
: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
: "a" (7)
);
if ((c & (1 << 28)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX;
return hallocy_supported_simd;
}
if ((b & (1 << 16)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX512;
return hallocy_supported_simd;
}
if ((c & (1 << 26)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
if ((b & (1 << 5)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX2;
return hallocy_supported_simd;
}
__asm__ __volatile__ (
"cpuid"
: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
: "a" (1)
);
if ((c & (1 << 28)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_AVX;
return hallocy_supported_simd;
}
if ((c & (1 << 26)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
return hallocy_supported_simd;
}
if ((c & (1 << 25)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE;
return hallocy_supported_simd;
}
#endif
if ((c & (1 << 25)) != 0) {
hallocy_supported_simd = HALLOCY_SIMD_SSE;
return hallocy_supported_simd;
}
#endif
hallocy_supported_simd = HALLOCY_SIMD_NONE;
return hallocy_supported_simd;
}
}