Compare commits
No commits in common. "9cef673a7e0d750f9f8233a6ddac478684c1e12c" and "db9d974f5e787ce92c8ddccf1453d86b32ec641e" have entirely different histories.
9cef673a7e
...
db9d974f5e
4 changed files with 120 additions and 102 deletions
|
|
@ -16,13 +16,11 @@ target_link_libraries(HallocyTest Hallocy)
|
||||||
if (MSVC)
|
if (MSVC)
|
||||||
target_compile_options(Hallocy PRIVATE /W4 /Zl)
|
target_compile_options(Hallocy PRIVATE /W4 /Zl)
|
||||||
else()
|
else()
|
||||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl)
|
||||||
target_compile_options(Hallocy PRIVATE -mavx512f -mavx512vl -march=native)
|
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl)
|
||||||
target_compile_options(HallocyTest PRIVATE -mavx512f -mavx512vl -march=native)
|
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
target_compile_options(Hallocy PRIVATE -march=native)
|
||||||
target_compile_options(Hallocy PRIVATE -mfpu=neon)
|
target_compile_options(HallocyTest PRIVATE -march=native)
|
||||||
target_compile_options(HallocyTest PRIVATE -mfpu=neon)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
target_compile_options(Hallocy PRIVATE -Wall -Wextra -pedantic)
|
target_compile_options(Hallocy PRIVATE -Wall -Wextra -pedantic)
|
||||||
endif()
|
endif()
|
||||||
|
|
@ -23,20 +23,20 @@
|
||||||
#ifndef HALLOCY_SIMD
|
#ifndef HALLOCY_SIMD
|
||||||
#define HALLOCY_SIMD
|
#define HALLOCY_SIMD
|
||||||
|
|
||||||
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM))
|
#if defined(_MSC_VER)
|
||||||
#define WIN_NEON
|
#if defined(_M_ARM64)
|
||||||
#include <arm_neon.h>
|
#include <arm64intr.h>
|
||||||
#elif defined(_MSC_VER)
|
#else
|
||||||
#define WIN_SIMD
|
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
#endif
|
||||||
#define LIN_SIMD
|
|
||||||
#include <immintrin.h>
|
|
||||||
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
|
||||||
#define LIN_NEON
|
|
||||||
#include <arm_neon.h>
|
|
||||||
#else
|
#else
|
||||||
#warning "SIMD is unsupported by this architecture or compiler (only x86/x64/ARM/ARM64 supported)."
|
#if defined(__aarch64__)
|
||||||
|
#include <arm64intr.h>
|
||||||
|
#elif defined(__arm__)
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#else
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define SIMD_64_WIDTH 8
|
#define SIMD_64_WIDTH 8
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,9 @@
|
||||||
#include "../../Include/Hallocy/Core/Memory.h"
|
#include "../../Include/Hallocy/Core/Memory.h"
|
||||||
#include "../../Include/Hallocy/Utils/Simd.h"
|
#include "../../Include/Hallocy/Utils/Simd.h"
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
HallocyError hallocy_set_memory(void *destination, int value, const size_t size) {
|
HallocyError hallocy_set_memory(void *destination, int value, const size_t size) {
|
||||||
if (destination == NULL) {
|
if (destination == NULL) {
|
||||||
return HALLOCY_ERROR_INVALID_POINTER;
|
return HALLOCY_ERROR_INVALID_POINTER;
|
||||||
|
|
@ -34,7 +37,7 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
|
||||||
unsigned char value_bytes = (unsigned char)value;
|
unsigned char value_bytes = (unsigned char)value;
|
||||||
|
|
||||||
switch (hallocy_is_simd_supported()) {
|
switch (hallocy_is_simd_supported()) {
|
||||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||||
case HALLOCY_SIMD_NEON: {
|
case HALLOCY_SIMD_NEON: {
|
||||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
*destination_bytes = value_bytes;
|
*destination_bytes = value_bytes;
|
||||||
|
|
@ -48,7 +51,7 @@ HallocyError hallocy_set_memory(void *destination, int value, const size_t size)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
#else
|
||||||
case HALLOCY_SIMD_AVX512: {
|
case HALLOCY_SIMD_AVX512: {
|
||||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
*destination_bytes = value_bytes;
|
*destination_bytes = value_bytes;
|
||||||
|
|
@ -138,7 +141,7 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
||||||
unsigned char *end_address = destination_bytes + size;
|
unsigned char *end_address = destination_bytes + size;
|
||||||
|
|
||||||
switch (hallocy_is_simd_supported()) {
|
switch (hallocy_is_simd_supported()) {
|
||||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||||
case HALLOCY_SIMD_NEON: {
|
case HALLOCY_SIMD_NEON: {
|
||||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
||||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
|
|
@ -150,14 +153,14 @@ HallocyError hallocy_copy_memory(void *destination, void *source, const size_t s
|
||||||
|
|
||||||
uint8x16_t simd_value;
|
uint8x16_t simd_value;
|
||||||
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
while (destination_bytes - end_address >= SIMD_128_WIDTH) {
|
||||||
simd_value = vdupq_n_u8(*source_bytes);
|
simd_value = vdupq_n_u8(source_bytes);
|
||||||
vst1q_u8(destination_bytes, simd_value);
|
vst1q_u8(destination_bytes, simd_value);
|
||||||
destination_bytes += SIMD_128_WIDTH;
|
destination_bytes += SIMD_128_WIDTH;
|
||||||
source_bytes += SIMD_128_WIDTH;
|
source_bytes += SIMD_128_WIDTH;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
#else
|
||||||
case HALLOCY_SIMD_AVX512: {
|
case HALLOCY_SIMD_AVX512: {
|
||||||
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
|
|
@ -296,7 +299,7 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
||||||
unsigned char *source_bytes = (unsigned char*)source + size;
|
unsigned char *source_bytes = (unsigned char*)source + size;
|
||||||
|
|
||||||
switch (hallocy_is_simd_supported()) {
|
switch (hallocy_is_simd_supported()) {
|
||||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||||
case HALLOCY_SIMD_NEON: {
|
case HALLOCY_SIMD_NEON: {
|
||||||
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
if ((size_t)destination_bytes % SIMD_128_WIDTH == (size_t)source_bytes % SIMD_128_WIDTH) {
|
||||||
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_128_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
|
|
@ -311,12 +314,12 @@ HallocyError hallocy_move_memory(void *destination, void *source, const size_t s
|
||||||
destination_bytes -= SIMD_128_WIDTH;
|
destination_bytes -= SIMD_128_WIDTH;
|
||||||
source_bytes -= SIMD_128_WIDTH;
|
source_bytes -= SIMD_128_WIDTH;
|
||||||
|
|
||||||
simd_value = vdupq_n_u8(*source_bytes);
|
simd_value = vdupq_n_u8(source_bytes);
|
||||||
vst1q_u8(destination_bytes, simd_value);
|
vst1q_u8(destination_bytes, simd_value);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
#else
|
||||||
case HALLOCY_SIMD_AVX512: {
|
case HALLOCY_SIMD_AVX512: {
|
||||||
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
if ((size_t)destination_bytes % SIMD_512_WIDTH == (size_t)source_bytes % SIMD_512_WIDTH) {
|
||||||
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
while (((size_t)destination_bytes % SIMD_512_WIDTH) != 0 && destination_bytes != end_address) {
|
||||||
|
|
@ -452,7 +455,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
||||||
unsigned char *end_address = left_side_bytes + size;
|
unsigned char *end_address = left_side_bytes + size;
|
||||||
|
|
||||||
switch (hallocy_is_simd_supported()) {
|
switch (hallocy_is_simd_supported()) {
|
||||||
#if defined(LIN_NEON) || defined(WIN_NEON)
|
#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm__)
|
||||||
case HALLOCY_SIMD_NEON: {
|
case HALLOCY_SIMD_NEON: {
|
||||||
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
|
if ((size_t)left_side_bytes % SIMD_128_WIDTH == (size_t)right_side_bytes % SIMD_128_WIDTH) {
|
||||||
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
|
while (((size_t)left_side_bytes % SIMD_128_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||||
|
|
@ -465,9 +468,9 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while (left_side_bytes - end_address >= SIMD_128_WIDTH) {
|
while (end_address - destination_bytes >= SIMD_128_WIDTH) {
|
||||||
uint8x16_t simd_left_side_value = vdupq_n_u8(*left_side_bytes);
|
uint8x16_t simd_left_side_value = vdupq_n_u8(left_side_bytes);
|
||||||
uint8x16_t simd_right_side_value = vdupq_n_u8(*right_side_bytes);
|
uint8x16_t simd_right_side_value = vdupq_n_u8(right_side_bytes);
|
||||||
|
|
||||||
uint8x16_t result = vceqq_u8(simd_left_side_value, simd_right_side_value);
|
uint8x16_t result = vceqq_u8(simd_left_side_value, simd_right_side_value);
|
||||||
if (vmaxvq_u8(result) != 0xFF) {
|
if (vmaxvq_u8(result) != 0xFF) {
|
||||||
|
|
@ -479,7 +482,7 @@ bool hallocy_compare_memory(void *left_side, void *right_side, const size_t size
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_SIMD) || defined(WIN_SIMD)
|
#else
|
||||||
case HALLOCY_SIMD_AVX512: {
|
case HALLOCY_SIMD_AVX512: {
|
||||||
if ((size_t)left_side_bytes % SIMD_512_WIDTH == (size_t)right_side_bytes % SIMD_512_WIDTH) {
|
if ((size_t)left_side_bytes % SIMD_512_WIDTH == (size_t)right_side_bytes % SIMD_512_WIDTH) {
|
||||||
while (((size_t)left_side_bytes % SIMD_512_WIDTH) != 0 && left_side_bytes != end_address) {
|
while (((size_t)left_side_bytes % SIMD_512_WIDTH) != 0 && left_side_bytes != end_address) {
|
||||||
|
|
|
||||||
|
|
@ -21,22 +21,19 @@
|
||||||
*/
|
*/
|
||||||
#include "../../Include/Hallocy/Utils/Simd.h"
|
#include "../../Include/Hallocy/Utils/Simd.h"
|
||||||
|
|
||||||
#ifdef LIN_NEON
|
|
||||||
#include <sys/auxv.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
HallocySimdType hallocy_is_simd_supported(void) {
|
HallocySimdType hallocy_is_simd_supported(void) {
|
||||||
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
|
static HallocySimdType hallocy_supported_simd = HALLOCY_SIMD_UNDEFINED;
|
||||||
if (hallocy_supported_simd != HALLOCY_SIMD_UNDEFINED) {
|
if (hallocy_supported_simd != HALLOCY_SIMD_UNDEFINED) {
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(WIN_NEON)
|
#if defined(_MSC_VER)
|
||||||
|
#if defined(_M_ARM64)
|
||||||
if (isProcessorFeaturePresent(PF_ARM64_SVE)) {
|
if (isProcessorFeaturePresent(PF_ARM64_SVE)) {
|
||||||
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
}
|
}
|
||||||
#elif defined(WIN_SIMD)
|
#else
|
||||||
int cpu_info[4] = { 0 };
|
int cpu_info[4] = { 0 };
|
||||||
__cpuid(cpu_info, 7);
|
__cpuid(cpu_info, 7);
|
||||||
if ((cpu_info[1] & (1 << 16)) != 0) {
|
if ((cpu_info[1] & (1 << 16)) != 0) {
|
||||||
|
|
@ -65,12 +62,31 @@ HallocySimdType hallocy_is_simd_supported(void) {
|
||||||
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
|
hallocy_supported_simd = HALLOCY_SIMD_SSE2;
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_NEON)
|
#endif
|
||||||
if (getauxval(16) & (1 << 12)) {
|
#else
|
||||||
|
#if defined(__aarch64__) || defined(__arm__)
|
||||||
|
int file_descriptor = open("/proc/cpuinfo", O_READONLY);
|
||||||
|
if (file_descriptor == -1) {
|
||||||
|
return hallocy_supported_simd;
|
||||||
|
}
|
||||||
|
|
||||||
|
char buffer[256];
|
||||||
|
int bytes_read = read(file_descriptor, buffer, sizeof(buffer));
|
||||||
|
while (bytes_read > 0) {
|
||||||
|
for (size_t i = 0; i < bytes_read - 4; i++) {
|
||||||
|
if (buffer[i] == 'n' && buffer[i + 1] == 'e' && buffer[i + 2] == 'o' && buffer[i + 3] == 'n') {
|
||||||
|
close(file_descriptor);
|
||||||
|
|
||||||
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
hallocy_supported_simd = HALLOCY_SIMD_NEON;
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
}
|
}
|
||||||
#elif defined(LIN_SIMD)
|
}
|
||||||
|
|
||||||
|
bytes_read = read(file_descriptor, buffer, sizeof(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
|
close(file_descriptor);
|
||||||
|
#else
|
||||||
unsigned int a, b, c, d;
|
unsigned int a, b, c, d;
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
"cpuid"
|
"cpuid"
|
||||||
|
|
@ -109,6 +125,7 @@ HallocySimdType hallocy_is_simd_supported(void) {
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
hallocy_supported_simd = HALLOCY_SIMD_NONE;
|
hallocy_supported_simd = HALLOCY_SIMD_NONE;
|
||||||
return hallocy_supported_simd;
|
return hallocy_supported_simd;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue