#ifndef FALCONMAP_H
#define FALCONMAP_H
#include <arm_sve.h>
#include <unordered_map>
#include <arm_neon.h>
#include <arm_acle.h>
#include <string>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <iostream>
#if !defined(__ARM_FEATURE_SVE)
#error "This code requires ARM SVE support"
#endif
/// Compares two byte ranges of equal length using SVE vector loads.
///
/// @param p    First byte range; must be readable for `len` bytes.
/// @param q    Second byte range; must be readable for `len` bytes.
/// @param len  Number of bytes to compare. May be any value, including 0 and
///             lengths that are not a multiple of the SVE vector size.
/// @return 1 if all `len` bytes match (or `len` is 0), 0 otherwise.
static inline uint8_t check_equal(const uint8_t* p, const uint8_t* q, uint32_t len)
{
    // 64-bit offsets so that `offset += step` can never wrap, even for a
    // `len` close to UINT32_MAX.
    const uint64_t total = len;
    const uint64_t step = svcntb();
    for (uint64_t offset = 0; offset < total; offset += step) {
        // Lanes at or beyond `total` are inactive: they are neither loaded nor
        // compared, so the final partial vector never touches memory past the
        // end of either buffer.
        const svbool_t pg = svwhilelt_b8_u64(offset, total);
        const svuint8_t a = svld1_u8(pg, p + offset);
        const svuint8_t b = svld1_u8(pg, q + offset);
        const svbool_t diff = svcmpne_u8(pg, a, b);
        if (svptest_any(pg, diff)) {
            return 0;
        }
    }
    return 1;
}
/// Scrambles `x` and folds `y` into the result.
///
/// `x` is advanced by a fixed increment and then passed through two
/// xor-shift/multiply rounds and a final xor-shift (the same constants and
/// shift amounts as the SplitMix64 output function). `y` is XORed into the
/// scrambled value unchanged.
///
/// @param x  Value to scramble.
/// @param y  Value XORed into the scrambled result.
/// @return   scramble(x) ^ y.
inline uint64_t mix(uint64_t x, uint64_t y) {
    constexpr uint64_t kIncrement = 0x9e3779b97f4a7c15ULL;
    constexpr uint64_t kFirstMultiplier = 0xbf58476d1ce4e5b9ULL;
    constexpr uint64_t kSecondMultiplier = 0x94d049bb133111ebULL;

    uint64_t z = x + kIncrement;
    z ^= z >> 30;
    z *= kFirstMultiplier;
    z ^= z >> 27;
    z *= kSecondMultiplier;
    z ^= z >> 31;
    return z ^ y;
}
/// Hash functor for keys that expose a contiguous byte range through
/// `data()` and `size()`.
///
/// The result depends only on the key's bytes and length (and on the SVE
/// vector length of the machine), never on the address at which the bytes
/// happen to be stored. Two keys with identical contents therefore always hash
/// to the same value, which is what hash containers require of a hasher.
///
/// Layout of the computation over the `len` key bytes:
///   1. whole SVE vectors of 64-bit words are folded into a vector state;
///   2. remaining whole 64-bit words are folded in with `mix`;
///   3. the last `len % 8` bytes are folded in with CRC32C;
///   4. a final `mix` spreads the bits.
///
/// @tparam K  Key type providing `data()` and `size()`.
template <typename K>
struct FalconHash {
    /// @param k  Key to hash.
    /// @return   0 for an empty key or a null data pointer, otherwise the hash.
    std::size_t operator()(const K& k) const {
        const uint8_t* cursor = reinterpret_cast<const uint8_t*>(k.data());
        const size_t len = k.size();
        if (len == 0 || cursor == nullptr) {
            return 0;
        }
        __builtin_prefetch(cursor, 0, 3);

        // svcntd() is the number of 64-bit lanes per vector, not a byte count.
        const size_t lanes = svcntd();
        const size_t vec_bytes = lanes * sizeof(uint64_t);
        const svbool_t all_bytes = svptrue_b8();
        const svbool_t all_lanes = svptrue_b64();

        svuint64_t state = svdup_u64(0x1234567890abcdefULL);
        svuint64_t mixer = svdup_u64(0x9e3779b97f4a7c15ULL);

        size_t num_qwords = len / 8;
        while (num_qwords >= lanes) {
            // Load as bytes and reinterpret: a byte load has no alignment
            // requirement, so the key may start at any address. On a
            // little-endian target this yields the same lanes as a 64-bit
            // element load of the same memory.
            const svuint64_t vals =
                svreinterpret_u64_u8(svld1_u8(all_bytes, cursor));
            state = sveor_u64_x(all_lanes, state, vals);
            state = svmul_u64_x(all_lanes, state, mixer);
            // The multiplier changes every round so that identical vectors at
            // different positions contribute differently.
            const uint64_t m = svaddv_u64(all_lanes, mixer);
            mixer = svdup_u64(m + 0x9e3779b97f4a7c15ULL);
            cursor += vec_bytes;
            num_qwords -= lanes;
        }

        // Seed with the length, then fold the vector state down to a scalar.
        uint64_t hash = static_cast<uint64_t>(len) ^ svaddv_u64(all_lanes, state);

        // Whole 64-bit words that did not fill a complete vector.
        for (; num_qwords > 0; --num_qwords) {
            uint64_t word;
            std::memcpy(&word, cursor, sizeof(word));  // alignment-safe load
            hash = mix(hash, word);
            cursor += sizeof(word);
        }

        // Trailing bytes. NOTE: the CRC step operates on the low 32 bits of
        // the running hash, so the upper 32 bits are dropped whenever at least
        // one tail byte exists; the final mix() re-expands the value.
        const size_t tail_bytes = len % 8;
        for (size_t i = 0; i < tail_bytes; ++i) {
            hash = __crc32cb(static_cast<uint32_t>(hash), cursor[i]);
        }

        hash = mix(hash, hash >> 32);
        return static_cast<std::size_t>(hash);
    }
};
/// Equality functor for two (possibly different) key types that expose a
/// contiguous byte range through `data()` and `size()`.
///
/// Keys are equal when they have the same size and identical bytes. Whole SVE
/// vectors are compared with `check_equal`; the remaining bytes (fewer than
/// one vector) are compared with `memcmp`.
///
/// @tparam K1  Type of the left-hand key.
/// @tparam K2  Type of the right-hand key.
template <typename K1, typename K2>
struct FalconEqual {
    /// @return true if `a` and `b` hold the same byte sequence. Two empty
    ///         keys are equal; a non-empty key with a null data pointer is
    ///         never equal to anything.
    bool operator()(const K1 &a, const K2 &b) const {
        if (a.size() != b.size()) {
            return false;
        }
        // Keep the full width of size(): narrowing to 32 bits would compare
        // only `size mod 2^32` bytes for very large keys.
        std::size_t len = a.size();
        if (len == 0) {
            // Nothing to compare, and the data pointers may legitimately be
            // null here, so do not pass them to memcmp.
            return true;
        }
        const uint8_t* p = reinterpret_cast<const uint8_t*>(a.data());
        const uint8_t* q = reinterpret_cast<const uint8_t*>(b.data());
        if (p == nullptr || q == nullptr) {
            return false;
        }
        const uint32_t sve_vec_bytes = static_cast<uint32_t>(svcntb());
        // Only whole vectors are handed to check_equal.
        while (len >= sve_vec_bytes) {
            if (!check_equal(p, q, sve_vec_bytes)) {
                return false;
            }
            p += sve_vec_bytes;
            q += sve_vec_bytes;
            len -= sve_vec_bytes;
        }
        return len == 0 || std::memcmp(p, q, len) == 0;
    }
};
#endif