* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*
* hll.cpp
* Realize the basic functions of HLL algorithm, which are called by hll_function and hll_mpp.
*
* IDENTIFICATION
* src/gausskernel/cbb/utils/hll/hll.cpp
*
* -------------------------------------------------------------------------
*/
#include <postgres.h>
#include <math.h>
#include "miscadmin.h"
#include "utils/bloom_filter.h"
#include "utils/hll.h"
static filter::Murmur3 murmur3;
uint64_t HllHash(const void *key, const int len, const int seed)
{
Assert(key != NULL);
if (seed < 0) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("negative seed value is invalid")));
}
return (uint64_t)murmur3.hash64((const char *)key, len, seed);
}
void HllCheckPararange(const int32_t log2Registers, const int32_t log2Explicitsize, const int32_t log2Sparsesize,
const int32_t duplicateCheck)
{
if (log2Registers < HLL_LOG2_REGISTER_MIN || log2Registers > HLL_LOG2_REGISTER_MAX) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("log2m = %d is out of range, it should be in range %d to %d, or set -1 as default",
log2Registers, HLL_LOG2_REGISTER_MIN, HLL_LOG2_REGISTER_MAX)));
}
if (log2Explicitsize < HLL_LOG2_EXPLICIT_SIZE_MIN || log2Explicitsize > HLL_LOG2_EXPLICIT_SIZE_MAX) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("log2explicit = %d is out of range, it should be in range %d to %d, or set -1 as default",
log2Explicitsize, HLL_LOG2_EXPLICIT_SIZE_MIN, HLL_LOG2_EXPLICIT_SIZE_MAX)));
}
if (log2Sparsesize < HLL_LOG2_SPARSE_SIZE_MIN || log2Sparsesize > HLL_LOG2_SPARSE_SIZE_MAX) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("log2sparse = %d is out of range, it should be in range %d to %d, or set -1 as default",
log2Sparsesize, HLL_LOG2_SPARSE_SIZE_MIN, HLL_LOG2_SPARSE_SIZE_MAX)));
}
if (duplicateCheck < HLL_DUPLICATE_CHECK_MIN || duplicateCheck > HLL_DUPLICATE_CHECK_MAX) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("duplicatecheck = %d is out of range, it should be in range %d to %d, or set -1 as default",
duplicateCheck, HLL_DUPLICATE_CHECK_MIN, HLL_DUPLICATE_CHECK_MAX)));
}
}
void HllCheckParaequal(const HllPara hllpara1, const HllPara hllpara2)
{
if (hllpara1.log2Registers != hllpara2.log2Registers) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("log2m does not match: source is %d and dest is %d",
hllpara1.log2Registers, hllpara2.log2Registers)));
}
if (hllpara1.log2Explicitsize != hllpara2.log2Explicitsize) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION), errmsg("log2explicit does not match: source is %d and dest is %d",
hllpara1.log2Explicitsize, hllpara2.log2Explicitsize)));
}
if (hllpara1.log2Sparsesize != hllpara2.log2Sparsesize) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION), errmsg("log2sparse does not match: source is %d and dest is %d",
hllpara1.log2Sparsesize, hllpara2.log2Sparsesize)));
}
if (hllpara1.duplicateCheck != hllpara2.duplicateCheck) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION), errmsg("duplicatecheck does not match: source is %d and dest is %d",
hllpara1.duplicateCheck, hllpara2.duplicateCheck)));
}
}
void HllCopyPara(HllPara *hllpara1, const HllPara *hllpara2)
{
*hllpara1 = *hllpara2;
}
void HllParaSetDefault(HllPara *hllpara)
{
Assert(hllpara != NULL);
hllpara->log2Registers = HLL_LOG2_REGISTER;
hllpara->log2Explicitsize = HLL_LOG2_EXPLICIT_SIZE;
hllpara->log2Sparsesize = HLL_LOG2_SPARSE_SIZE;
hllpara->duplicateCheck = HLL_DUPLICATE_CHECK;
}
* pack hllpara to 32bit, we don't check the input para here, must be careful
* should run HllCheckPararange before
*/
int32_t HllParaPack(const HllPara hllpara)
{
return *(int32_t *)&hllpara;
}
void HllParaUnpack(int32_t typmod, HllPara *hllpara)
{
*hllpara = *(HllPara *)&typmod;
}
* pack a hllobject to uint8 stream
* hllsize is the length of hllbyte, hllsize = HLL_HEAD_SIZE + hllobject->lenUsed + 1
*/
void HllObjectPack(const HllObject hllobject, uint8_t *hllbyte, const size_t hllsize)
{
Assert(hllobject != NULL);
Assert(hllbyte != NULL);
if (hllsize <= (size_t)(HLL_HEAD_SIZE + hllobject->lenUsed)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hllsize = %zu is smaller than hllobjectsize=%d", hllsize, HLL_HEAD_SIZE + hllobject->lenUsed)));
}
if (hllobject->hlltype < HLL_EMPTY || hllobject->hlltype > HLL_FULL) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid hll type, type=%d", hllobject->hlltype)));
}
errno_t rc = memset_s(hllbyte, hllsize, '\0', hllsize);
securec_check(rc, "\0", "\0");
int i = 0;
rc = memcpy_s(hllbyte + i, sizeof(uint8_t) * HLL_FLAG_LEN, hllobject->hllHead, sizeof(uint8_t) * HLL_FLAG_LEN);
securec_check(rc, "\0", "\0");
i += HLL_FLAG_LEN;
uint64_t compressinfo = 0;
compressinfo |= (uint64_t)hllobject->hlltype;
compressinfo <<= HLL_PARA_BIT;
compressinfo |= (hllobject->hllpara.log2Registers - HLL_LOG2_REGISTER_MIN);
compressinfo <<= HLL_PARA_BIT;
compressinfo |= hllobject->hllpara.log2Explicitsize;
compressinfo <<= HLL_PARA_BIT;
compressinfo |= hllobject->hllpara.log2Sparsesize;
compressinfo <<= HLL_PARA_BIT;
compressinfo |= hllobject->hllpara.duplicateCheck;
compressinfo <<= HLL_LEN_BIT;
compressinfo |= hllobject->lenTotal;
compressinfo <<= HLL_LEN_BIT;
compressinfo |= hllobject->lenUsed;
rc = memcpy_s(hllbyte + i, sizeof(uint64_t), &compressinfo, sizeof(uint64_t));
securec_check(rc, "\0", "\0");
i += sizeof(uint64_t);
rc = memcpy_s(hllbyte + i, sizeof(double), &hllobject->NDVEstimator, sizeof(double));
securec_check(rc, "\0", "\0");
i += sizeof(double);
rc = memcpy_s(hllbyte + i, sizeof(uint64_t), &hllobject->lastHash, sizeof(uint64_t));
securec_check(rc, "\0", "\0");
i += sizeof(uint64_t);
if (hllobject->lenUsed > 0) {
rc = memcpy_s(hllbyte + HLL_HEAD_SIZE, hllobject->lenUsed, hllobject->hllData, hllobject->lenUsed);
securec_check(rc, "\0", "\0");
}
}
void HllObjectUnpack(HllObject hllobject, const uint8_t *hllbyte, const size_t hllsize, MemoryContext cxt)
{
Assert(hllobject != NULL);
Assert(hllbyte != NULL);
if (hllsize < HLL_HEAD_SIZE) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("not a hll type, size=%zu is not enough", hllsize)));
}
int i = 0;
errno_t rc;
rc = memcpy_s(hllobject->hllHead, HLL_FLAG_LEN, hllbyte + i, HLL_FLAG_LEN);
securec_check(rc, "\0", "\0");
i += HLL_FLAG_LEN;
if (strncmp(hllobject->hllHead, "HLL", HLL_FLAG_LEN) != 0) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("not a hll type, head is not match")));
}
uint64_t compressinfo = 0;
rc = memcpy_s(&compressinfo, sizeof(uint64_t), hllbyte + i, sizeof(uint64_t));
securec_check(rc, "\0", "\0");
i += sizeof(uint64_t);
hllobject->lenUsed = compressinfo & BITS_MAX_VALUE(HLL_LEN_BIT);
compressinfo >>= HLL_LEN_BIT;
hllobject->lenTotal = compressinfo & BITS_MAX_VALUE(HLL_LEN_BIT);
compressinfo >>= HLL_LEN_BIT;
hllobject->hllpara.duplicateCheck = compressinfo & BITS_MAX_VALUE(HLL_PARA_BIT);
compressinfo >>= HLL_PARA_BIT;
hllobject->hllpara.log2Sparsesize = compressinfo & BITS_MAX_VALUE(HLL_PARA_BIT);
compressinfo >>= HLL_PARA_BIT;
hllobject->hllpara.log2Explicitsize = compressinfo & BITS_MAX_VALUE(HLL_PARA_BIT);
compressinfo >>= HLL_PARA_BIT;
hllobject->hllpara.log2Registers = compressinfo & BITS_MAX_VALUE(HLL_PARA_BIT);
hllobject->hllpara.log2Registers += HLL_LOG2_REGISTER_MIN;
compressinfo >>= HLL_PARA_BIT;
hllobject->hlltype = (HllType)(compressinfo & BITS_MAX_VALUE(HLL_PARA_BIT));
if (hllobject->hlltype > HLL_FULL) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid hll type, type=%d", hllobject->hlltype)));
}
rc = memcpy_s(&hllobject->NDVEstimator, sizeof(double), hllbyte + i, sizeof(double));
securec_check(rc, "\0", "\0");
i += sizeof(double);
rc = memcpy_s(&hllobject->lastHash, sizeof(uint64_t), hllbyte + i, sizeof(uint64_t));
securec_check(rc, "\0", "\0");
i += sizeof(uint64_t);
if (hllsize <= (size_t)(HLL_HEAD_SIZE + hllobject->lenUsed)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hllsize = %zu is smaller than hllobjectsize=%d", hllsize, HLL_HEAD_SIZE + hllobject->lenUsed)));
}
if (hllobject->lenUsed > 0) {
MemoryContext old = MemoryContextSwitchTo(cxt);
hllobject->hllData = (uint8_t *)palloc0((hllobject->lenTotal) * sizeof(uint8_t));
(void)MemoryContextSwitchTo(old);
rc = memcpy_s(hllobject->hllData, hllobject->lenUsed, hllbyte + HLL_HEAD_SIZE, hllobject->lenUsed);
securec_check(rc, "\0", "\0");
}
}
Hll::Hll(MemoryContext context) : cxt(context) {}
Hll::~Hll() {}
void Hll::HllInit()
{
MemoryContext old = MemoryContextSwitchTo(cxt);
hllObject = (HllObjectData *)palloc0(sizeof(HllObjectData));
(void)MemoryContextSwitchTo(old);
errno_t rc = memcpy_s(hllObject->hllHead, HLL_FLAG_LEN, "HLL", HLL_FLAG_LEN);
securec_check(rc, "", "");
hllObject->hlltype = HLL_UNINIT;
HllParaSetDefault(&hllObject->hllpara);
hllObject->NDVEstimator = -1;
hllObject->lastHash = 0;
hllObject->hllData = NULL;
hllObject->lenTotal = 0;
hllObject->lenUsed = 0;
}
void Hll::HllFree()
{
pfree_ext(hllObject->hllData);
pfree_ext(hllObject);
}
void Hll::HllEmpty()
{
hllObject->hlltype = HLL_EMPTY;
hllObject->NDVEstimator = 0;
hllObject->lastHash = 0;
pfree_ext(hllObject->hllData);
hllObject->lenTotal = 0;
hllObject->lenUsed = 0;
}
void Hll::HllSetPara(const HllPara hllpara)
{
if (hllObject->hlltype > HLL_EMPTY) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION), errmsg("only can set parameters on an uninit/empty hll object")));
}
HllCopyPara(&hllObject->hllpara, &hllpara);
}
void Hll::HllAdd(const uint64_t hashval)
{
if (hllObject->hllpara.duplicateCheck && hashval == hllObject->lastHash) {
return;
}
HllSetRegisterValue(hashval);
hllObject->lastHash = hashval;
}
void Hll::HllUnion(Hll aHll)
{
if (aHll.hllObject->hlltype <= HLL_EMPTY) {
return;
}
if (hllObject->hlltype <= HLL_EMPTY) {
return HllCopy(aHll);
}
HllCheckParaequal(hllObject->hllpara, aHll.hllObject->hllpara);
if (aHll.hllObject->hlltype == HLL_EXPLICIT && hllObject->hlltype == HLL_EXPLICIT) {
HllComibineExplicit(aHll);
} else {
HllTransformFull();
aHll.HllTransformFull();
HllCombineFull(aHll);
}
}
double Hll::HllCardinality()
{
if (hllObject->NDVEstimator == -1) {
HllCalculateNDV();
}
return hllObject->NDVEstimator;
}
void Hll::HllObjectPack(uint8_t *hllbyte, const size_t hllsize)
{
::HllObjectPack(hllObject, hllbyte, hllsize);
}
void Hll::HllObjectUnpack(const uint8_t *hllbyte, const size_t hllsize)
{
::HllObjectUnpack(hllObject, hllbyte, hllsize, cxt);
}
char *Hll::HllPrint()
{
MemoryContext old = MemoryContextSwitchTo(cxt);
size_t len = 1024;
char *hllinfo = (char *)palloc0(len);
(void)MemoryContextSwitchTo(old);
errno_t rc = memset_s(hllinfo, len, '\0', len);
securec_check(rc, "\0", "\0");
switch (hllObject->hlltype) {
case HLL_UNINIT:
case HLL_EMPTY:
case HLL_EXPLICIT:
case HLL_SPARSE:
case HLL_FULL: {
rc = snprintf_s(hllinfo, len, len - 1,
"type=%d(%s), log2m=%d, log2explicit=%d, log2sparse=%d, duplicatecheck=%d",
hllObject->hlltype, hllTypename[hllObject->hlltype], hllObject->hllpara.log2Registers,
hllObject->hllpara.log2Explicitsize, hllObject->hllpara.log2Sparsesize,
hllObject->hllpara.duplicateCheck);
securec_check_ss(rc, "\0", "\0");
break;
}
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid hll type")));
}
return hllinfo;
}
HllType Hll::HllGetType()
{
return hllObject->hlltype;
}
uint8_t Hll::HllRegisters()
{
return hllObject->hllpara.log2Registers;
}
uint8_t Hll::HllExplicitsize()
{
return hllObject->hllpara.log2Explicitsize;
}
uint8_t Hll::HllSparsesize()
{
return hllObject->hllpara.log2Sparsesize;
}
uint8_t Hll::HllDuplicateCheck()
{
return hllObject->hllpara.duplicateCheck;
}
uint32_t Hll::HllDatalen()
{
return hllObject->lenUsed;
}
HllPara *Hll::HllGetPara()
{
return &hllObject->hllpara;
}
uint8_t Hll::HllGetByte(const uint32_t index)
{
if (index >= hllObject->lenUsed) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("hll index is out of range, current_size=%d", hllObject->lenUsed)));
}
return hllObject->hllData[index];
}
void Hll::HllSetByte(const uint32_t index, const uint8_t value)
{
if (index >= hllObject->lenUsed) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("hll index is out of range, current_size=%d", hllObject->lenUsed)));
}
hllObject->hllData[index] = value;
}
void Hll::HllDataExpand()
{
MemoryContext old = MemoryContextSwitchTo(cxt);
if (hllObject->lenTotal == 0) {
Assert(hllObject->hllData == NULL);
hllObject->hllData = (uint8_t *)palloc0(HLL_EXPAND_SIZE * sizeof(uint8_t));
} else {
Assert(hllObject->hllData != NULL);
uint32_t newSize = hllObject->lenTotal + HLL_EXPAND_SIZE;
hllObject->hllData = (uint8_t *)repalloc(hllObject->hllData, newSize * sizeof(uint8_t));
}
(void)MemoryContextSwitchTo(old);
hllObject->lenTotal += HLL_EXPAND_SIZE;
return;
}
void Hll::HllCopy(const Hll aHll)
{
switch (aHll.hllObject->hlltype) {
case HLL_EMPTY: {
HllCopyPara(&hllObject->hllpara, &aHll.hllObject->hllpara);
HllEmpty();
return;
}
case HLL_EXPLICIT:
case HLL_SPARSE:
case HLL_FULL: {
if (hllObject->lenTotal < aHll.hllObject->lenTotal) {
pfree_ext(hllObject->hllData);
MemoryContext old = MemoryContextSwitchTo(cxt);
hllObject->hllData = (uint8_t *)palloc0(aHll.hllObject->lenTotal * sizeof(uint8_t));
(void)MemoryContextSwitchTo(old);
hllObject->lenTotal = aHll.hllObject->lenTotal;
hllObject->lenUsed = aHll.hllObject->lenUsed;
} else {
hllObject->lenUsed = aHll.hllObject->lenUsed;
}
errno_t rc = memcpy_s(hllObject->hllData, hllObject->lenUsed, aHll.hllObject->hllData, hllObject->lenUsed);
securec_check(rc, "", "");
break;
}
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid hll type")));
}
HllCopyPara(&hllObject->hllpara, &aHll.hllObject->hllpara);
hllObject->hlltype = aHll.hllObject->hlltype;
hllObject->NDVEstimator = aHll.hllObject->NDVEstimator;
hllObject->lastHash = aHll.hllObject->lastHash;
}
* update HllObject value of index
* we simply replace original value of hllData by insert value, but the length may not be equal
* we will set hllData[index] = insertValue[0], hllData[index + 1] = insertValue[1], ...
* then hllData[index + insertLen] = hllData[index + originLen], ...
*/
void Hll::HllUpdateValue(const uint32_t index, const uint8_t *insertValue, const uint32_t insertLen,
const uint32_t originLen)
{
if (((originLen == 0) && (index > hllObject->lenUsed)) ||
((originLen > 0) && (index + originLen > hllObject->lenUsed))) {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("hll index is out of range, current_size=%d", hllObject->lenUsed)));
}
while (insertLen - originLen > (hllObject->lenTotal - hllObject->lenUsed)) {
HllDataExpand();
}
errno_t rc;
uint32_t moveLen = hllObject->lenUsed - index - originLen;
if ((insertLen != originLen) && (moveLen > 0)) {
rc =
memmove_s(hllObject->hllData + index + insertLen, moveLen, hllObject->hllData + index + originLen, moveLen);
securec_check(rc, "", "");
}
rc = memcpy_s(hllObject->hllData + index, insertLen, insertValue, insertLen);
securec_check(rc, "", "");
hllObject->lenUsed += insertLen - originLen;
}
void Hll::HllHashToValue(const uint64_t hashval, uint32_t *index, uint8_t *value)
{
uint8_t calzeroLen = INT64_LEN - hllObject->hllpara.log2Registers;
* ascending too */
*index = (uint32_t)(hashval >> calzeroLen);
*value = (uint8_t)(__builtin_ctzll(hashval) + 1);
if (*value > calzeroLen) {
*value = 0;
}
}
void Hll::HllSetRegisterValue(const uint64_t hashval)
{
switch (hllObject->hlltype) {
case HLL_UNINIT:
HllEmpty();
case HLL_EMPTY:
HllSetEmptyValue(hashval);
break;
case HLL_EXPLICIT:
HllSetExplicitValue(hashval);
break;
case HLL_SPARSE:
HllSetSparseValue(hashval);
break;
case HLL_FULL:
HllSetFullValue(hashval);
break;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid hll type")));
}
}
void Hll::HllSetEmptyValue(const uint64_t hashval)
{
Assert(hllObject->hlltype == HLL_EMPTY);
if (hllObject->hllpara.log2Explicitsize == 0 && hllObject->hllpara.log2Sparsesize == 0) {
HllTransformFull();
HllSetFullValue(hashval);
} else if (hllObject->hllpara.log2Explicitsize == 0) {
HllTransformSparse();
HllSetSparseValue(hashval);
} else {
hllObject->hlltype = HLL_EXPLICIT;
HllSetExplicitValue(hashval);
}
}
void Hll::HllSetExplicitValue(const uint64_t hashval)
{
Assert(hllObject->hlltype == HLL_EXPLICIT);
uint32_t explicitLen = hllObject->lenUsed / HLL_EXPLICIT_UINT;
uint64_t currentHash;
if ((hllObject->lenTotal - hllObject->lenUsed) < HLL_EXPLICIT_UINT) {
HllDataExpand();
}
uint32_t start = 0;
uint32_t last = explicitLen;
while (start < last) {
uint32_t mid = (start + last) / 2;
currentHash = HllGetExplictitValue(mid * HLL_EXPLICIT_UINT);
if (currentHash <= hashval) {
start = mid + 1;
} else {
last = mid;
}
}
uint32_t bindex = start;
uint8_t insertValue[HLL_EXPLICIT_UINT];
errno_t rc = memcpy_s(insertValue, sizeof(uint64_t), &hashval, sizeof(uint64_t));
securec_check(rc, "\0", "\0");
if ((bindex > 0) && (HllGetExplictitValue((bindex - 1) * HLL_EXPLICIT_UINT) == hashval)) {
return;
}
HllUpdateValue(HLL_EXPLICIT_UINT * bindex, insertValue, HLL_EXPLICIT_UINT, 0);
hllObject->NDVEstimator++;
HllExplicitCheck();
}
void Hll::HllSetSparseValue(const uint64_t hashval)
{
Assert(hllObject->hlltype == HLL_SPARSE);
uint32_t index = 0;
uint8_t value = 0;
HllHashToValue(hashval, &index, &value);
if (value == 0) {
return;
}
uint32_t byteIndex = 0;
uint32_t unitIndex = 0;
HllGetRegisterUnit(index, &byteIndex, &unitIndex);
HllUpdateSparseValue(byteIndex, unitIndex, value);
if (hllObject->lenUsed > BITS_TO_VALUE(hllObject->hllpara.log2Sparsesize)) {
HllTransformFull();
}
}
void Hll::HllSetFullValue(const uint64_t hashval)
{
Assert(hllObject->hlltype == HLL_FULL);
uint32_t index = 0;
uint8_t value = 0;
HllHashToValue(hashval, &index, &value);
if (value == 0) {
return;
}
if (value > HllGetByte(index)) {
HllSetByte(index, value);
hllObject->NDVEstimator = -1;
}
}
void Hll::HllTransformFull()
{
MemoryContext old = MemoryContextSwitchTo(cxt);
uint8_t *newdata = (uint8_t *)palloc0(BITS_TO_VALUE(hllObject->hllpara.log2Registers) * sizeof(uint8_t));
(void)MemoryContextSwitchTo(old);
switch (hllObject->hlltype) {
case HLL_EMPTY:
break;
case HLL_EXPLICIT: {
for (uint32_t i = 0; i < hllObject->lenUsed; i += HLL_EXPLICIT_UINT) {
uint64_t hashval = HllGetExplictitValue(i);
uint32_t index = 0;
uint8_t value = 0;
HllHashToValue(hashval, &index, &value);
newdata[index] = value;
}
break;
}
case HLL_SPARSE: {
uint32_t index = 0;
for (uint32_t i = 0; i < hllObject->lenUsed; i++) {
uint8_t byteValue = HllGetByte(i);
switch (BYTE_TYPE(byteValue)) {
case SPARSE_LONGZERO:
index += BYTE_GET_VALUE(byteValue) * HLL_LONGZERO_UNIT;
break;
case SPARSE_ZERO:
index += BYTE_GET_VALUE(byteValue);
break;
case DENSE_VALUE:
newdata[index++] = BYTE_GET_VALUE(byteValue);
break;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll register type")));
}
}
Assert(index == BITS_TO_VALUE(hllObject->hllpara.log2Registers));
break;
}
case HLL_FULL:
pfree_ext(newdata);
return;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll type")));
}
pfree_ext(hllObject->hllData);
hllObject->hllData = newdata;
hllObject->hlltype = HLL_FULL;
hllObject->lenUsed = BITS_TO_VALUE(hllObject->hllpara.log2Registers);
hllObject->lenTotal = hllObject->lenUsed;
}
void Hll::HllTransformSparse()
{
switch (hllObject->hlltype) {
case HLL_EMPTY:
HllEmptyToSparse();
break;
case HLL_EXPLICIT:
HllExplicitToSparse();
break;
case HLL_SPARSE:
break;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll type")));
}
}
void Hll::HllExplicitCheck()
{
Assert(hllObject->hlltype == HLL_EXPLICIT);
if (hllObject->lenUsed > BITS_TO_VALUE(hllObject->hllpara.log2Explicitsize)) {
if (hllObject->hllpara.log2Sparsesize > hllObject->hllpara.log2Explicitsize) {
HllTransformSparse();
if (hllObject->lenUsed > BITS_TO_VALUE(hllObject->hllpara.log2Sparsesize)) {
HllTransformFull();
}
} else {
HllTransformFull();
}
}
}
void Hll::HllEmptyToSparse()
{
Assert(hllObject->hlltype == HLL_EMPTY);
Assert(hllObject->hllData == NULL);
int log2byte = hllObject->hllpara.log2Registers - HLL_REGISTER_BITS - HLL_LOG2_LONGZERO_UNIT;
const int nbyte = (log2byte > 0) ? BITS_TO_VALUE(log2byte) : 1;
MemoryContext old = MemoryContextSwitchTo(cxt);
if (nbyte > 1) {
hllObject->hllData = (uint8_t *)palloc0(nbyte * sizeof(uint8_t));
errno_t rc = memset_s(hllObject->hllData, nbyte, HLL_LONGZERO_FULL, nbyte);
securec_check(rc, "", "");
hllObject->lenTotal = nbyte;
} else {
hllObject->hllData = (uint8_t *)palloc0(sizeof(uint8_t));
*hllObject->hllData = SET_LONGZERO_VALUE(BITS_TO_VALUE(hllObject->hllpara.log2Registers));
hllObject->lenTotal = 1;
}
(void)MemoryContextSwitchTo(old);
hllObject->lenUsed = hllObject->lenTotal;
hllObject->hlltype = HLL_SPARSE;
}
void Hll::HllExplicitToSparse()
{
Assert(hllObject->hlltype == HLL_EXPLICIT);
uint32_t zerocount = 0;
int lastindex = -1;
uint32_t sparseindex = 0;
* nbyte-1 bytes HLL_LONGZERO_FULL, 1-byte SPARSE_LONGZERO,
* 1-byte SPARSE_ZERO, 1-byte DENSE_VALUE */
int log2byte = hllObject->hllpara.log2Registers - HLL_REGISTER_BITS - HLL_LOG2_LONGZERO_UNIT;
const int nbyte = (log2byte > 0) ? BITS_TO_VALUE(log2byte) : 1;
uint32_t sparseMinsize = nbyte + 2;
uint32_t sparsesize = sparseMinsize;
MemoryContext old = MemoryContextSwitchTo(cxt);
uint8_t *newdata = (uint8_t *)palloc0(sparsesize * sizeof(uint8_t));
uint64_t *explicitdata = (uint64_t *)hllObject->hllData;
uint32_t explicitlen = hllObject->lenUsed / HLL_EXPLICIT_UINT;
for (uint32_t i = 0; i < explicitlen; i++) {
uint32_t index = 0;
uint8_t value = 0;
HllHashToValue(explicitdata[i], &index, &value);
if (value == 0) {
continue;
}
Assert((int)index >= lastindex);
uint8_t newvalue = SET_DENSE_VALUE(value);
if ((int)index == lastindex) {
if (newdata[sparseindex - 1] < newvalue) {
newdata[sparseindex - 1] = newvalue;
}
} else {
zerocount = index - lastindex - 1;
sparseindex += HllSetSparseZero(newdata + sparseindex, zerocount);
newdata[sparseindex++] = newvalue;
lastindex = index;
if (sparseindex + sparseMinsize > sparsesize) {
sparsesize += HLL_EXPAND_SIZE;
newdata = (uint8_t *)repalloc(newdata, sparsesize * sizeof(uint8_t));
}
}
}
(void)MemoryContextSwitchTo(old);
zerocount = BITS_TO_VALUE(hllObject->hllpara.log2Registers) - lastindex - 1;
sparseindex += HllSetSparseZero(newdata + sparseindex, zerocount);
pfree_ext(hllObject->hllData);
hllObject->hllData = newdata;
hllObject->lenTotal = sparsesize;
hllObject->lenUsed = sparseindex;
hllObject->hlltype = HLL_SPARSE;
}
uint32_t Hll::HllSetSparseZero(uint8_t *hlldata, uint32_t zerocount)
{
uint32_t index = 0;
if (zerocount >= HLL_LONGZERO_UNIT) {
int fullnum = zerocount >> (HLL_REGISTER_BITS + HLL_LOG2_LONGZERO_UNIT);
if (fullnum > 0) {
errno_t rc = memset_s(hlldata, fullnum, HLL_LONGZERO_FULL, fullnum);
securec_check(rc, "", "");
index += fullnum;
}
zerocount &= 0xfff;
if (zerocount >= HLL_LONGZERO_UNIT) {
hlldata[index++] = SET_LONGZERO_VALUE(zerocount);
zerocount &= BITS_MAX_VALUE(HLL_LOG2_LONGZERO_UNIT);
}
}
if (zerocount > 0) {
hlldata[index++] = SET_ZERO_VALUE(zerocount);
}
return index;
}
uint64_t Hll::HllGetExplictitValue(const uint32_t index)
{
uint64_t *explicitdata = (uint64_t *)hllObject->hllData;
return explicitdata[index / HLL_EXPLICIT_UINT];
}
* for sparse mode, if we want to locate the index of an register, it's on hllData[byteIndex],
* since a byte can save several register, we set the unitIndex.
* for example, if hllData = 00111111 00111111 00111111 00111111,
* which saves (255+1)*64*4=16384 registers, and if we set registerIndex = 256 * 64 + 100,
* then the byteIndex = 1, unitIndex = 100.
* must be careful for type SPARSE_LONGZERO, whose units is 64.
*/
void Hll::HllGetRegisterUnit(const uint32_t registerIndex, uint32_t *byteIndex, uint32_t *unitIndex)
{
if (registerIndex == 0) {
*byteIndex = 0;
*unitIndex = 0;
return;
}
uint32_t currentRegisterSum = 0;
uint32_t currentRegister = 0;
uint32_t i;
for (i = 0; i < hllObject->lenUsed; i++) {
const uint8_t byteValue = HllGetByte(i);
switch (BYTE_TYPE(byteValue)) {
case SPARSE_LONGZERO:
currentRegister = BYTE_GET_VALUE(byteValue) << HLL_LOG2_LONGZERO_UNIT;
break;
case SPARSE_ZERO:
currentRegister = BYTE_GET_VALUE(byteValue);
break;
case DENSE_VALUE:
currentRegister = 1;
break;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll register type")));
}
currentRegisterSum += currentRegister;
if (currentRegisterSum > registerIndex) {
break;
}
}
if (i == hllObject->lenUsed) {
if (registerIndex >= BITS_TO_VALUE(hllObject->hllpara.log2Registers)) {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("hll index is out of range")));
} else {
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("hllData missed in sparse mode")));
}
}
*byteIndex = i;
*unitIndex = registerIndex - (currentRegisterSum - currentRegister);
}
* update hllObject in sparsemode
* return false indicates no update
* must nmake sure that longzero type is in front of zero type,
* so the continuous 2 bytes can never be SPARSE_ZERO SPARSE_LONGZERO
*/
void Hll::HllUpdateSparseValue(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue)
{
if (newValue == 0) {
return;
}
uint8_t byteValue = HllGetByte(byteIndex);
switch (BYTE_TYPE(byteValue)) {
case SPARSE_LONGZERO:
return HllUpdateSparseType1(byteIndex, unitIndex, newValue);
case SPARSE_ZERO:
return HllUpdateSparseType2(byteIndex, unitIndex, newValue);
case DENSE_VALUE:
return HllUpdateSparseType3(byteIndex, unitIndex, newValue);
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll register type")));
}
}
void Hll::HllUpdateSparseType1(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue)
{
uint8_t byteValue = HllGetByte(byteIndex);
Assert(BYTE_TYPE(byteValue) == SPARSE_LONGZERO);
if (newValue == 0) {
return;
}
* for example, if we update 00000011(means there are 4*64 zero_register), and unitIndex = 64*2+1
* so there are 2*64+1 zero_register left, 1*64+62 right */
uint32_t left = unitIndex;
uint32_t right = BYTE_GET_VALUE(byteValue) * HLL_LONGZERO_UNIT - left - 1;
* for worst condition, SPARSE_LONGZERO -> SPARSE_LONGZERO+SPARSE_ZERO+DENSE_VALUE+SPARSE_LONGZERO+SPARSE_ZERO */
uint8_t newByte[5];
int index = 0;
int originLen = 1;
if (left >= HLL_LONGZERO_UNIT) {
HllSetSparseByte(&newByte[index++], left, 0);
left &= BITS_MAX_VALUE(HLL_LOG2_LONGZERO_UNIT);
}
if (left > 0) {
HllSetSparseByte(&newByte[index++], left, 0);
}
HllSetSparseByte(&newByte[index++], 1, newValue);
if (byteIndex < hllObject->lenUsed - 1) {
uint8_t laterByte = HllGetByte(byteIndex + 1);
if (BYTE_TYPE(laterByte) == SPARSE_ZERO) {
originLen++;
right += BYTE_GET_VALUE(laterByte);
}
}
if (right >= HLL_LONGZERO_UNIT) {
HllSetSparseByte(&newByte[index++], right, 0);
right &= BITS_MAX_VALUE(HLL_LOG2_LONGZERO_UNIT);
}
if (right > 0) {
HllSetSparseByte(&newByte[index++], right, 0);
}
HllUpdateValue(byteIndex, newByte, index, originLen);
hllObject->NDVEstimator = -1;
}
void Hll::HllUpdateSparseType2(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue)
{
uint8_t byteValue = HllGetByte(byteIndex);
Assert(BYTE_TYPE(byteValue) == SPARSE_ZERO);
if (newValue == 0) {
return;
}
* for example, if we update 01000011(means there are 4 zero_register), and unitIndex = 2
* so there are 2 zero_register left, 1 right */
uint32_t left = unitIndex;
uint32_t right = BYTE_GET_VALUE(byteValue) - left - 1;
* for worst condition, SPARSE_ZERO -> SPARSE_ZERO+DENSE_VALUE+SPARSE_ZERO */
uint8_t newByte[3] = {0};
int index = 0;
if (left > 0) {
HllSetSparseByte(&newByte[index++], left, 0);
}
HllSetSparseByte(&newByte[index++], 1, newValue);
if (right > 0) {
HllSetSparseByte(&newByte[index++], right, 0);
}
HllUpdateValue(byteIndex, newByte, index, 1);
hllObject->NDVEstimator = -1;
}
void Hll::HllUpdateSparseType3(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue)
{
uint8_t byteValue = HllGetByte(byteIndex);
Assert(BYTE_TYPE(byteValue) == DENSE_VALUE);
Assert(unitIndex == 0);
if (newValue <= BYTE_GET_VALUE(byteValue)) {
return;
}
HllSetByte(byteIndex, SET_DENSE_VALUE(newValue));
hllObject->NDVEstimator = -1;
}
void Hll::HllSetSparseByte(uint8_t *destByte, const uint32_t count, const uint8_t value)
{
Assert(count != 0);
if (value == 0) {
if (count >= HLL_LONGZERO_UNIT) {
*destByte = SET_LONGZERO_VALUE(count);
} else {
*destByte = SET_ZERO_VALUE(count);
}
} else if (count == 1) {
*destByte = SET_DENSE_VALUE(value);
} else {
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION), errmsg("can't set sparse byte cause hll data is out of range")));
}
}
void Hll::HllCombineFull(Hll aHll)
{
Assert(hllObject->hlltype == HLL_FULL);
Assert(aHll.hllObject->hlltype == HLL_FULL);
for (uint32_t i = 0; i < hllObject->lenUsed; i++) {
if (hllObject->hllData[i] < aHll.hllObject->hllData[i]) {
hllObject->hllData[i] = aHll.hllObject->hllData[i];
}
}
hllObject->NDVEstimator = -1;
hllObject->lastHash = 0;
}
void Hll::HllComibineExplicit(Hll aHll)
{
Assert(hllObject->hlltype == HLL_EXPLICIT);
Assert(aHll.hllObject->hlltype == HLL_EXPLICIT);
MemoryContext old = MemoryContextSwitchTo(cxt);
uint8_t *newdata = (uint8_t *)palloc0((hllObject->lenUsed + aHll.hllObject->lenUsed) * sizeof(uint8_t));
(void)MemoryContextSwitchTo(old);
uint32_t p1 = 0;
uint32_t p2 = 0;
uint32_t p = 0;
errno_t rc;
uint32_t len1 = hllObject->lenUsed;
uint32_t len2 = aHll.hllObject->lenUsed;
while ((p1 < len1) && (p2 < len2)) {
uint64_t hashval1 = HllGetExplictitValue(p1);
uint64_t hashval2 = aHll.HllGetExplictitValue(p2);
if (hashval1 < hashval2) {
rc = memcpy_s(newdata + p, HLL_EXPLICIT_UINT, hllObject->hllData + p1, HLL_EXPLICIT_UINT);
p1 += HLL_EXPLICIT_UINT;
} else if (hashval1 > hashval2) {
rc = memcpy_s(newdata + p, HLL_EXPLICIT_UINT, aHll.hllObject->hllData + p2, HLL_EXPLICIT_UINT);
p2 += HLL_EXPLICIT_UINT;
} else {
rc = memcpy_s(newdata + p, HLL_EXPLICIT_UINT, hllObject->hllData + p1, HLL_EXPLICIT_UINT);
p1 += HLL_EXPLICIT_UINT;
p2 += HLL_EXPLICIT_UINT;
}
p += HLL_EXPLICIT_UINT;
securec_check(rc, "", "");
}
uint32_t leftLen = len1 + len2 - p1 - p2;
if (p1 < len1) {
rc = memcpy_s(newdata + p, leftLen, hllObject->hllData + p1, leftLen);
securec_check(rc, "", "");
} else if (p2 < len2) {
rc = memcpy_s(newdata + p, leftLen, aHll.hllObject->hllData + p2, leftLen);
securec_check(rc, "", "");
}
p += leftLen;
pfree_ext(hllObject->hllData);
hllObject->hllData = newdata;
hllObject->lenTotal = len1 + len2;
hllObject->lenUsed = p;
hllObject->lastHash = 0;
hllObject->NDVEstimator = hllObject->lenUsed / HLL_EXPLICIT_UINT;
HllExplicitCheck();
}
void Hll::HllGetHistogram(uint32_t *histogram)
{
uint32_t i;
uint8_t value;
switch (hllObject->hlltype) {
case HLL_EMPTY:
return;
case HLL_EXPLICIT: {
for (i = 0; i < hllObject->lenUsed; i += HLL_EXPLICIT_UINT) {
uint64_t hashval = HllGetExplictitValue(i);
uint32_t index = 0;
uint8_t value = 0;
HllHashToValue(hashval, &index, &value);
histogram[value]++;
}
histogram[0] = BITS_TO_VALUE(hllObject->hllpara.log2Registers) - hllObject->lenUsed / HLL_EXPLICIT_UINT;
return;
}
case HLL_SPARSE: {
for (i = 0; i < hllObject->lenUsed; i++) {
value = HllGetByte(i);
switch (BYTE_TYPE(value)) {
case SPARSE_LONGZERO:
histogram[0] += BYTE_GET_VALUE(value) * HLL_LONGZERO_UNIT;
break;
case SPARSE_ZERO:
histogram[0] += BYTE_GET_VALUE(value);
break;
case DENSE_VALUE:
histogram[BYTE_GET_VALUE(value)]++;
break;
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll register type")));
}
}
return;
}
case HLL_FULL: {
for (i = 0; i < hllObject->lenUsed; i++) {
histogram[HllGetByte(i)]++;
}
return;
}
default:
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("undefined hll type")));
}
}
double Hll::HllGetTau(double x)
{
if (x == 0 || x == 1) {
return 0;
}
double y = 1;
double z = 1 - x;
double tau = 0;
while (abs(tau - z) > DBL_EPSILON) {
CHECK_FOR_INTERRUPTS();
x = sqrt(x);
tau = z;
y *= 0.5;
z -= (1 - x) * (1 - x) * y;
}
return tau / 3;
}
double Hll::HllGetSigma(double x)
{
if (x == 1) {
return INFINITY;
}
double y = 1;
double z = x;
double sigma = 0;
while (abs(sigma - z) > DBL_EPSILON) {
CHECK_FOR_INTERRUPTS();
x *= x;
sigma = z;
z += x * y;
y *= 2;
}
return sigma;
}
void Hll::HllCalculateNDV()
{
int i;
* something to repair it */
switch (hllObject->hlltype) {
case HLL_EMPTY:
hllObject->NDVEstimator = 0;
return;
case HLL_EXPLICIT:
* an estimated value, not exact value */
case HLL_SPARSE:
case HLL_FULL: {
uint32_t histogram[64] = { 0 };
HllGetHistogram(histogram);
uint32_t finalIndex = INT64_LEN - hllObject->hllpara.log2Registers;
double registers = BITS_TO_VALUE(hllObject->hllpara.log2Registers);
double z = 0;
z += registers * HllGetTau(1 - histogram[finalIndex + 1] / registers);
for (i = finalIndex; i > 0; i--) {
z += histogram[i];
z *= 0.5;
}
z += registers * HllGetSigma(histogram[0] / registers);
hllObject->NDVEstimator = HLL_ALPHA_INF * registers * registers / z;
return;
}
default:
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("undefined hll type")));
}
}