* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*
* hll.h
* Realize the basic functions of HLL algorithm, which are called by hll_function and hll_mpp.
*
* The original HyperLogLog algorithm was proposed by Flajolet P[1], which were modified by Google engineers[2].
* Redis HLL[3] set sparse and dense mode to save hll object, in order to save memory; while Postgres HLL is much less
* complex. We get a balance from them and privide our new HLL design. Moreover, the cardinality estimation approach
* used in our HLL is derived from Otmar Ertl[4].
* [1] Flajolet P, Fusy É, Gandouet O, et al. Hyperloglog: the analysis of a near-optimal cardinality estimation
* algorithm[C]. 2007.
* [2] Heule S, Nunkesser M, Hall A. HyperLogLog in practice: algorithmic engineering of a state of the art cardinality
* estimation algorithm[C]//Proceedings of the 16th International Conference on Extending Database Technology.
* 2013: 683-692.
* [3] Salvatore Sanlippo. Redis new data structure: The HyperLogLog. http://antirez.com/news/75, 2014.
* [4] Ertl O. New cardinality estimation algorithms for hyperloglog sketches[J]. arXiv preprint arXiv:1702.01284, 2017.
*
* The design of HyperLogLog is as follows:
* 1. Use MurmurHash3 to get a 64-bit hash value
* 2. We use the first m bits as an index, and calculate how many continuous zero ocuured in left bits, which is set as
* value. That means for each hash value, we get a (key, value), which is also named as register in some papers.
* 3. We use three type to save (key, value) pairs, which are defined in enum HllType: HLL_EXPLICIT/HLL_SPARSE/HLL_FULL
* a. In HLL_EXPLICIT mode, each 8-byte saves a hashvalue. We make sure the hashvalue saved in HLL_EXPLICIT mode is
* in order, that is helpful in hll_add and hll_union function.
* b. In HLL_SPARSE mode, we set 4 type to saves values. Since each value ranges from 0 to (64-m), so 6bits is enough
* to save a value, but it's better to use 8 bits (1 byte) for locating and calculation. So we use 8 bits to saves
* it, the first 2 bits are treated as flag, which is defined in enum RegisterType, the left 6 bits save values.
* SPARSE_LONGZERO: 00(count/64-1). Units is 64, used to save zero registers. For example, 00000111 means there
* are (7+1)*64 zero registers.
* SPARSE_ZERO: 01(count-1). Units is 1, used to save zero registers. For example, 01000111 means there are (7+1)
* zero registers.
* DENSE_VALUE: 10(value-1), used to save one not-zero register. For example, 10000111 means there are one
* register, which value is 7+1.
* SPARSE_VALUE: 11(value-1)(4bit)(count-1)(2bit), used to save several registers. For example, 11000111 means
* there are 4 registers, each value is 2.
* c. In HLL_FULL mode, we create an array of length (2^m), and save each value in index key.
* 4. We use variable-length uint8 array to save hllData, when create a new hll, we set hllData nullptr. We use a
* parameter named LenTotal to record the size of hllData, and lenUsed to record how many bytes are used to save
* (key, value) pairs. If lenLeft=(LenTotal-lenUsed) is not enough, expand hllData.
* 5. Cardinality estimation:
* case HLL_EMPTY: 0.
* case HLL_EXPLICIT: lenUsed/8. Since each 8-byte saves a hashvalue.
* other case: a new approach drived in Otmar Ertl's paper.
*
* IDENTIFICATION
* src/include/utils/hll.h
*
* -------------------------------------------------------------------------
*/
#ifndef HLL_H
#define HLL_H
#include "fmgr.h"
#include <float.h>
#ifndef DBL_EPSILON
#define DBL_EPSILON 2.2204460492503131e-16
#endif
enum HllType {
HLL_UNINIT = 0,
HLL_EMPTY,
HLL_EXPLICIT,
HLL_SPARSE,
HLL_FULL,
HLL_UNDEFINED
};
const char hllTypename[6][15] = {"HLL_UNINIT", "HLL_EMPTY", "HLL_EXPLICIT", "HLL_SPARSE", "HLL_FULL", "HLL_UNDEFINED"};
enum RegisterType {
SPARSE_LONGZERO = 0,
SPARSE_ZERO,
DENSE_VALUE,
SPARSE_VALUE,
};
#define BYTEVAL_LENGTH(byteval) (VARSIZE(byteval) - VARHDRSZ)
#define BITS_TO_VALUE(bits) ((uint64_t)1 << (bits))
#define BITS_MAX_VALUE(bits) (((uint64_t)1 << (bits)) - 1)
#define HLL_LOG2_REGISTER 14
#define HLL_LOG2_EXPLICIT_SIZE 10
#define HLL_LOG2_SPARSE_SIZE 12
#define HLL_DUPLICATE_CHECK 0
#define HLL_LOG2_REGISTER_MIN 10
#define HLL_LOG2_EXPLICIT_SIZE_MIN 0
#define HLL_LOG2_SPARSE_SIZE_MIN 0
#define HLL_DUPLICATE_CHECK_MIN 0
#define HLL_LOG2_REGISTER_MAX 16
#define HLL_LOG2_EXPLICIT_SIZE_MAX 12
#define HLL_LOG2_SPARSE_SIZE_MAX 14
#define HLL_DUPLICATE_CHECK_MAX 1
#define INT8_LEN 8
#define INT64_LEN 64
#define HLL_ALPHA_INF 0.721347520444481703680
#define HLL_EXPAND_SIZE 256
#define HLL_REGISTER_BITS 6
#define HLL_EXPLICIT_UINT 8
#define HLL_LONGZERO_UNIT 64
#define HLL_LOG2_LONGZERO_UNIT 6
#define HLL_LONGZERO_FULL 0x3F
#define HLL_ZERO_FULL 0x7F
#define BYTE_TYPE(byte) (RegisterType)((byte) >> HLL_REGISTER_BITS)
#define TYPE_BYTE(type) ((RegisterType)(type) << HLL_REGISTER_BITS)
#define SET_LONGZERO_VALUE(value) ((((value) >> HLL_LOG2_LONGZERO_UNIT) - 1) | TYPE_BYTE(SPARSE_LONGZERO))
#define SET_ZERO_VALUE(value) (((value)-1) | TYPE_BYTE(SPARSE_ZERO))
#define SET_DENSE_VALUE(value) (((value)-1) | TYPE_BYTE(DENSE_VALUE))
#define BYTE_GET_VALUE(byte) (((byte)&0x3F) + 1)
#define HLL_REGISTER_MAX_VALUE BITS_MAX_VALUE(HLL_REGISTER_BITS)
uint64_t HllHash(const void *key, const int len, const int seed);
struct HllPara {
uint8_t log2Registers;
uint8_t log2Explicitsize;
uint8_t log2Sparsesize;
uint8_t duplicateCheck;
};
void HllCheckPararange(const int32_t log2Registers, const int32_t log2Explicitsize, const int32_t log2Sparsesize,
const int32_t duplicateCheck);
void HllCheckParaequal(const HllPara hllpara1, const HllPara hllpara2);
void HllCopyPara(HllPara *hllpara1, const HllPara *hllpara2);
void HllParaSetDefault(HllPara *hllpara);
int32_t HllParaPack(const HllPara hllpara);
void HllParaUnpack(int32_t typmod, HllPara *hllpara);
* HllObjectData saves all the hll data
* we use uint8_t* to save hllData, lenUsed is the length of used hlldata, lenLeft=(LenTotal-lenUsed) is the length of
* left space. if lenLeft is not enouth, expand HLL_EXPAND_SIZE
*/
#define HLL_FLAG_LEN 3
#define HLL_HEAD_SIZE 27
#define HLL_PARA_BIT 4
#define HLL_LEN_BIT 17
struct HllObjectData {
char hllHead[HLL_FLAG_LEN];
HllType hlltype;
HllPara hllpara;
uint32_t lenTotal;
uint32_t lenUsed;
uint8_t *hllData;
double NDVEstimator;
uint64_t lastHash;
};
typedef HllObjectData *HllObject;
void HllObjectPack(const HllObject hllobject, uint8_t *hllbyte, const size_t hllsize);
void HllObjectUnpack(HllObject hllobject, const uint8_t *hllbyte, const size_t hllsize,
MemoryContext cxt);
class Hll : public BaseObject {
public:
Hll(MemoryContext context);
virtual ~Hll();
void HllInit();
void HllFree();
void HllEmpty();
void HllSetPara(const HllPara hllpara);
void HllAdd(const uint64_t hashval);
void HllUnion(Hll aHll);
double HllCardinality();
void HllObjectPack(uint8_t *hllbyte, const size_t hllsize);
void HllObjectUnpack(const uint8_t *hllbyte, const size_t hllsize);
char *HllPrint();
HllType HllGetType();
uint8_t HllRegisters();
uint8_t HllExplicitsize();
uint8_t HllSparsesize();
uint8_t HllDuplicateCheck();
uint32_t HllDatalen();
HllPara *HllGetPara();
private:
HllObject hllObject;
MemoryContext cxt;
uint8_t HllGetByte(const uint32_t index);
void HllSetByte(const uint32_t index, const uint8_t value);
void HllDataExpand();
void HllCopy(const Hll aHll);
void HllUpdateValue(const uint32_t index, const uint8_t *insertValue, const uint32_t insertLen,
const uint32_t originLen);
void HllHashToValue(const uint64_t hashval, uint32_t *index, uint8_t *value);
void HllSetRegisterValue(const uint64_t hashval);
void HllSetEmptyValue(const uint64_t hashval);
void HllSetExplicitValue(const uint64_t hashval);
void HllSetSparseValue(const uint64_t hashval);
void HllSetFullValue(const uint64_t hashval);
void HllTransformFull();
void HllTransformSparse();
void HllEmptyToSparse();
void HllExplicitToSparse();
uint32_t HllSetSparseZero(uint8_t *hlldata, const uint32_t zerocount);
uint64_t HllGetExplictitValue(const uint32_t index);
void HllExplicitCheck();
void HllGetRegisterUnit(const uint32_t registerIndex, uint32_t *byteIndex, uint32_t *unitIndex);
void HllUpdateSparseValue(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue);
void HllUpdateSparseType1(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue);
void HllUpdateSparseType2(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue);
void HllUpdateSparseType3(const uint32_t byteIndex, const uint32_t unitIndex, const uint8_t newValue);
void HllSetSparseByte(uint8_t *destByte, const uint32_t count, const uint8_t value);
void HllCombineFull(Hll aHll);
void HllComibineExplicit(Hll aHll);
void HllCalculateNDV();
void HllGetHistogram(uint32_t *histogram);
double HllGetTau(double x);
double HllGetSigma(double x);
};
#endif