*
* analyze.cpp
* the openGauss statistics generator
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2021, openGauss Contributors
*
*
* IDENTIFICATION
* src/gausskernel/optimizer/commands/analyze.cpp
*
* -------------------------------------------------------------------------
*/
#include "access/nbtree.h"
#include "postgres.h"
#include "knl/knl_variable.h"
#include <math.h>
#include "access/heapam.h"
#include "access/transam.h"
#include "access/tupconvert.h"
#include "access/tuptoaster.h"
#include "access/ustore/knl_upage.h"
#include "access/ustore/knl_uvisibility.h"
#include "access/visibilitymap.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_inherits_fn.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_statistic_lock.h"
#include "catalog/pg_statistic_history.h"
#include "catalog/pg_hashbucket_fn.h"
#include "catalog/namespace.h"
#include "catalog/storage_gtt.h"
#include "commands/dbcommands.h"
#include "commands/tablecmds.h"
#include "commands/tablespace.h"
#include "commands/vacuum.h"
#include "db4ai/db4ai_api.h"
#include "executor/executor.h"
#include "executor/spi.h"
#include "foreign/fdwapi.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/plannodes.h"
#include "nodes/print.h"
#include "parser/parse_oper.h"
#include "parser/parse_relation.h"
#include "parser/parse_type.h"
#include "pgstat.h"
#include "pgxc/groupmgr.h"
#include "postmaster/autovacuum.h"
#include "storage/cucache_mgr.h"
#include "storage/buf/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/aiomem.h"
#include "utils/attoptcache.h"
#include "utils/batchstore.h"
#include "utils/datum.h"
#include "utils/extended_statistics.h"
#include "utils/guc.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/snapmgr.h"
#include "utils/sortsupport.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "tcop/utility.h"
#include "tcop/dest.h"
#include "access/multixact.h"
#include "optimizer/aioptimizer.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
#endif
#ifdef ENABLE_MULTIPLE_NODES
#include "tsdb/utils/ts_relcache.h"
#endif
#if defined(ENABLE_UT) || defined(ENABLE_QUNIT)
#define static
#endif
typedef struct AnlIndexData {
IndexInfo* indexInfo;
double tupleFract;
VacAttrStats** vacattrstats;
int attr_cnt;
} AnlIndexData;
typedef struct AnlPrefetchList {
BlockNumber* blocklist;
uint32 anl_idx;
uint32 load_count;
uint32 size;
} AnlPrefetchList;
typedef struct AnlPrefetch {
AnlPrefetchList* blocklist;
AnlPrefetchList fetchlist1;
AnlPrefetchList fetchlist2;
bool init;
} AnlPrefetch;
THR_LOCAL int default_statistics_target = 100;
* Currently, sample average tuple number on each ORC file for DFS table analyze.
* In order to use eliminate algorithm, the parameter ANALYZE_FACTOR is given.
* When using the paramter, each ORC file will be scanned the twice average numbers on. In this
* way, the eliminate algorithm will be used.
* The value of parameter comes from tpcds500x and tpcds1000x test.
* The test result is the following:
* 1. 3 Physical machine(21dn)
*
* 500x(fixed analyze) 500x(svn6120 analyze)
* non-partition DFS table 4421.739s 4405.214s
* partition DFS table 7197s 10271s
*
* 2. 12 Physical machine(84dn)
* 1000x(fixed analyze) 1000x(svn6061 analyze)
* non-partition DFS table 4680s 6805s
*/
#define ESTIMATE_BLOCK_FACTOR 0.65
#define EPSILON 1.0E-06
#define ANALYZE_RELATIVE_ERROR ((double)(0.5))
#define MAX_ATTR_MCV_STAT_TARGET ((int)(100))
#define MAX_ATTR_HIST_STAT_TARGET ((int)(301))
#define MIN_MCV_COUNT 2
#define DEFAULT_ATTR_TARGET 100
#define DEFAULT_COLUMN_NUM 2
#define DEFAULT_SAMPLERATE 0.02
#define DEFAULT_EST_TARGET_ROWS 300
const char* const ANALYZE_TEMP_TABLE_PREFIX = "pg_analyze";
* If get sample rows on datanode or not. there are two cases:
* 1. sampleRate >= 0 for hash or roundrobin table;
* 2. sampleRate == 0 for replication table.
*/
#define NEED_GET_SAMPLE_ROWS_DN(vacstmt) \
(IS_PGXC_DATANODE && ((DISTTYPE_REPLICATION != (vacstmt)->disttype && \
(vacstmt)->pstGlobalStatEx[(vacstmt)->tableidx].sampleRate >= 0) || \
(DISTTYPE_REPLICATION == (vacstmt)->disttype && \
(vacstmt)->pstGlobalStatEx[(vacstmt)->tableidx].sampleRate == 0)))
#define NEED_DELETE_STATS(vacstmt, totalrows) \
((IS_PGXC_DATANODE && ((totalrows) == 0)) || (IS_PGXC_COORDINATOR && !IsConnFromCoord() && 0 == (totalrows) && \
!(vacstmt)->pstGlobalStatEx[(vacstmt)->tableidx].isReplication))
* We need do analyze to compute statistic for sample rows only on datanode or
* required sample rows on coordinator.
*/
#define NEED_ANALYZE_SAMPLEROWS(vacstmt) ((IS_PGXC_DATANODE && !IS_SINGLE_NODE) || !(vacstmt)->sampleTableRequired)
* We need do analyze to compute statistic for sample table only
* required sample table on coordinator.
*/
#define NEED_ANALYZE_SAMPLETABLE(vacstmt) \
(IS_PGXC_COORDINATOR && !IsConnFromCoord() && !(vacstmt)->pstGlobalStatEx[(vacstmt)->tableidx].isReplication)
#define NEED_DO_SECOND_ANALYZE(analyzemode, vacstmt) \
(IS_PGXC_DATANODE && IsConnFromCoord() && \
((ANALYZENORMAL == (analyzemode) && (vacstmt)->pstGlobalStatEx[(vacstmt)->tableidx].sampleRate > 0)))
#define NEED_SEND_TOTALROWCNT_TO_CN(analyzemode, vacstmt, totalrows) \
(ANALYZENORMAL == (analyzemode) && (totalrows) > 0)
static void do_analyze_rel(Relation onerel, VacuumStmt* vacstmt, BlockNumber relpages, bool inh, int elevel,
AnalyzeMode mode = ANALYZENORMAL);
int compare_rows(const void* a, const void* b);
static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData* indexdata, int nindexes,
HeapTuple* rows, int numrows, MemoryContext col_context);
static VacAttrStats* examine_attribute(Relation onerel, int attnum, Node* index_expr);
template <bool estimate_table_rownum>
static int64 acquire_sample_rows(
Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows, double* totaldeadrows);
template <bool estimate_table_rownum>
static int64 acquire_inherited_sample_rows(
Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows, double* totaldeadrows);
template <bool estimate_table_rownum>
static int64 acquirePartitionedSampleRows(Relation onerel, VacuumStmt* vacstmt, int elevel, HeapTuple* rows,
int64 targrows, double* totalrows, double* totaldeadrows, VacAttrStats** vacattrstats, int attrAnalyzeNum);
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool* isNull, Relation rel);
static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool* isNull, Relation rel);
static int get_one_tuplesize(Relation onerel, int total_width);
static int64 get_workmem_allow_rows(Relation onerel, int total_width);
template <bool estimate_table_rownum>
static int64 AcquireSampleCStoreRows(Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows,
double* totaldeadrows, VacAttrStats** vacattrstats, int attrAnalyzeNum);
static void send_totalrowcnt_to_cn(VacuumStmt* vacstmt, AnalyzeMode analyzemode, double totalrows);
static void do_sampling_normal_second(
Relation onerel, VacuumStmt* vacstmt, int64 numrows, double totalrows, HeapTuple* rows);
static bool equalTupleDescAttrsTypeid(TupleDesc tupdesc1, TupleDesc tupdesc2);
void CstoreAnalyzePrefetch(
CStoreScanDesc cstoreScan, BlockNumber* cuidList, int32 cuidListCount, int analyzeAttrNum, int32* attrSeq);
static void get_sample_rows_for_query(
MemoryContext speccontext, Relation rel, VacuumStmt* vacstmt, int64* num_sample_rows, HeapTuple** samplerows);
static ArrayType* es_construct_extended_statistics(VacAttrStats* stats, int k);
template <bool isSingleColumn>
static void update_stats_catalog(
Relation pgstat, MemoryContext oldcontext, Oid relid, char relkind, bool inh, VacAttrStats* stats, int natts,
char relpersistence);
static BlockNumber estimate_psort_index_blocks(TupleDesc desc, double totalTuples);
static BlockNumber estimate_btree_index_blocks(TupleDesc desc, double totalTuples, double table_factor = 1.0);
typedef struct {
MemoryContext memorycontext;
int64 num_sample;
HeapTuple* samplerows;
TupleDesc tupDesc;
} AnalyzeSampleRowsSpecInfo;
typedef struct {
MemoryContext memoryContext;
int num_cols;
int64 num_rows;
ArrayBuildState** output;
bool compute_histgram;
int64 non_mcv_num;
HistgramInfo hist_list;
TupleDesc spi_tupDesc;
} AnalyzeResultMultiColAsArraySpecInfo;
static bool do_analyze_samplerows(Relation onerel, VacuumStmt* vacstmt, int attr_cnt, VacAttrStats** vacattrstats,
bool hasindex, int nindexes, AnlIndexData* indexdata, bool inh, Relation* Irel, double* totalrows, int64* numrows,
HeapTuple* rows, AnalyzeMode analyzemode, MemoryContext caller_context,
Oid save_userid, int save_sec_context, int save_nestlevel);
static void do_analyze_sampletable(Relation onerel, VacuumStmt* vacstmt, int attr_cnt, VacAttrStats** vacattrstats,
bool hasindex, int nindexes, AnlIndexData* indexdata, bool inh, Relation* Irel, double totalrows, int64 numrows,
AnalyzeMode analyzemode);
static void do_analyze_compute_attr_stats(bool inh, Relation onerel, VacuumStmt* vacstmt, VacAttrStats* stats,
double numTotalRows, int64 numSampleRows, AnalyzeMode analyzemode);
static void set_stats_dndistinct(VacAttrStats* stats, VacuumStmt* vacstmt, int tableidx);
static void update_pages_and_tuples_pgclass(Relation onerel, VacuumStmt* vacstmt, int attr_cnt,
VacAttrStats** vacattrstats, bool hasindex, int nindexes, AnlIndexData* indexdata, Relation* Irel,
BlockNumber relpages, double totalrows, double totaldeadrows, int64 numrows, bool inh);
static bool analyze_index_sample_rows(MemoryContext speccontext, Relation rel, VacuumStmt* vacstmt, int nindexes,
AnlIndexData* indexdata, int64* num_sample_rows, HeapTuple** samplerows);
static void compute_histgram_final(int* slot_idx, HistgramInfo* hist_list, VacAttrStats* stats);
static void compute_histgram_internal(AnalyzeResultMultiColAsArraySpecInfo* spec);
static void analyze_rel_internal(Relation onerel, Oid grandparentOid, Oid parentOid, Oid tabOid,
VacuumStmt* vacstmt, BufferAccessStrategy bstrategy,
AnalyzeMode analyzemode = ANALYZENORMAL);
static void analyze_tmptbl_debug_dn(AnalyzeTempTblDebugStage type, AnalyzeMode analyzemode, List* tmpSampleTblNameList,
Oid* sampleTableOid, Relation* tmpRelation, HeapTuple tuple, TupleConversionMap* tupmap);
static TupleConversionMap* check_tmptbl_tupledesc(Relation onerel, TupleDesc temptbl_desc);
static double compute_com_size(VacuumStmt* vacstmt, Relation rel, int tableidx);
#ifndef ENABLE_MULTIPLE_NODES
static void update_numSamplerows(int attr_cnt, VacAttrStats** vacattrstats, HeapTuple* rows, int64 numrows);
#endif
* analyze_get_relation() -- get one relation by relid before do analyze.
* @in relid - the relation id for analyze or vacuum
* @in vacstmt - the statment for analyze or vacuum command
*/
Relation analyze_get_relation(Oid relid, VacuumStmt* vacstmt)
{
Relation onerel = NULL;
bool GetLock = false;
LOCKMODE lockmode = NEED_EST_TOTAL_ROWS_DN(vacstmt) ? AccessShareLock : ShareUpdateExclusiveLock;
* Check for user-requested abort.
*/
CHECK_FOR_INTERRUPTS();
* Open the relation, getting ShareUpdateExclusiveLock to ensure that two
* ANALYZEs don't run on it concurrently. (This also locks out a
* concurrent VACUUM, which doesn't matter much at the moment but might
* matter if we ever try to accumulate stats on dead tuples.) If the rel
* has been dropped since we last saw it, we don't need to process it.
*/
bool isTableOrPartition = vacuumRelation(vacstmt->flags) || vacuumMainPartition(vacstmt->flags)
|| vacuumPartition(vacstmt->flags);
if (isTableOrPartition && !(vacstmt->options & VACOPT_NOWAIT)) {
onerel = try_relation_open(relid, lockmode);
GetLock = true;
} else if (isTableOrPartition && ConditionalLockRelationOid(relid, lockmode)) {
onerel = try_relation_open(relid, NoLock);
GetLock = true;
}
if (!GetLock) {
onerel = NULL;
if (IsAutoVacuumWorkerProcess() && u_sess->attr.attr_storage.Log_autovacuum_min_duration >= 0)
ereport(LOG,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("skipping analyze of \"%s\" --- lock not available", vacstmt->relation->relname)));
}
return onerel;
}
* Description: The entry for analyze relation.
*
* Parameters:
* @in relid: relation oid
* @in vacstmt: the statment for analyze or vacuum command
* @in bstrategy: buffer access strategy objects
*
* Returns: void
*/
void analyze_rel(Oid relid, Oid grandparentOid, Oid parentOid, Oid tabOid,
VacuumStmt* vacstmt, BufferAccessStrategy bstrategy)
{
* try open the relation, we will skip the relation if try open failed.
* the relation will close in analyze_rel_internal().
*/
Relation onerel = analyze_get_relation(relid, vacstmt);
if (STMT_RETRY_ENABLED) {
} else if (onerel != NULL && onerel->rd_rel != NULL &&
onerel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && !validateTempNamespace(onerel->rd_rel->relnamespace)) {
relation_close(onerel, NEED_EST_TOTAL_ROWS_DN(vacstmt) ? AccessShareLock : ShareUpdateExclusiveLock);
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEMP_OBJECTS),
errmsg("Temp table's data is invalid because datanode %s restart. "
"Quit your session to clean invalid temp tables.",
g_instance.attr.attr_common.PGXCNodeName)));
}
* do analyze If have onerel, otherwise,we should CommitTransaction
* if it has StartTransaction already.
*/
if (onerel) {
analyze_rel_internal(onerel, grandparentOid, parentOid, tabOid, vacstmt, bstrategy, ANALYZENORMAL);
* Reset attcacheoff values in the tupleDesc of the relation
* to prevent ustore tuples from using the heap offset
* values calculated while computing stats.
*/
if (RelationIsUstoreFormat(onerel)) {
for (int i = 0; i < onerel->rd_att->natts; i++) {
if (onerel->rd_att->attrs[i].attcacheoff >= 0) {
onerel->rd_att->attrs[i].attcacheoff = -1;
}
}
}
}
}
* Description: catch error for update or delete pg_statistic concurrency.
*
* Parameters:
* @in relid: the relation oid for analyze or vacuum
* @in attnum: the column no for analyze concurrency updating
* @in oldcontext: old memory context saved before catch error data
* @in funcname: function name of caller
* Returns: void
*/
void analyze_concurrency_process(Oid relid, int16 attnum, MemoryContext oldcontext, const char* funcname)
{
ErrorData* edata = NULL;
(void)MemoryContextSwitchTo(oldcontext);
edata = CopyErrorData();
FlushErrorState();
if (edata->sqlerrcode == ERRCODE_T_R_SERIALIZATION_FAILURE) {
ereport(NOTICE,
(errcode(ERRCODE_SUCCESSFUL_COMPLETION),
errmsg("skipping \"%s.%s\" --- encounter concurrency analyze in function %s on %s: %s.",
get_rel_name(relid),
(attnum > 0) ? get_attname(relid, attnum) : "extended_stats",
funcname,
g_instance.attr.attr_common.PGXCNodeName,
edata->message),
handle_in_client(true)));
FreeErrorData(edata);
} else {
ReThrowError(edata);
}
}
BlockNumber GetSubPartitionNumberOfBlocks(Relation rel, Partition part, LOCKMODE lockmode)
{
Relation partRel = partitionGetRelation(rel, part);
List *subPartList = NIL;
ListCell *partCell = NULL;
BlockNumber relpages = 0;
subPartList = relationGetPartitionList(partRel, lockmode);
foreach (partCell, subPartList) {
Partition subPart = (Partition)lfirst(partCell);
relpages += PartitionGetNumberOfBlocks(partRel, subPart);
}
releasePartitionList(partRel, &subPartList, NoLock);
releaseDummyRelation(&partRel);
return relpages;
}
* Description: Analyze one relation
*
* Parameters:
* @in onerel: relation for analyze
* @in grandparentOid, parentOid, tabOid is object for analyze
* (1) if tabOid is subpartition, both grangparentOid and parentOid are valid
* (2) if tabOid is level-1 partition, grandparentOid is InvalidOid, and
* parentOid is valid
* (3) if tabOid is table, both grandparentOid and parentOid is InvalidOid
* @in vacstmt: the statment for analyze or vacuum command
* @in bstrategy: buffer access strategy objects
* @in analyzemode - identify which type the table, dfs table or delta table
*/
static void analyze_rel_internal(Relation onerel, Oid grandparentOid, Oid parentOid, Oid tabOid,
VacuumStmt* vacstmt, BufferAccessStrategy bstrategy, AnalyzeMode analyzemode)
{
AcquireSampleRowsFunc acquirefunc = NULL;
int elevel;
int messageLevel;
BlockNumber relpages = 0;
List* partList = NIL;
LOCKMODE lockmode = NEED_EST_TOTAL_ROWS_DN(vacstmt) ? AccessShareLock : ShareUpdateExclusiveLock;
AssertEreport(onerel, MOD_OPT, "onrel can not be NULL when doing analyze");
u_sess->analyze_cxt.vac_strategy = bstrategy;
messageLevel = WARNING;
elevel = DEBUG2;
if (vacstmt->options & VACOPT_VERBOSE) {
messageLevel = VERBOSEMESSAGE;
elevel = VERBOSEMESSAGE;
}
* Check permissions --- this should match vacuum's check!
*/
AclResult aclresult = pg_class_aclcheck(RelationGetPgClassOid(onerel, false), GetUserId(), ACL_VACUUM);
if (aclresult != ACLCHECK_OK && !(pg_class_ownercheck(RelationGetPgClassOid(onerel, false), GetUserId()) ||
(pg_database_ownercheck(u_sess->proc_cxt.MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared) ||
(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode))) {
if (!(vacstmt->options & VACOPT_VACUUM)) {
if (onerel->rd_rel->relisshared)
ereport(messageLevel,
(errmsg("skipping \"%s\" --- only system admin can analyze it", RelationGetRelationName(onerel))));
else if (onerel->rd_rel->relnamespace == PG_CATALOG_NAMESPACE)
ereport(messageLevel,
(errmsg("skipping \"%s\" --- only system admin or database owner can analyze it",
RelationGetRelationName(onerel))));
else
ereport(messageLevel,
(errmsg("skipping \"%s\" --- only table or database owner can analyze it",
RelationGetRelationName(onerel))));
}
relation_close(onerel, lockmode);
return;
}
* Silently ignore tables that are temp tables of other backends ---
* trying to analyze these is rather pointless, since their contents are
* probably not up-to-date on disk. (We don't throw a warning here; it
* would just lead to chatter during a database-wide ANALYZE.)
*/
if (RELATION_IS_OTHER_TEMP(onerel)) {
relation_close(onerel, lockmode);
return;
}
if (RELATION_IS_GLOBAL_TEMP(onerel) && !gtt_storage_attached(RelationGetRelid(onerel))) {
relation_close(onerel, ShareUpdateExclusiveLock);
return;
}
* We can ANALYZE any table except pg_statistic、pg_statistic_history、pg_statistic_lock. See update_attstats
*/
Oid relid = RelationGetRelid(onerel);
if (relid == StatisticRelationId || relid == StatisticHistoryRelationId || relid == StatisticLockRelationId) {
AssertEreport(RelationIsNonpartitioned(onerel), MOD_OPT, "pg_statistic can not be a partitioned table.");
if (!IsInitdb && !IS_SINGLE_NODE) {
elog(WARNING, "System catalog pg_statistic、pg_statistic_history、pg_statistic_lock "
"can not be analyzed, skip it.");
}
relation_close(onerel, lockmode);
return;
}
* Check that it's a plain table or foreign table; we used to do this in
* get_rel_oids() but seems safer to check after we've locked the
* relation.
*/
if (onerel->rd_rel->relkind == RELKIND_RELATION ||
onerel->rd_rel->relkind == RELKIND_MATVIEW) {
if (RelationIsPartitioned(onerel)) {
if (OidIsValid(grandparentOid)) {
Partition part = partitionOpen(onerel, parentOid, lockmode);
Relation partRel = partitionGetRelation(onerel, part);
Partition subpart = partitionOpen(partRel, tabOid, lockmode);
relpages = PartitionGetNumberOfBlocks(partRel, subpart);
vacstmt->parentpart = part;
vacstmt->onepartrel = partRel;
vacstmt->onepart = subpart;
vacstmt->issubpartition = true;
} else {
Partition part = NULL;
ListCell* partCell = NULL;
if (vacuumPartition(vacstmt->flags)) {
Oid partOid = tabOid;
part = partitionOpen(onerel, partOid, lockmode);
partList = list_make1(part);
} else {
partList = relationGetPartitionList(onerel, lockmode);
}
foreach (partCell, partList) {
part = (Partition)lfirst(partCell);
if (RelationIsSubPartitioned(onerel)) {
relpages += GetSubPartitionNumberOfBlocks(onerel, part, lockmode);
} else {
relpages += PartitionGetNumberOfBlocks(onerel, part);
}
}
}
} else {
relpages = RelationGetNumberOfBlocks(onerel);
}
} else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE
|| onerel->rd_rel->relkind == RELKIND_STREAM) {
* @hdfs
* For a foreign table, call FDW's hook function to see whether it
* supports analysis or not.
*/
bool retValue = false;
FdwRoutine* fdwroutine = GetFdwRoutineForRelation(onerel, false);
if (NULL != fdwroutine->AnalyzeForeignTable) {
if (isObsOrHdfsTableFormTblOid(RelationGetRelid(onerel)) ||
(IS_OBS_CSV_TXT_FOREIGN_TABLE(RelationGetRelid(onerel)) && !isWriteOnlyFt(RelationGetRelid(onerel)))) {
retValue = fdwroutine->AnalyzeForeignTable(
onerel, &acquirefunc, &relpages, (void*)vacstmt->HDFSDnWorkFlow, false);
} else {
retValue = fdwroutine->AnalyzeForeignTable(onerel, &acquirefunc, &relpages, 0, false);
}
if (!retValue) {
messageLevel = isMysqlFDWFromTblOid(RelationGetRelid(onerel)) ? LOG : messageLevel;
ereport(messageLevel,
(errmsg(
"Skipping \"%s\" --- cannot analyze this foreign table.", RelationGetRelationName(onerel))));
relation_close(onerel, lockmode);
return;
}
} else {
ereport(messageLevel,
(errmsg("Table %s doesn't support analysis operation.", RelationGetRelationName(onerel))));
relation_close(onerel, lockmode);
return;
}
} else {
if (!(vacstmt->options & VACOPT_VACUUM))
ereport(messageLevel,
(errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables",
RelationGetRelationName(onerel))));
if (RelationIsPartitioned(onerel)) {
releasePartitionList(onerel, &partList, lockmode);
}
relation_close(onerel, lockmode);
return;
}
* OK, let's do it. First let other backends know I'm in ANALYZE.
*/
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
t_thrd.pgxact->vacuumFlags |= PROC_IN_ANALYZE;
LWLockRelease(ProcArrayLock);
if (RelationIsPartitioned(onerel)) {
vacstmt->partList = partList;
}
* Do the normal non-recursive ANALYZE.
*/
do_analyze_rel(onerel, vacstmt, relpages, false, elevel, analyzemode);
* If there are child tables, do recursive ANALYZE.
*/
if (onerel->rd_rel->relhassubclass)
do_analyze_rel(onerel, vacstmt, relpages, true, elevel, analyzemode);
* Close source relation now, but keep lock so that no one deletes it
* before we commit. (If someone did, they'd fail to clean up the entries
* we made in pg_statistic. Also, releasing the lock before commit would
* expose us to concurrent-update failures in update_attstats.)
*/
if (RelationIsPartitioned(onerel)) {
releasePartitionList(onerel, &partList, NoLock);
if (OidIsValid(grandparentOid)) {
partitionClose(vacstmt->onepartrel, vacstmt->onepart, NoLock);
releaseDummyRelation(&vacstmt->onepartrel);
partitionClose(onerel, vacstmt->parentpart, NoLock);
}
}
relation_close(onerel, NoLock);
* Reset my PGXACT flag. Note: we need this here, and not in vacuum_rel,
* because the vacuum flag is cleared by the end-of-xact code.
*/
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
t_thrd.pgxact->vacuumFlags &= ~PROC_IN_ANALYZE;
LWLockRelease(ProcArrayLock);
}
bool whether_use_percent(VacAttrStats** vacattrstats, int attr_cnt, int nindexes, AnlIndexData* indexdata)
{
int i;
Form_pg_attribute attr;
if (default_statistics_target < 0)
return true;
for (i = 0; i < attr_cnt; i++) {
AssertEreport(vacattrstats[i] != NULL, MOD_OPT, "vacattrstats[i] can not be NULL");
attr = vacattrstats[i]->attrs[0];
if (attr->attstattarget < -1) {
return true;
}
}
for (int ind = 0; ind < nindexes; ind++) {
AssertEreport(indexdata != NULL, MOD_OPT, "indexdata[i] can not be NULL");
AnlIndexData* thisdata = &indexdata[ind];
for (i = 0; i < thisdata->attr_cnt; i++) {
attr = thisdata->vacattrstats[i]->attrs[0];
if (attr->attstattarget < -1) {
return true;
}
}
}
return false;
}
static void caculate_target_rows(
Form_pg_attribute attr, int64 totalrows, int64* target_rows, int64 workmem_allow_rows, bool use_percent)
{
int64 temp_rows = 0;
if (use_percent) {
if (attr->attstattarget < -1) {
attr->attstattarget = (attr->attstattarget + 1) * (-1);
temp_rows = totalrows * attr->attstattarget / 100;
} else if (attr->attstattarget < 0) {
if (default_statistics_target < 0) {
attr->attstattarget = default_statistics_target * (-1);
temp_rows = totalrows * attr->attstattarget / 100;
} else {
attr->attstattarget = default_statistics_target;
temp_rows = DEFAULT_EST_TARGET_ROWS * (int64)attr->attstattarget;
}
} else
temp_rows = DEFAULT_EST_TARGET_ROWS * (int64)attr->attstattarget;
*target_rows = rtl::max(*target_rows, temp_rows);
}
*target_rows = rtl::min(*target_rows, workmem_allow_rows);
attr->attstattarget = *target_rows / DEFAULT_EST_TARGET_ROWS;
if (attr->attstattarget < 1)
attr->attstattarget = 1;
if (attr->attstattarget > 10000)
attr->attstattarget = 10000;
}
static int get_total_width(
VacAttrStats** vacattrstats, int attr_cnt, int nindexes, AnlIndexData* indexdata, Relation rel)
{
int total_width1 = 0;
int total_width = 0;
int i = 0;
int32 item_width = 0;
bool isPartition = RelationIsPartition(rel);
Form_pg_attribute attr;
for (i = 0; i < attr_cnt; i++) {
for (unsigned int j = 0; j < vacattrstats[i]->num_attrs; ++j) {
attr = vacattrstats[i]->attrs[j];
item_width = get_attavgwidth(RelationGetRelid(rel), attr->attnum, isPartition);
if (item_width <= 0) {
item_width = get_typavgwidth(attr->atttypid, attr->atttypmod);
Assert(item_width > 0);
}
total_width += item_width;
}
}
for (int ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
total_width1 = 0;
for (i = 0; i < thisdata->attr_cnt; i++) {
for (unsigned int j = 0; j < thisdata->vacattrstats[i]->num_attrs; ++j) {
attr = thisdata->vacattrstats[i]->attrs[j];
item_width = get_attavgwidth(RelationGetRelid(rel), attr->attnum, isPartition);
if (item_width <= 0) {
item_width = get_typavgwidth(attr->atttypid, attr->atttypmod);
Assert(item_width > 0);
}
total_width1 += item_width;
}
}
if (total_width < total_width1) {
total_width = total_width1;
}
}
return total_width;
}
int64 get_target_rows(Relation onerel, VacAttrStats** vacattrstats, int attr_cnt, int nindexes, AnlIndexData* indexdata,
int64 totalrows, int64 targrows, bool use_percent)
{
int i = 0;
Form_pg_attribute attr;
int total_width = 0;
int64 target_rows = DEFAULT_SAMPLE_ROWCNT;
int64 workmem_allow_rows = 0;
total_width = get_total_width(vacattrstats, attr_cnt, nindexes, indexdata, onerel);
* Notice : get target rows control by work_mem is the temp method,
* we should resolve the problem in palloc each tuple dynamic later.
*/
workmem_allow_rows = get_workmem_allow_rows(onerel, total_width);
if (target_rows > workmem_allow_rows)
workmem_allow_rows = target_rows;
if (!use_percent)
target_rows = targrows;
for (i = 0; i < attr_cnt; i++) {
attr = vacattrstats[i]->attrs[0];
caculate_target_rows(attr, totalrows, &target_rows, workmem_allow_rows, use_percent);
}
for (int ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
for (i = 0; i < thisdata->attr_cnt; i++) {
attr = thisdata->vacattrstats[i]->attrs[0];
caculate_target_rows(attr, totalrows, &target_rows, workmem_allow_rows, use_percent);
}
}
return target_rows;
}
* Get real tuple size using to caculate workmem_allow_rows.
*/
static int64 get_workmem_allow_rows(Relation onerel, int total_width)
{
int64 workMem = u_sess->attr.attr_memory.work_mem * 1024L;
int64 workmem_allow_rows = 0;
int oneTupleSize = 0;
oneTupleSize = get_one_tuplesize(onerel, total_width);
workmem_allow_rows = workMem / oneTupleSize;
return workmem_allow_rows;
}
static int get_one_tuplesize(Relation onerel, int total_width)
{
int tupleMemSize = 0;
if (!RelationIsColStore(onerel) && RowRelationIsCompressed(onerel))
tupleMemSize = MaxHeapTupleSize + HEAPTUPLESIZE;
else
tupleMemSize = MAXALIGN(total_width) + HEAPTUPLESIZE;
return tupleMemSize;
}
* get_total_rows
* To get the total rows
*
* @param (in) estimate_table_rownum:
* Flag to mark wheather estimate sample row number or get sample rows
*
* @return:
* (estimate_table_rownum = true ) : NULL, estimated sample row number stored in "int64 *numrows"
* (estimate_table_rownum = false) : sample data tuples
*/
template <bool estimate_table_rownum>
HeapTuple* get_total_rows(Relation onerel, VacuumStmt* vacstmt, BlockNumber relpages, bool inh, int elevel,
VacAttrStats** vacattrstats, int attr_cnt, int64 targrows, double* totalrows, double* totaldeadrows,
int64* numrows)
{
#define DEFAULT_PARTITION_EST_TARGET_ROWS 3
HeapTuple* rows = NULL;
int64 target_rows = 0;
* IsForeignTable
*/
Oid foreignTableId = 0;
bool isForeignTable = false;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
int64 bufferRead = 0;
int64 bufferHit = 0;
if (onerel->pgstat_info != NULL) {
bufferRead = onerel->pgstat_info->t_counts.t_blocks_fetched;
bufferHit = onerel->pgstat_info->t_counts.t_blocks_hit;
}
if (vacstmt->relation != NULL) {
foreignTableId = RelationGetRelid(onerel);
isForeignTable = IsHDFSTableAnalyze(foreignTableId);
}
* Acquire the sample rows
*/
if (estimate_table_rownum) {
if (!(RELATION_IS_PARTITIONED(onerel) && !isForeignTable))
target_rows = DEFAULT_EST_TARGET_ROWS;
else {
* set target rows as the num of partition in order to
* be sure we can scan one block for every partition at least when the num of partition is more than 300.
*/
target_rows =
Max(DEFAULT_EST_TARGET_ROWS, list_length(vacstmt->partList) * DEFAULT_PARTITION_EST_TARGET_ROWS);
}
} else {
target_rows = targrows;
}
rows = (HeapTuple*)palloc0(target_rows * sizeof(HeapTuple));
if (inh) {
if (onerel->rd_rel->relhassubclass) {
*numrows = acquire_inherited_sample_rows<estimate_table_rownum>(
onerel, elevel, rows, target_rows, totalrows, totaldeadrows);
}
} else if (RELATION_IS_PARTITIONED(onerel) && !isForeignTable) {
*numrows = acquirePartitionedSampleRows<estimate_table_rownum>(
onerel, vacstmt, elevel, rows, target_rows, totalrows, totaldeadrows, vacattrstats, attr_cnt);
} else if (isForeignTable || onerel->rd_rel->relkind == RELKIND_STREAM ||
(onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE &&
(isOracleFDWFromTblOid(RelationGetRelid(onerel)) ||
#ifdef ENABLE_MOT
isMOTFromTblOid(RelationGetRelid(onerel)) ||
#endif
isPostgresFDWFromTblOid(RelationGetRelid(onerel))))) {
* @hdfs processing foreign table sampling operation
* get foreign table FDW routine
*/
FdwRoutine* tableFdwRoutine = GetFdwRoutineByRelId(foreignTableId);
* A foreign table supports sampling operation or not.
*/
if (NULL == tableFdwRoutine->AcquireSampleRows) {
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("Table %s does not support sampling operation for analyze.",
RelationGetRelationName(RelationIdGetRelation(foreignTableId)))));
} else {
if (isObsOrHdfsTableFormTblOid(foreignTableId) || IS_OBS_CSV_TXT_FOREIGN_TABLE(foreignTableId)) {
*numrows = (tableFdwRoutine->AcquireSampleRows)(onerel,
elevel,
rows,
target_rows,
totalrows,
totaldeadrows,
(void*)vacstmt->HDFSDnWorkFlow,
estimate_table_rownum);
} else {
*numrows = (tableFdwRoutine->AcquireSampleRows)(
onerel, elevel, rows, target_rows, totalrows, totaldeadrows, 0, estimate_table_rownum);
}
*totaldeadrows = 0;
}
} else {
if (RelationIsColStore(onerel)) {
if (RelationIsCUFormat(onerel)) {
*numrows = AcquireSampleCStoreRows<estimate_table_rownum>(
onerel, elevel, rows, target_rows, totalrows, totaldeadrows, vacattrstats, attr_cnt);
}
} else {
* for hash bucket table without range partitions
* we pretend it as a parition table and re-use the
* parition sampling function.
*/
if (RELATION_OWN_BUCKET(onerel))
*numrows = acquirePartitionedSampleRows<estimate_table_rownum>(
onerel, vacstmt, elevel, rows, target_rows, totalrows, totaldeadrows, vacattrstats, attr_cnt);
else
*numrows = acquire_sample_rows<estimate_table_rownum>(
onerel, elevel, rows, target_rows, totalrows, totaldeadrows);
}
}
if (onerel->pgstat_info != NULL) {
bufferRead = onerel->pgstat_info->t_counts.t_blocks_fetched - bufferRead;
bufferHit = onerel->pgstat_info->t_counts.t_blocks_hit - bufferHit;
}
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "%s get sample rows for table \"%s\". "
"totalrows: %ld, target_rows: %ld, numrows: %ld. Buffer: %ld hit/%ld read.",
(estimate_table_rownum ? "Estimate" : "Real"),
NameStr(onerel->rd_rel->relname),
targrows,
target_rows,
*numrows,
bufferHit,
bufferRead);
if (estimate_table_rownum) {
for (int i = 0; i < *numrows; i++) {
if (rows[i]) {
pfree_ext(rows[i]);
rows[i] = NULL;
}
}
pfree_ext(rows);
return NULL;
} else {
return rows;
}
}
static BlockNumber estimate_cstore_blocks(
Relation rel, VacAttrStats** vacAttrStats, int attrCnt, double sampleTuples, double totalTuples, bool dfsStore)
{
int tuple_width = 0;
BlockNumber total_pages = 0;
int i;
bool isPartition = RelationIsPartition(rel);
if (totalTuples <= 0) {
return 0;
}
for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++) {
Form_pg_attribute att = &rel->rd_att->attrs[i - 1];
int32 item_width = -1;
if (att->attisdropped)
continue;
item_width = get_typlen(att->atttypid);
if (item_width > 0) {
tuple_width += item_width;
continue;
}
if (sampleTuples > 0) {
bool found = false;
for (int attIdx = 0; attIdx < attrCnt; ++attIdx) {
VacAttrStats* stats = vacAttrStats[attIdx];
if (att->attnum == stats->tupattnum) {
item_width = stats->stawidth;
found = true;
break;
}
}
if (found) {
tuple_width += item_width;
continue;
}
}
if (item_width <= 0)
item_width = get_attavgwidth(RelationGetRelid(rel), i, isPartition);
if (item_width <= 0)
item_width = get_typavgwidth(att->atttypid, att->atttypmod);
Assert(item_width > 0);
tuple_width += item_width;
}
if (dfsStore)
total_pages = ceil((totalTuples * tuple_width * ESTIMATE_BLOCK_FACTOR) / BLCKSZ);
else
total_pages = ceil(totalTuples / RelDefaultFullCuSize) * (RelDefaultFullCuSize * tuple_width / BLCKSZ);
if (totalTuples > 0 && totalTuples < total_pages)
total_pages = totalTuples;
if (totalTuples > 0 && total_pages == 0)
total_pages = 1;
return total_pages;
}
#ifdef ENABLE_MULTIPLE_NODES
static BlockNumber estimate_tsstore_blocks(Relation rel, int attrCnt, double totalTuples)
{
int tuple_width = 0;
BlockNumber total_pages = 0;
int i;
const int text_width = 12;
if (0 >= totalTuples) {
return 0;
}
for (i = 1; i <= attrCnt; i++) {
Form_pg_attribute att = &rel->rd_att->attrs[i - 1];
int32 item_width = -1;
if (att->attisdropped)
continue;
item_width = get_typlen(att->atttypid);
if (item_width > 0) {
tuple_width += item_width;
continue;
}
if (att->atttypid == TEXTOID) {
tuple_width += text_width;
continue;
}
item_width = get_typavgwidth(att->atttypid, att->atttypmod);
Assert(item_width > 0);
tuple_width += item_width;
}
total_pages = ceil(totalTuples / RelDefaultFullCuSize) * (RelDefaultFullCuSize * tuple_width / BLCKSZ);
if (0 < totalTuples && totalTuples < total_pages)
total_pages = totalTuples;
if (0 < totalTuples && 0 >= total_pages)
total_pages = 1;
return total_pages;
}
#endif
* get_non_leaf_pages
* Given a perfect tree's leaf pages, estimate the number of non leaf pages.
*/
static double get_non_leaf_pages(double leafPages, int totalWidth, bool hasToast)
{
double nonLeafPages = 0.0;
double numOfBranches = hasToast ? (double)TOAST_TUPLES_PER_PAGE : \
(double)(BLCKSZ - SizeOfPageHeaderData) * 100.0 / (BTREE_NONLEAF_FILLFACTOR * (double)totalWidth);
if (numOfBranches <= 1) {
return 1.0;
}
while (leafPages >= 1) {
leafPages /= numOfBranches;
nonLeafPages += ceil(leafPages);
}
return nonLeafPages + 1.0;
}
static BlockNumber estimate_psort_index_blocks(TupleDesc desc, double totalTuples)
{
int totalWidth = 0;
int attrCnt = desc->natts;
FormData_pg_attribute* attrs = desc->attrs;
for (int attIdx = 0; attIdx < attrCnt; ++attIdx) {
Form_pg_attribute thisatt = &attrs[attIdx];
totalWidth += get_typavgwidth(thisatt->atttypid, thisatt->atttypmod);
}
return (ceil(totalTuples / RelDefaultFullCuSize) * (RelDefaultFullCuSize * totalWidth / BLCKSZ));
}
* estimate_btree_index_blocks
* Estimate number of blocks occupied by index.
* @in desc: tuple discriptor,
* @in totalTuples: total tuples,
* @in table_factor: the table expansion factor,
* @out totalPages: return number of blocks.
*/
static BlockNumber estimate_btree_index_blocks(TupleDesc desc, double totalTuples, double table_factor)
{
int totalWidth = 0;
int attrCnt = desc->natts;
FormData_pg_attribute* attrs = desc->attrs;
BlockNumber totalPages = 0;
bool hasToast = false;
for (int attIdx = 0; attIdx < attrCnt; ++attIdx) {
Form_pg_attribute thisatt = &attrs[attIdx];
totalWidth += get_typavgwidth(thisatt->atttypid, thisatt->atttypmod);
AssertEreport(totalWidth > 0,
MOD_OPT,
"The estimated average width of values of the type is not larger than 0"
"when setting the estimated output width of a base relation.");
}
if (!ENABLE_SQL_BETA_FEATURE(PAGE_EST_OPT)) {
return ceil(totalTuples / RelDefaultFullCuSize) * (RelDefaultFullCuSize * totalWidth / BLCKSZ);
}
if (totalWidth >= (int)TOAST_TUPLE_TARGET) {
hasToast = true;
}
* Assume we have only one big balanced ternary tree for index from all datanodes. We estimate its leaf pages by:
* Find a perfectly filled tree and divide it by its fill factor (default, 90).
* If the tree contains toast tuples, we use TOAST_TUPLES_PER_PAGE instead of totalWidth for estimation.
*/
double leafPages = hasToast ? ceil(totalTuples / (double)TOAST_TUPLES_PER_PAGE) : \
ceil(ceil((totalWidth * totalTuples) / BLCKSZ) * 100.0 / (double)BTREE_DEFAULT_FILLFACTOR);
double nonLeafPages = get_non_leaf_pages(leafPages, totalWidth, hasToast);
totalPages = ceil((leafPages + nonLeafPages * 100.0 / (double)BTREE_NONLEAF_FILLFACTOR) * table_factor);
ereport(DEBUG2,
(errmodule(MOD_OPT),
(errmsg("Estimating index blocks with sql_beta_feature = PAGE_EST_OPT(original: %u, optimized: %u).",
(unsigned int)(ceil(totalTuples / RelDefaultFullCuSize) * \
(RelDefaultFullCuSize * totalWidth / BLCKSZ)),
(unsigned int)totalPages))));
return totalPages;
}
* get_vacattrstats_by_vacstmt() -- set up statistic structure for all attributes or index by the statment for analyze
* using for comput and update pg_statistic
*
* Input Param:
* onerel: the relation for analyze or vacuum
* vacstmt: the statment for analyze or vacuum command
*
* Output Param:
* attr_cnt: the count of attributes for analyze
* nindexes: the num of index for analyze
* indexdata: per-index data for analyze
* hasindex: are there have index or not for analyze
* inh: Is inherit table for analzye or not
* Irel: malloc a new index relation for analyze
*/
static VacAttrStats** get_vacattrstats_by_vacstmt(Relation onerel, VacuumStmt* vacstmt, int* attr_cnt, int* nindexes,
AnlIndexData** indexdata, bool* hasindex, bool inh, Relation** Irel, bool isLog = true)
{
int tcnt, i, ind;
VacAttrStats** vacattrstats = NULL;
List* list_multi_column = NIL;
* Determine which columns to analyze
*
* Note that system attributes are never analyzed, so we just reject them
* at the lookup stage. We also reject duplicate column mentions. (We
* could alternatively ignore duplicates, but analyzing a column twice
* won't work; we'd end up making a conflicting update in pg_statistic.)
*/
if (vacstmt->va_cols != NIL) {
Bitmapset* bms_single_column = NULL;
es_get_attnum_of_statistic_items(onerel, vacstmt, &bms_single_column, &list_multi_column);
unsigned int num_of_statistic_items = bms_num_members(bms_single_column) + list_length(list_multi_column);
elog(ES_LOGLEVEL, "num_of_statistic_items = [%u]", num_of_statistic_items);
vacattrstats = (VacAttrStats**)palloc0(num_of_statistic_items * sizeof(VacAttrStats*));
tcnt = 0;
for (int attnum = -1; (attnum = bms_next_member(bms_single_column, attnum)) > 0;) {
vacattrstats[tcnt] = examine_attribute(onerel, attnum, NULL);
if (vacattrstats[tcnt] != NULL) {
++tcnt;
}
}
} else {
* If there are no stats explicitly assigned,
* we need to search system table to get pre-declared extended statistics
*/
*attr_cnt = onerel->rd_att->natts;
list_multi_column = es_explore_declared_stats(onerel->rd_id, vacstmt, inh);
vacattrstats = (VacAttrStats**)palloc((*attr_cnt + list_length(list_multi_column)) * sizeof(VacAttrStats*));
tcnt = 0;
for (i = 1; i <= *attr_cnt; i++) {
vacattrstats[tcnt] = examine_attribute(onerel, i, NULL);
if (vacattrstats[tcnt] != NULL)
tcnt++;
}
}
elog(ES_LOGLEVEL, "num of valid single column statistic = [%d]", tcnt);
* set up statistic structure for multiple column
* we only handle extended statistics in percent-sample mode
*/
if (default_statistics_target < 0 && list_multi_column != NIL) {
ListCell* lc = NULL;
foreach (lc, list_multi_column) {
Bitmapset* bms_multi_column = (Bitmapset*)lfirst(lc);
if (bms_num_members(bms_multi_column) < 2) {
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION),
errmsg("Multi-column statistic needs at least two columns.")));
}
vacattrstats[tcnt] = examine_attribute(onerel, bms_multi_column, isLog);
if (vacattrstats[tcnt] != NULL) {
++tcnt;
}
}
elog(ES_LOGLEVEL, "num of extended statistic = [%d]", list_length(list_multi_column));
}
*attr_cnt = tcnt;
* Open all indexes of the relation, and see if there are any analyzable
* columns in the indexes. We do not analyze index columns if there was
* an explicit column list in the ANALYZE command, however. If we are
* doing a recursive scan, we don't want to touch the parent's indexes at
* all.
*/
if (!inh && !NEED_EST_TOTAL_ROWS_DN(vacstmt)
* we can't collect index stats for commands: analyze subparented-table partition (p1)
* 1. obviously, we can't analyze global index since we are going to analyze a level-1 partition
* 2. for local index, unlike table partition, there is no concept of level-1 index-partition
* so we won't be able to update pg_partition/pg_statistic tables
*/
&& (!vacuumPartition(vacstmt->flags) || vacstmt->issubpartition || !RelationIsPartitioned(onerel))) {
vac_open_indexes(onerel, AccessShareLock, nindexes, Irel, vacuumPartition(vacstmt->flags));
} else {
*Irel = NULL;
*nindexes = 0;
}
*hasindex = (*nindexes > 0);
*indexdata = NULL;
if (*hasindex && *Irel != NULL) {
Relation* IdxRel = *Irel;
AnlIndexData* tmpindexdata = NULL;
tmpindexdata = *indexdata = (AnlIndexData*)palloc0(*nindexes * sizeof(AnlIndexData));
for (ind = 0; ind < *nindexes; ind++) {
AnlIndexData* thisdata = &tmpindexdata[ind];
IndexInfo* indexInfo = NULL;
thisdata->indexInfo = indexInfo = BuildIndexInfo(IdxRel[ind]);
thisdata->tupleFract = 1.0;
if (indexInfo->ii_Expressions != NIL && vacstmt->va_cols == NIL) {
ListCell* indexpr_item = list_head(indexInfo->ii_Expressions);
thisdata->vacattrstats = (VacAttrStats**)palloc(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats*));
tcnt = 0;
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) {
int keycol = indexInfo->ii_KeyAttrNumbers[i];
if (keycol == 0) {
Node* indexkey = NULL;
if (indexpr_item == NULL)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmodule(MOD_OPT),
errmsg("too few entries in indexprs list")));
indexkey = (Node*)lfirst(indexpr_item);
indexpr_item = lnext(indexpr_item);
thisdata->vacattrstats[tcnt] = examine_attribute(IdxRel[ind], i + 1, indexkey);
if (thisdata->vacattrstats[tcnt] != NULL)
tcnt++;
}
}
thisdata->attr_cnt = tcnt;
}
}
}
return vacattrstats;
}
* Set up a working context before analyze
*/
static MemoryContext do_analyze_preprocess(Oid relowner, PGRUsage* ru0, TimestampTz* starttime, Oid* save_userid,
int* save_sec_context, int* save_nestlevel, AnalyzeMode analyzemode)
{
MemoryContext caller_context = NULL;
* Set up a working context so that we can easily free whatever junk gets
* created.
*/
if (analyzemode == ANALYZENORMAL) {
u_sess->analyze_cxt.analyze_context = AllocSetContextCreate(CurrentMemoryContext,
"Analyze",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
}
caller_context = MemoryContextSwitchTo(u_sess->analyze_cxt.analyze_context);
* Switch to the table owner's userid, so that any index functions are run
* as that user. Also lock down security-restricted operations and
* arrange to make GUC variable changes local to this command.
*/
GetUserIdAndSecContext(save_userid, save_sec_context);
SetUserIdAndSecContext(relowner, (unsigned int)*save_sec_context | SECURITY_RESTRICTED_OPERATION);
*save_nestlevel = NewGUCNestLevel();
if (IsAutoVacuumWorkerProcess() && u_sess->attr.attr_storage.Log_autovacuum_min_duration >= 0) {
pg_rusage_init(ru0);
if (u_sess->attr.attr_storage.Log_autovacuum_min_duration > 0)
*starttime = GetCurrentTimestamp();
}
return caller_context;
}
* Delete working context after analyze
*/
static void do_analyze_finalize(
MemoryContext caller_context, Oid save_userid, int save_sec_context, int save_nestlevel, AnalyzeMode analyzemode)
{
AtEOXact_GUC(false, save_nestlevel);
SetUserIdAndSecContext(save_userid, save_sec_context);
(void)MemoryContextSwitchTo(caller_context);
if (analyzemode == ANALYZENORMAL) {
MemoryContextDelete(u_sess->analyze_cxt.analyze_context);
u_sess->analyze_cxt.analyze_context = NULL;
}
return;
}
static inline void cleanup_indexes(int nindexes, Relation* Irel, const Relation onerel, int elevel)
{
for (int ind = 0; ind < nindexes; ind++) {
IndexBulkDeleteResult* stats = NULL;
IndexVacuumInfo ivinfo;
ivinfo.index = Irel[ind];
ivinfo.analyze_only = true;
ivinfo.estimated_count = true;
ivinfo.message_level = elevel;
ivinfo.invisibleParts = NULL;
* if not handle the return value of index_vacuum_cleanup(),
* ivinfo.num_heap_tuples might be useless.
*/
ivinfo.num_heap_tuples = onerel->rd_rel->reltuples;
ivinfo.strategy = u_sess->analyze_cxt.vac_strategy;
if (RelationIsPartitioned(Irel[ind])) {
Partition indexPartition = NULL;
Relation indexPartitionRel = NULL;
ListCell* cell = NULL;
List* indexPartitionList = indexGetPartitionList(Irel[ind], AccessShareLock);
* It is import to understand the nindexes indexes might be of different type,
* btree, hash ,gin, gist etc.
* here, we decide to seperate partitioned index with a list of partition,
* transfer partition into dummy relation, replace index relation(ivinfo.index)
* then call of index_vacuum_cleanup.
* another way is to seperate, in the underground implementation function,
* for example hash index or gin index implementation,
* and those are ususally system inbuilt functions.
* obviousll, the latter one will spend much more effort
* as it must deal with every index ethod.
*/
foreach (cell, indexPartitionList) {
indexPartition = (Partition)lfirst(cell);
indexPartitionRel = partitionGetRelation(Irel[ind], indexPartition);
ivinfo.index = indexPartitionRel;
if (!RelationIsColStore(onerel))
stats = index_vacuum_cleanup(&ivinfo, NULL);
if (stats != NULL)
pfree_ext(stats);
releaseDummyRelation(&indexPartitionRel);
}
releasePartitionList(Irel[ind], &indexPartitionList, AccessShareLock);
} else {
if (!RelationIsColStore(onerel))
stats = index_vacuum_cleanup(&ivinfo, NULL);
if (stats != NULL)
pfree_ext(stats);
}
}
}
#ifndef ENABLE_MULTIPLE_NODES
static void update_numSamplerows(int attr_cnt, VacAttrStats** vacattrstats, HeapTuple* rows, int64 numrows)
{
if (u_sess->attr.attr_sql.enable_functional_dependency) {
for (int i = 0; i < attr_cnt; i++) {
vacattrstats[i]->rows = rows;
vacattrstats[i]->numSamplerows = numrows;
}
}
return;
}
#endif
static void do_analyze_rel_start_log(bool inh, int elevel, Relation onerel)
{
char* namespace_name = get_namespace_name(RelationGetNamespace(onerel));
if (inh)
ereport(elevel,
(errmodule(MOD_AUTOVAC),
errmsg("analyzing \"%s.%s\" inheritance tree",
namespace_name,
RelationGetRelationName(onerel))));
else
ereport(elevel,
(errmodule(MOD_AUTOVAC),
errmsg("analyzing \"%s.%s\"",
namespace_name,
RelationGetRelationName(onerel))));
pfree_ext(namespace_name);
}
static int64 do_analyze_calculate_sample_target_rows(int attr_cnt, VacAttrStats** vacattrstats, int nindexes,
AnlIndexData* indexdata)
{
int64 targrows = 100;
for (int i = 0; i < attr_cnt; i++) {
if (targrows < vacattrstats[i]->minrows)
targrows = vacattrstats[i]->minrows;
}
for (int ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
for (int i = 0; i < thisdata->attr_cnt; i++) {
if (targrows < thisdata->vacattrstats[i]->minrows)
targrows = thisdata->vacattrstats[i]->minrows;
}
}
return targrows;
}
static void do_analyze_rel_end_log(TimestampTz starttime, Relation onerel, PGRUsage* ru0)
{
if (IsAutoVacuumWorkerProcess() && u_sess->attr.attr_storage.Log_autovacuum_min_duration >= 0) {
if (u_sess->attr.attr_storage.Log_autovacuum_min_duration == 0 ||
TimestampDifferenceExceeds(
starttime, GetCurrentTimestamp(), u_sess->attr.attr_storage.Log_autovacuum_min_duration)) {
char* dbname = get_and_check_db_name(u_sess->proc_cxt.MyDatabaseId);
char* namespace_name = get_namespace_name(RelationGetNamespace(onerel));
ereport(LOG,
(errmsg("automatic analyze of table \"%s.%s.%s\" system usage: %s",
dbname,
namespace_name,
RelationGetRelationName(onerel),
pg_rusage_show(ru0))));
pfree_ext(dbname);
pfree_ext(namespace_name);
}
}
}
* do_analyze_rel() -- analyze one relation, recursively or not
*
* Note that "acquirefunc" is only relevant for the non-inherited case.
* If we supported foreign tables in inheritance trees,
* acquire_inherited_sample_rows would need to determine the appropriate
* acquirefunc for each child table.
*/
static void do_analyze_rel(Relation onerel, VacuumStmt* vacstmt, BlockNumber relpages, bool inh, int elevel,
AnalyzeMode analyzemode)
{
int attr_cnt = 0;
int i = 0;
Relation* Irel = NULL;
int nindexes = 0;
bool hasindex = false;
VacAttrStats** vacattrstats = NULL;
AnlIndexData* indexdata = NULL;
int64 numrows = 0;
int64 targrows = 0;
double totalrows = 0;
double totaldeadrows = 0;
HeapTuple* rows = NULL;
PGRUsage ru0;
TimestampTz starttime = 0;
MemoryContext caller_context = NULL;
Oid save_userid = 0;
int save_sec_context = 0;
int save_nestlevel = 0;
int tableidx = (analyzemode == ANALYZENORMAL) ? analyzemode : (analyzemode - 1);
vacstmt->tableidx = tableidx;
* (1) For replicate table
* We do not support percent-sample mode for replicate table,
* and we need to use this mode in extended statistics,
* so, we let datanode use percent-sample mode when extended statistic is detected,
* and then, coordinate node could fetch statistic from datanode.
*
* (2) check availability for extended statistics
*/
bool replicate_needs_extstats = false;
es_check_availability_for_table(vacstmt, onerel, inh, &replicate_needs_extstats);
do_analyze_rel_start_log(inh, elevel, onerel);
caller_context = do_analyze_preprocess(onerel->rd_rel->relowner,
&ru0,
&starttime,
&save_userid,
&save_sec_context,
&save_nestlevel,
analyzemode);
vacattrstats =
get_vacattrstats_by_vacstmt(onerel, vacstmt, &attr_cnt, &nindexes, &indexdata, &hasindex, inh, &Irel);
* Stop analyze progress if there are no VacAttrStats instance initialized.
* This happened on collect extended statistic using 'analyze t ((a, b))'
* when set default_statistics_target to a positive number.
*/
if (attr_cnt <= 0) {
vac_close_indexes(nindexes, Irel, NoLock);
do_analyze_finalize(caller_context, save_userid, save_sec_context, save_nestlevel, analyzemode);
if (analyzemode <= ANALYZENORMAL) {
if (default_statistics_target >= 0)
elog(INFO, "Please set default_statistics_target to a negative value to collect extended statistics.");
else
elog(INFO, "No columns in %s can be used to collect statistics.", NameStr(onerel->rd_rel->relname));
}
return;
}
* Determine how many rows we need to sample, using the worst case from
* all analyzable columns. We use a lower bound of 100 rows to avoid
* possible overflow in Vitter's algorithm. (Note: that will also be the
* target in the corner case where there are no analyzable columns.)
*/
targrows = do_analyze_calculate_sample_target_rows(attr_cnt, vacattrstats, nindexes, indexdata);
* If get sample rows on datanode or coordinator or not. there are two cases:
* 1. sampleRate > 0 for hash or roundrobin table on datanode;
* 2. sampleRate == 0 for replication table on datanode;
* 3. for system table on all nodes.
*/
if (onerel->rd_id < FirstNormalObjectId || NEED_GET_SAMPLE_ROWS_DN(vacstmt)) {
* CN has finish caculate sampleRate and DN get total rows and sample.
* if sampleRate more or equal 0, it means CN should get sample rows from DN.
*/
if (NEED_GET_SAMPLE_ROWS_DN(vacstmt)) {
bool use_percent = whether_use_percent(vacattrstats, attr_cnt, nindexes, indexdata);
if (use_percent) {
rows = get_total_rows<true>(onerel,
vacstmt,
relpages,
inh,
elevel,
vacattrstats,
attr_cnt,
targrows,
&totalrows,
&totaldeadrows,
&numrows);
vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts = totalrows;
}
targrows =
get_target_rows(onerel, vacattrstats, attr_cnt, nindexes, indexdata, totalrows, targrows, use_percent);
}
* Get target rows only for system table or
* sample rate more than 1 on all datanodes.
*/
rows = get_total_rows<false>(onerel,
vacstmt,
relpages,
inh,
elevel,
vacattrstats,
attr_cnt,
targrows,
&totalrows,
&totaldeadrows,
&numrows);
}
* If sampleRate is -1, it means DN send estimate total row count to CN.
*/
if (NEED_EST_TOTAL_ROWS_DN(vacstmt)) {
rows = get_total_rows<true>(onerel,
vacstmt,
relpages,
inh,
elevel,
vacattrstats,
attr_cnt,
targrows,
&totalrows,
&totaldeadrows,
&numrows);
vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts = totalrows;
vacstmt->pstGlobalStatEx[vacstmt->tableidx].topMemSize =
compute_com_size(vacstmt, onerel, vacstmt->tableidx) * GetOneTupleSize(vacstmt, onerel) / 1024;
if (analyzemode == ANALYZENORMAL) {
send_totalrowcnt_to_cn(vacstmt, analyzemode, totalrows);
}
if (!inh) {
vac_close_indexes(nindexes, Irel, NoLock);
}
do_analyze_finalize(caller_context, save_userid, save_sec_context, save_nestlevel, analyzemode);
return;
}
* We need do analyze to compute statistic for sample rows only on datanode or
* required sample rows on coordinator.
*
* collect extended statistic for replicate table will use 'sampletable' method in data-node
*/
if (NEED_ANALYZE_SAMPLEROWS(vacstmt) && (!replicate_needs_extstats)) {
* Compute the statistics. Temporary results during the calculations for
* each column are stored in a child context. The calc routines are
* responsible to make sure that whatever they store into the VacAttrStats
* structure is allocated in u_sess->analyze_cxt.analyze_context.
*/
bool ret = do_analyze_samplerows(onerel,
vacstmt,
attr_cnt,
vacattrstats,
hasindex,
nindexes,
indexdata,
inh,
Irel,
&totalrows,
&numrows,
rows,
analyzemode,
caller_context,
save_userid,
save_sec_context,
save_nestlevel);
* If the relation's attribute have modified in local under analyzing,
* we shoule return, because caller_context has finalize.
*/
if (!ret) {
return;
}
} else if (NEED_ANALYZE_SAMPLETABLE(vacstmt) || replicate_needs_extstats) {
if (vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts > 0) {
* There is a concurrency condition:
* coordinator received estimate totalRowCnts from all datanodes then estimate sample rate
* for the first step. And there is insert or delete rows concurrency before coordinator
* received real totalRowCnts from all datanodes.
* The sampleRate is not match with the final totalRowCnts, it will result to compute
* error samplerows and error statistics. So we should compute right sampleRate again.
*/
#ifndef ENABLE_MULTIPLE_NODES
update_numSamplerows(attr_cnt, vacattrstats, rows, numrows);
#endif
(void)compute_sample_size(vacstmt, 0, NULL, onerel->rd_id, vacstmt->tableidx);
numrows = ceil(vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts *
vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate);
totalrows = vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts;
* We need do analyze to compute statistic for sample table only
* required sample table on coordinator.
*
* We only collect extended statistics for replicate table when it is not empty
*/
if (vacstmt->pstGlobalStatEx[vacstmt->tableidx].exec_query ||
(replicate_needs_extstats && numrows > 0 && totalrows > 0)) {
vacstmt->pstGlobalStatEx[vacstmt->tableidx].exec_query = true;
do_analyze_sampletable(onerel,
vacstmt,
attr_cnt,
vacattrstats,
hasindex,
nindexes,
indexdata,
inh,
Irel,
totalrows,
numrows,
analyzemode);
}
} else {
* Supported replicate table uses query to collect statistics,
* set 'exec_query' to mark it's analyze mode even it is empty
*/
if (replicate_needs_extstats) {
vacstmt->pstGlobalStatEx[vacstmt->tableidx].exec_query = true;
}
for (i = 0; i < attr_cnt; ++i) {
VacAttrStats* stats = vacattrstats[i];
if (stats->num_attrs > 1) {
stats->stats_valid = true;
update_attstats(RelationGetRelid(onerel), STARELKIND_CLASS, inh, 1, &stats,
RelationGetRelPersistence(onerel));
}
}
}
}
if (!inh) {
update_pages_and_tuples_pgclass(onerel,
vacstmt,
attr_cnt,
vacattrstats,
hasindex,
nindexes,
indexdata,
Irel,
relpages,
totalrows,
totaldeadrows,
numrows,
inh);
* Report ANALYZE to the stats collector, too. However, if doing
* inherited stats we shouldn't report, because the stats collector only
* tracks per-table stats.
*/
pgstat_report_analyze(onerel, totalrows, totaldeadrows);
}
if (!(vacstmt->options & VACOPT_VACUUM)) {
cleanup_indexes(nindexes, Irel, onerel, elevel);
}
vac_close_indexes(nindexes, Irel, NoLock);
do_analyze_rel_end_log(starttime, onerel, &ru0);
do_analyze_finalize(caller_context, save_userid, save_sec_context, save_nestlevel, analyzemode);
return;
}
* Compute statistics about indexes of a relation
*/
static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData* indexdata, int nindexes,
HeapTuple* rows, int numrows, MemoryContext col_context)
{
MemoryContext ind_context, old_context;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
int ind, i;
ind_context = AllocSetContextCreate(u_sess->analyze_cxt.analyze_context,
"Analyze Index",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
old_context = MemoryContextSwitchTo(ind_context);
for (ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
IndexInfo* indexInfo = thisdata->indexInfo;
int attr_cnt = thisdata->attr_cnt;
TupleTableSlot* slot = NULL;
EState* estate = NULL;
ExprContext* econtext = NULL;
List* predicate = NIL;
Datum* exprvals = NULL;
bool* exprnulls = NULL;
int numindexrows, tcnt, rowno;
double totalindexrows;
if (attr_cnt == 0 && indexInfo->ii_Predicate == NIL)
continue;
* Need an EState for evaluation of index expressions and
* partial-index predicates. Create it in the per-index context to be
* sure it gets cleaned up at the bottom of the loop.
*/
estate = CreateExecutorState();
econtext = GetPerTupleExprContext(estate);
slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel));
econtext->ecxt_scantuple = slot;
if (estate->es_is_flt_frame){
predicate = (List*)ExecPrepareQualByFlatten(indexInfo->ii_Predicate, estate);
} else {
predicate = (List*)ExecPrepareExpr((Expr*)indexInfo->ii_Predicate, estate);
}
exprvals = (Datum*)palloc(numrows * attr_cnt * sizeof(Datum));
exprnulls = (bool*)palloc(numrows * attr_cnt * sizeof(bool));
numindexrows = 0;
tcnt = 0;
for (rowno = 0; rowno < numrows; rowno++) {
HeapTuple heapTuple = rows[rowno];
* Reset the per-tuple context each time, to reclaim any cruft
* left behind by evaluating the predicate or index expressions.
*/
ResetExprContext(econtext);
(void)ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
if (predicate != NIL) {
if (!ExecQual(predicate, econtext))
continue;
}
numindexrows++;
if (attr_cnt > 0) {
* Evaluate the index row to compute expression values. We
* could do this by hand, but FormIndexDatum is convenient.
*/
FormIndexDatum(indexInfo, slot, estate, values, isnull);
* Save just the columns we care about. We copy the values
* into ind_context from the estate's per-tuple context.
*/
for (i = 0; i < attr_cnt; i++) {
VacAttrStats* stats = thisdata->vacattrstats[i];
int attnum = stats->attrs[0]->attnum;
if (isnull[attnum - 1]) {
exprvals[tcnt] = (Datum)0;
exprnulls[tcnt] = true;
} else {
exprvals[tcnt] =
datumCopy(values[attnum - 1], stats->attrtype[0]->typbyval, stats->attrtype[0]->typlen);
exprnulls[tcnt] = false;
}
tcnt++;
}
}
}
* Having counted the number of rows that pass the predicate in the
* sample, we can estimate the total number of rows in the index.
*/
thisdata->tupleFract = (double)numindexrows / (double)numrows;
totalindexrows = ceil(thisdata->tupleFract * totalrows);
* Now we can compute the statistics for the expression columns.
*/
if (numindexrows > 0) {
(void)MemoryContextSwitchTo(col_context);
for (i = 0; i < attr_cnt; i++) {
VacAttrStats* stats = thisdata->vacattrstats[i];
AttributeOpts* aopt = get_attribute_options(stats->attrs[0]->attrelid, stats->attrs[0]->attnum);
stats->exprvals = exprvals + i;
stats->exprnulls = exprnulls + i;
stats->rowstride = attr_cnt;
(*stats->compute_stats)(stats, ind_fetch_func, numindexrows, totalindexrows, onerel);
* If the n_distinct option is specified, it overrides the
* above computation. For indices, we always use just
* n_distinct, not n_distinct_inherited.
*/
if (aopt != NULL && fabs(aopt->n_distinct) > EPSILON)
stats->stadistinct = aopt->n_distinct;
MemoryContextResetAndDeleteChildren(col_context);
}
}
(void)MemoryContextSwitchTo(ind_context);
ExecDropSingleTupleTableSlot(slot);
FreeExecutorState(estate);
MemoryContextResetAndDeleteChildren(ind_context);
}
(void)MemoryContextSwitchTo(old_context);
MemoryContextDelete(ind_context);
}
* examine_attribute
* construct VacAttrStats for multi-column stats
*/
VacAttrStats* examine_attribute(Relation onerel, Bitmapset* bms_attnums, bool isLog)
{
for (int attnum = -1; (attnum = bms_next_member(bms_attnums, attnum)) > 0;) {
Form_pg_attribute attr = &onerel->rd_att->attrs[attnum - 1];
if (!es_is_valid_column_to_analyze(attr)) {
return NULL;
}
if (!GetDefaultOpClass(attr->atttypid, BTREE_AM_OID) && !GetDefaultOpClass(attr->atttypid, HASH_AM_OID)) {
if (isLog) {
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Column %s with type %s is unanalyzable.",
attr->attname.data,
format_type_be(attr->atttypid))));
}
return NULL;
}
}
unsigned int num_attrs = bms_num_members(bms_attnums);
VacAttrStats* stats = es_make_vacattrstats(num_attrs);
int index = 0;
for (int attnum = -1; (attnum = bms_next_member(bms_attnums, attnum)) > 0;) {
Form_pg_attribute attr = &onerel->rd_att->attrs[attnum - 1];
stats->attrs[index] = (Form_pg_attribute)palloc0(ATTRIBUTE_FIXED_PART_SIZE);
errno_t reterrno = memcpy_s(stats->attrs[index], ATTRIBUTE_FIXED_PART_SIZE, attr, ATTRIBUTE_FIXED_PART_SIZE);
securec_check(reterrno, "\0", "\0");
stats->attrtypid[index] = attr->atttypid;
stats->attrtypmod[index] = attr->atttypmod;
stats->attrtype[index] = es_get_attrtype(attr->atttypid);
++index;
}
stats->anl_context = u_sess->analyze_cxt.analyze_context;
stats->tupattnum = stats->attrs[0]->attnum;
return stats;
}
* examine_attribute -- pre-analysis of a single column
*
* Determine whether the column is analyzable; if so, create and initialize
* a VacAttrStats struct for it. If not, return NULL.
*
* If index_expr isn't NULL, then we're trying to analyze an expression index,
* and index_expr is the expression tree representing the column's data.
*/
static VacAttrStats* examine_attribute(Relation onerel, int attnum, Node* index_expr)
{
HeapTuple typtuple;
int i;
bool ok = false;
Form_pg_attribute attr = &onerel->rd_att->attrs[attnum - 1];
if (!es_is_valid_column_to_analyze(attr)) {
return NULL;
}
* Create the VacAttrStats struct. Note that we only have a copy of the
* fixed fields of the pg_attribute tuple.
*/
VacAttrStats* stats = es_make_vacattrstats(1);
stats->attrs[0] = (Form_pg_attribute)palloc0(ATTRIBUTE_FIXED_PART_SIZE);
errno_t reterrno = memcpy_s(stats->attrs[0], ATTRIBUTE_FIXED_PART_SIZE, attr, ATTRIBUTE_FIXED_PART_SIZE);
securec_check(reterrno, "\0", "\0");
* When analyzing an expression index, believe the expression tree's type
* not the column datatype --- the latter might be the opckeytype storage
* type of the opclass, which is not interesting for our purposes. (Note:
* if we did anything with non-expression index columns, we'd need to
* figure out where to get the correct type info from, but for now that's
* not a problem.) It's not clear whether anyone will care about the
* typmod, but we store that too just in case.
*/
if (index_expr != NULL) {
stats->attrtypid[0] = exprType(index_expr);
stats->attrtypmod[0] = exprTypmod(index_expr);
} else {
stats->attrtypid[0] = attr->atttypid;
stats->attrtypmod[0] = attr->atttypmod;
}
typtuple = SearchSysCacheCopy1(TYPEOID, ObjectIdGetDatum(stats->attrtypid[0]));
if (!HeapTupleIsValid(typtuple))
ereport(ERROR,
(errcode(ERRCODE_CACHE_LOOKUP_FAILED), errmsg("cache lookup failed for type %u", stats->attrtypid[0])));
stats->attrtype[0] = (Form_pg_type)GETSTRUCT(typtuple);
stats->anl_context = u_sess->analyze_cxt.analyze_context;
stats->tupattnum = attnum;
* The fields describing the stats->stavalues[n] element types default to
* the type of the data being analyzed, but the type-specific typanalyze
* function can change them if it wants to store something else.
*/
for (i = 0; i < STATISTIC_NUM_SLOTS; i++) {
stats->statypid[i] = stats->attrtypid[0];
stats->statyplen[i] = stats->attrtype[0]->typlen;
stats->statypbyval[i] = stats->attrtype[0]->typbyval;
stats->statypalign[i] = stats->attrtype[0]->typalign;
}
* Call the type-specific typanalyze function. If none is specified, use
* std_typanalyze().
*/
if (OidIsValid(stats->attrtype[0]->typanalyze)) {
ok = DatumGetBool(OidFunctionCall1(stats->attrtype[0]->typanalyze, PointerGetDatum(stats)));
* we will modify minrows for type-specific typanalyze function as traditional
* analyze mode, since we fallback to analyze this column with traditional analyze
* just see comment in function do_analyze_sampletable
*
* 1. OK = 'true' means we can do specific typanalyze
* 2. default_statistics_target < 0 means just for analyze with samptable mode
* 3. stats->minrows < 0 means rows wanted for stats
*/
if (ok && default_statistics_target < 0 && stats->minrows < 0)
stats->minrows = DEFAULT_EST_TARGET_ROWS * compute_attr_target(stats->attrs[0]);
} else
ok = std_typanalyze(stats);
if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) {
tableam_tops_free_tuple(typtuple);
pfree_ext(stats->attrs[0]);
pfree_ext(stats->attrs);
pfree_ext(stats);
return NULL;
}
return stats;
}
* BlockSampler_Init -- prepare for random sampling of blocknumbers
*
* BlockSampler is used for stage one of our new two-stage tuple
* sampling mechanism as discussed on pgsql-hackers 2004-04-02 (subject
* "Large DB"). It selects a random sample of samplesize blocks out of
* the nblocks blocks in the table. If the table has less than
* samplesize blocks, all blocks are selected.
*
* Since we know the total number of blocks in advance, we can use the
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
* algorithm.
*/
void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
{
bs->N = nblocks;
* If we decide to reduce samplesize for tables that have less or not much
* more than samplesize blocks, here is the place to do it.
*/
bs->n = samplesize;
bs->t = 0;
bs->m = 0;
}
bool BlockSampler_HasMore(BlockSampler bs)
{
return (bs->t < bs->N) && (bs->m < bs->n);
}
BlockNumber BlockSampler_Next(BlockSampler bs)
{
BlockNumber K = bs->N - bs->t;
int k = bs->n - bs->m;
double p;
double V;
AssertEreport(BlockSampler_HasMore(bs), MOD_OPT, "");
if ((BlockNumber)k >= K) {
bs->m++;
return bs->t++;
}
* It is not obvious that this code matches Knuth's Algorithm S.
* Knuth says to skip the current block with probability 1 - k/K.
* If we are to skip, we should advance t (hence decrease K), and
* repeat the same probabilistic test for the next block. The naive
* implementation thus requires an anl_random_fract() call for each block
* number. But we can reduce this to one anl_random_fract() call per
* selected block, by noting that each time the while-test succeeds,
* we can reinterpret V as a uniform random number in the range 0 to p.
* Therefore, instead of choosing a new V, we just adjust p to be
* the appropriate fraction of its former value, and our next loop
* makes the appropriate probabilistic test.
*
* We have initially K > k > 0. If the loop reduces K to equal k,
* the next while-test must fail since p will become exactly zero
* (we assume there will not be roundoff error in the division).
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
* to be doubly sure about roundoff error.) Therefore K cannot become
* less than k, which means that we cannot fail to select enough blocks.
* ----------
*/
V = anl_random_fract();
p = 1.0 - (double)k / (double)K;
while (V < p) {
bs->t++;
K--;
p *= 1.0 - (double)k / (double)K;
}
bs->m++;
return bs->t++;
}
* get list block number for acquire sample rows when uses ADIO
*/
uint32 BlockSampler_NextList(BlockSampler bs, AnlPrefetchList* list)
{
uint32 idx = 0;
while (BlockSampler_HasMore(bs)) {
if (idx >= list->size) {
break;
}
list->blocklist[idx] = BlockSampler_Next(bs);
idx++;
}
return idx;
}
template <bool isColumnStore>
BlockNumber BlockSampler_NextBlock(
void* rel_or_cuDesc, BlockSampler bs, AnlPrefetch* anlprefetch, int analyzeAttrNum = 0, int32* attrSeq = NULL)
{
uint32 blocks = 0;
BlockNumber blo = 0;
Relation onerel = (Relation)rel_or_cuDesc;
CStoreScanDesc cstoreScan = (CStoreScanDesc)rel_or_cuDesc;
if (!anlprefetch->init) {
blocks = BlockSampler_NextList(bs, &anlprefetch->fetchlist1);
if (blocks != 0) {
if (isColumnStore == false) {
PageListPrefetch(onerel, MAIN_FORKNUM, anlprefetch->fetchlist1.blocklist, blocks, 0, 0);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(onerel),
anlprefetch->fetchlist1.blocklist[0],
blocks)));
} else {
CstoreAnalyzePrefetch(cstoreScan, anlprefetch->fetchlist1.blocklist, blocks, analyzeAttrNum, attrSeq);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(cstoreScan->m_CStore->m_relation),
anlprefetch->fetchlist1.blocklist[0],
blocks)));
}
}
anlprefetch->fetchlist1.anl_idx = 0;
anlprefetch->fetchlist1.load_count = blocks;
blocks = BlockSampler_NextList(bs, &anlprefetch->fetchlist2);
if (blocks != 0) {
if (isColumnStore == false) {
PageListPrefetch(onerel, MAIN_FORKNUM, anlprefetch->fetchlist2.blocklist, blocks, 0, 0);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(onerel),
anlprefetch->fetchlist2.blocklist[0],
blocks)));
} else {
CstoreAnalyzePrefetch(cstoreScan, anlprefetch->fetchlist2.blocklist, blocks, analyzeAttrNum, attrSeq);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(cstoreScan->m_CStore->m_relation),
anlprefetch->fetchlist2.blocklist[0],
blocks)));
}
}
anlprefetch->fetchlist2.anl_idx = 0;
anlprefetch->fetchlist2.load_count = blocks;
anlprefetch->blocklist = &anlprefetch->fetchlist1;
anlprefetch->init = true;
}
if (anlprefetch->blocklist == NULL) {
ereport(ERROR, (errmodule(MOD_ADIO), errmsg("analyze prefetch block list can not be NULL")));
}
if (anlprefetch->blocklist->load_count == 0) {
return InvalidBlockNumber;
}
blo = anlprefetch->blocklist->blocklist[anlprefetch->blocklist->anl_idx];
anlprefetch->blocklist->anl_idx++;
if (anlprefetch->blocklist->anl_idx >= anlprefetch->blocklist->load_count) {
blocks = BlockSampler_NextList(bs, anlprefetch->blocklist);
if (blocks != 0) {
if (isColumnStore == false) {
PageListPrefetch(onerel, MAIN_FORKNUM, anlprefetch->blocklist->blocklist, blocks, 0, 0);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(onerel),
anlprefetch->blocklist->blocklist[0],
blocks)));
} else {
CstoreAnalyzePrefetch(cstoreScan, anlprefetch->blocklist->blocklist, blocks, analyzeAttrNum, attrSeq);
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s, start(%u), count(%u)",
RelationGetRelationName(cstoreScan->m_CStore->m_relation),
anlprefetch->blocklist->blocklist[0],
blocks)));
}
}
anlprefetch->blocklist->anl_idx = 0;
anlprefetch->blocklist->load_count = blocks;
anlprefetch->blocklist =
(anlprefetch->blocklist == &anlprefetch->fetchlist1) ? &anlprefetch->fetchlist2 : &anlprefetch->fetchlist1;
}
return blo;
}
* @Description: get one block for analyze
* @Param[IN/OUT] anlprefetch: anlprefetch context if used adio
* @Param[IN/OUT] bs: BlockSampler
* @Param[IN/OUT] estimate_table_rownum: estimate table row num
* @Param[IN/OUT] onerel: relation
* @Return: block id, InvalidBlockNumber means no block for analyze
* @See also:
*/
template <bool isColumnStore>
BlockNumber BlockSampler_GetBlock(void* rel_or_cuDesc, BlockSampler bs, AnlPrefetch* anlprefetch, int analyzeAttrNum,
int32* attrSeq, bool estimate_table_rownum)
{
if (estimate_table_rownum) {
if (BlockSampler_HasMore(bs)) {
return BlockSampler_Next(bs);
}
}
ADIO_RUN()
{
return BlockSampler_NextBlock<isColumnStore>(rel_or_cuDesc, bs, anlprefetch, analyzeAttrNum, attrSeq);
}
ADIO_ELSE()
{
if (BlockSampler_HasMore(bs)) {
return BlockSampler_Next(bs);
}
}
ADIO_END();
return InvalidBlockNumber;
}
* acquire_sample_rows -- acquire a random sample of rows from the table
*
* Selected rows are returned in the caller-allocated array rows[], which
* must have at least targrows entries.
* The actual number of rows selected is returned as the function result.
* We also estimate the total numbers of live and dead rows in the table,
* and return them into *totalrows and *totaldeadrows, respectively.
*
* The returned list of tuples is in order by physical position in the table.
* (We will rely on this later to derive correlation estimates.)
*
* As of May 2004 we use a new two-stage method: Stage one selects up
* to targrows random blocks (or all blocks, if there aren't so many).
* Stage two scans these blocks and uses the Vitter algorithm to create
* a random sample of targrows rows (or less, if there are less in the
* sample of blocks). The two stages are executed simultaneously: each
* block is processed as soon as stage one returns its number and while
* the rows are read stage two controls which ones are to be inserted
* into the sample.
*
* Although every row has an equal chance of ending up in the final
* sample, this sampling method is not perfect: not every possible
* sample has an equal chance of being selected. For large relations
* the number of different blocks represented by the sample tends to be
* too small. We can live with that for now. Improvements are welcome.
*
* An important property of this sampling method is that because we do
* look at a statistically unbiased set of blocks, we should get
* unbiased estimates of the average numbers of live and dead rows per
* block. The previous sampling method put too much credence in the row
* density near the start of the table.
*/
#define MAX_ESTIMATE_SAMPLE_PAGES 5
#define MAX_ESTIMATE_PART_SAMPLE_PAGES 3
#define MAX_ESTIMATE_RETRY_TIMES 3
template <bool estimate_table_rownum>
static int64 acquire_sample_rows(
Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows, double* totaldeadrows)
{
int64 numrows = 0;
double samplerows = 0;
double liverows = 0;
double deadrows = 0;
double rowstoskip = -1;
BlockNumber totalblocks;
TransactionId OldestXmin;
BlockSamplerData bs;
double rstate;
BlockNumber targblock = 0;
BlockNumber sampleblock = 0;
BlockNumber retrycount = 1;
AnlPrefetch anlprefetch;
int64 ori_targrows = targrows;
anlprefetch.blocklist = NULL;
bool isAnalyzing = true;
AssertEreport(targrows > 0, MOD_OPT, "Target row number must be greater than 0 when sampling.");
totalblocks = RelationGetNumberOfBlocks(onerel);
* in pretty analyze mode, we estimate total rows from MAX_ESTIMATE_SAMPLE_PAGES
* pages to promise sampled data density should be as uniform as possible.
*/
if (SUPPORT_PRETTY_ANALYZE && estimate_table_rownum) {
* set targrows as sample page num, so we can use BlockSampler to sample
* pages as random as possible
*/
if (targrows > MAX_ESTIMATE_SAMPLE_PAGES)
targrows = MAX_ESTIMATE_SAMPLE_PAGES;
* we should sample all partitions , so we sample no more than
* MAX_ESTIMATE_PART_SAMPLE_PAGES for each partition
*/
if (RelationIsPartition(onerel))
targrows = MAX_ESTIMATE_PART_SAMPLE_PAGES;
}
OldestXmin = GetOldestXmin(onerel);
retry:
BlockSampler_Init(&bs, totalblocks, targrows);
rstate = anl_init_selection_state(targrows);
ADIO_RUN()
{
if (1 == retrycount) {
uint32 quantity =
Min(u_sess->attr.attr_storage.prefetch_quantity, (g_instance.attr.attr_storage.NBuffers / 4));
anlprefetch.fetchlist1.size =
(uint32)((quantity > (totalblocks / 2 + 1)) ? (totalblocks / 2 + 1) : quantity);
anlprefetch.fetchlist1.blocklist = (BlockNumber*)palloc(sizeof(BlockNumber) * anlprefetch.fetchlist1.size);
anlprefetch.fetchlist1.anl_idx = 0;
anlprefetch.fetchlist1.load_count = 0;
anlprefetch.fetchlist2.size = anlprefetch.fetchlist1.size;
anlprefetch.fetchlist2.blocklist = (BlockNumber*)palloc(sizeof(BlockNumber) * anlprefetch.fetchlist2.size);
anlprefetch.fetchlist2.anl_idx = 0;
anlprefetch.fetchlist2.load_count = 0;
anlprefetch.init = false;
}
ereport(DEBUG2,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s prefetch quantity(%u)",
RelationGetRelationName(onerel),
anlprefetch.fetchlist1.size)));
}
ADIO_END();
while (InvalidBlockNumber !=
(targblock = BlockSampler_GetBlock<false>(onerel, &bs, &anlprefetch, 0, NULL, estimate_table_rownum))) {
Buffer targbuffer;
Page targpage;
OffsetNumber targoffset, maxoffset;
vacuum_delay_point();
sampleblock++;
* We must maintain a pin on the target page's buffer to ensure that
* the maxoffset value stays good (else concurrent VACUUM might delete
* tuples out from under us). Hence, pin the page until we are done
* looking at it. We also choose to hold sharelock on the buffer
* throughout --- we could release and re-acquire sharelock for each
* tuple, but since we aren't doing much work per tuple, the extra
* lock traffic is probably better avoided.
*/
targbuffer = ReadBufferExtended(onerel, MAIN_FORKNUM, targblock, RBM_NORMAL, u_sess->analyze_cxt.vac_strategy);
LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
targpage = BufferGetPage(targbuffer);
if (RelationIsUstoreFormat(onerel)) {
* The attcacheoff of ustore tuples in tupleDesc of the relation can now be
* reset to avoid their usage while reading heap tuples from rows array.
*/
for (int i = 0; i < onerel->rd_att->natts; i++) {
if (onerel->rd_att->attrs[i].attcacheoff >= 0) {
onerel->rd_att->attrs[i].attcacheoff = -1;
}
}
TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel), false, onerel->rd_tam_ops);
maxoffset = UHeapPageGetMaxOffsetNumber(targpage);
for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++) {
RowPtr *lp = UPageGetRowPtr(targpage, targoffset);
bool sampleIt = false;
TransactionId xid;
UHeapTuple targTuple;
* For UHeap, we need to count delete committed rows towards dead rows
* which would have been same, if the tuple was present in heap.
*/
if (RowPtrIsDeleted(lp)) {
deadrows += 1;
continue;
}
* We ignore unused and redirect line pointers. DEAD line pointers
* should be counted as dead, because we need vacuum to run to get rid
* of them. Note that this rule agrees with the way that
* heap_page_prune() counts things.
*/
if (!RowPtrIsNormal(lp)) {
if (RowPtrIsDeleted(lp)) {
deadrows += 1;
}
continue;
}
if (!RowPtrHasStorage(lp)) {
continue;
}
targTuple = UHeapGetTuple(onerel, targbuffer, targoffset);
switch (UHeapTupleSatisfiesOldestXmin(targTuple, OldestXmin,
targbuffer, true, &targTuple, &xid, NULL, onerel)) {
case UHEAPTUPLE_LIVE:
sampleIt = true;
liverows += 1;
break;
case UHEAPTUPLE_DEAD:
case UHEAPTUPLE_RECENTLY_DEAD:
deadrows += 1;
break;
case UHEAPTUPLE_INSERT_IN_PROGRESS:
* Insert-in-progress rows are not counted. We assume that
* when the inserting transaction commits or aborts, it will
* send a stats message to increment the proper count. This
* works right only if that transaction ends after we finish
* analyzing the table; if things happen in the other order,
* its stats update will be overwritten by ours. However, the
* error will be large only if the other transaction runs long
* enough to insert many tuples, so assuming it will finish
* after us is the safer option.
*
* A special case is that the inserting transaction might be
* our own. In this case we should count and sample the row,
* to accommodate users who load a table and analyze it in one
* transaction. (pgstat_report_analyze has to adjust the
* numbers we send to the stats collector to make this come
* out right.)
*/
if (TransactionIdIsCurrentTransactionId(xid)) {
sampleIt = true;
liverows += 1;
}
break;
case UHEAPTUPLE_DELETE_IN_PROGRESS:
* We count delete-in-progress rows as still live, using the
* same reasoning given above; but we don't bother to include
* them in the sample.
*
* If the delete was done by our own transaction, however, we
* must count the row as dead to make pgstat_report_analyze's
* stats adjustments come out right. (Note: this works out
* properly when the row was both inserted and deleted in our
* xact.)
*/
if (TransactionIdIsCurrentTransactionId(xid)) {
deadrows += 1;
} else {
liverows += 1;
}
break;
default:
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
break;
}
if (sampleIt) {
ExecStoreTuple(targTuple, slot, InvalidBuffer, false);
* The first targrows sample rows are simply copied into the
* reservoir. Then we start replacing tuples in the sample until
* we reach the end of the relation. This algorithm is from Jeff
* Vitter's paper (see full citation below). It works by
* repeatedly computing the number of tuples to skip before
* selecting a tuple, which replaces a randomly chosen element of
* the reservoir (current set of tuples). At all times the
* reservoir is a true random sample of the tuples we've passed
* over so far, so when we fall off the end of the relation we're
* done.
*/
if (numrows < targrows) {
rows[numrows++] = ExecCopySlotTuple(slot);
} else {
if (rowstoskip < 0) {
rowstoskip = anl_get_next_S(samplerows, targrows, &rstate);
}
if (rowstoskip <= 0) {
* Found a suitable tuple, so save it, replacing one
* old tuple at random
*/
int64 k = (int64)(targrows * anl_random_fract());
AssertEreport(k >= 0 && k < targrows, MOD_OPT,
"Index number out of range when replacing tuples.");
if (!estimate_table_rownum) {
heap_freetuple(rows[k]);
rows[k] = ExecCopySlotTuple(slot);
}
}
rowstoskip -= 1;
}
samplerows += 1;
}
if (targTuple) {
UHeapFreeTuple(targTuple);
}
}
ExecDropSingleTupleTableSlot(slot);
* The attcacheoff of ustore tuples in tupleDesc of the relation can now be
* reset to avoid their usage while reading heap tuples from rows array.
*/
for (int i = 0; i < onerel->rd_att->natts; i++) {
if (onerel->rd_att->attrs[i].attcacheoff >= 0) {
onerel->rd_att->attrs[i].attcacheoff = -1;
}
}
goto uheap_end;
}
maxoffset = PageGetMaxOffsetNumber(targpage);
for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++) {
ItemId itemid;
HeapTupleData targtuple;
bool sample_it = false;
if (ENABLE_WORKLOAD_CONTROL)
IOSchedulerAndUpdate(IO_TYPE_READ, 10, IO_TYPE_ROW);
targtuple.t_tableOid = InvalidOid;
targtuple.t_bucketId = InvalidBktId;
HeapTupleCopyBaseFromPage(&targtuple, targpage);
#ifdef PGXC
targtuple.t_xc_node_id = InvalidOid;
#endif
itemid = PageGetItemId(targpage, targoffset);
* We ignore unused and redirect line pointers. DEAD line
* pointers should be counted as dead, because we need vacuum to
* run to get rid of them. Note that this rule agrees with the
* way that heap_page_prune() counts things.
*/
if (!ItemIdIsNormal(itemid)) {
if (ItemIdIsDead(itemid))
deadrows += 1;
continue;
}
ItemPointerSet(&targtuple.t_self, targblock, targoffset);
targtuple.t_tableOid = RelationGetRelid(onerel);
targtuple.t_bucketId = RelationGetBktid(onerel);
targtuple.t_data = (HeapTupleHeader)PageGetItem(targpage, itemid);
targtuple.t_len = ItemIdGetLength(itemid);
switch (HeapTupleSatisfiesVacuum(&targtuple, OldestXmin, targbuffer, isAnalyzing)) {
case HEAPTUPLE_LIVE:
sample_it = true;
liverows += 1;
break;
case HEAPTUPLE_DEAD:
case HEAPTUPLE_RECENTLY_DEAD:
deadrows += 1;
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
* Insert-in-progress rows are not counted. We assume
* that when the inserting transaction commits or aborts,
* it will send a stats message to increment the proper
* count. This works right only if that transaction ends
* after we finish analyzing the table; if things happen
* in the other order, its stats update will be
* overwritten by ours. However, the error will be large
* only if the other transaction runs long enough to
* insert many tuples, so assuming it will finish after us
* is the safer option.
*
* A special case is that the inserting transaction might
* be our own. In this case we should count and sample
* the row, to accommodate users who load a table and
* analyze it in one transaction. (pgstat_report_analyze
* has to adjust the numbers we send to the stats
* collector to make this come out right.)
*/
if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targpage, targtuple.t_data))) {
sample_it = true;
liverows += 1;
}
break;
case HEAPTUPLE_DELETE_IN_PROGRESS:
* We count delete-in-progress rows as still live, using
* the same reasoning given above; but we don't bother to
* include them in the sample.
*
* If the delete was done by our own transaction, however,
* we must count the row as dead to make
* pgstat_report_analyze's stats adjustments come out
* right. (Note: this works out properly when the row was
* both inserted and deleted in our xact.)
*
.* The net effect of these choices is that we act as
.* though an IN_PROGRESS transaction hasn't happened yet,
.* except if it is our own transaction, which we assume
.* has happened.
.*
.* This approach ensures that we behave sanely if we see
.* both the pre-image and post-image rows for a row being
.* updated by a concurrent transaction: we will sample the
.* pre-image but not the post-image. We also get sane
.* results if the concurrent transaction never commits.
*/
if (TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXid(&targtuple)))
deadrows += 1;
else {
sample_it = true;
liverows += 1;
}
break;
default:
ereport(
ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("unexpected HeapTupleSatisfiesVacuum result")));
break;
}
if (sample_it) {
* The first targrows sample rows are simply copied into the
* reservoir. Then we start replacing tuples in the sample
* until we reach the end of the relation. This algorithm is
* from Jeff Vitter's paper (see full citation below). It
* works by repeatedly computing the number of tuples to skip
* before selecting a tuple, which replaces a randomly chosen
* element of the reservoir (current set of tuples). At all
* times the reservoir is a true random sample of the tuples
* we've passed over so far, so when we fall off the end of
* the relation we're done.
*/
if (numrows < targrows) {
if (!estimate_table_rownum)
rows[numrows++] = heapCopyTuple(&targtuple, onerel->rd_att, targpage);
else
numrows++;
} else {
* t in Vitter's paper is the number of records already
* processed. If we need to compute a new S value, we
* must use the not-yet-incremented value of samplerows as
* t.
*/
if (rowstoskip < 0)
rowstoskip = anl_get_next_S(samplerows, targrows, &rstate);
if (rowstoskip <= 0) {
* Found a suitable tuple, so save it, replacing one
* old tuple at random
*/
int64 k = (int64)(targrows * anl_random_fract());
AssertEreport(
k >= 0 && k < targrows, MOD_OPT, "Index number out of range when replacing tuples.");
if (!estimate_table_rownum) {
tableam_tops_free_tuple(rows[k]);
rows[k] = heapCopyTuple(&targtuple, onerel->rd_att, targpage);
}
}
rowstoskip -= 1;
}
samplerows += 1;
}
}
uheap_end:
UnlockReleaseBuffer(targbuffer);
if (estimate_table_rownum && !SUPPORT_PRETTY_ANALYZE) {
if (liverows > 0)
break;
totalblocks--;
ereport(LOG,
(errmsg("ANALYZE INFO : estimate total rows of \"%s\" - no lived rows in blockno: %u",
RelationGetRelationName(onerel),
targblock)));
}
}
if (estimate_table_rownum) {
if (liverows > 0) {
AssertEreport(0 < sampleblock && 0 < totalblocks, MOD_OPT, "This should not happend when liverows>0");
*totalrows = totalblocks * (liverows / sampleblock);
} else if (sampleblock >= totalblocks || retrycount > MAX_ESTIMATE_RETRY_TIMES) {
* if all tuples is dead or we have tried too many times
* (MAX_ESTIMATE_RETRY_TIMES + 1), just return 0
*/
*totalrows = 0.0;
} else if (SUPPORT_PRETTY_ANALYZE) {
* if we have tried MAX_ESTIMATE_RETRY_TIMES, just sample
* tuples as normal analyze sample
*/
if (MAX_ESTIMATE_RETRY_TIMES == retrycount)
targrows = ori_targrows;
* retry to fetch lived rows in pretty analyze mode unless
* 1. fetch lived tuple (liverows > 0)
* 2. retry enough times (MAX_ESTIMATE_RETRY_TIMES)
*/
retrycount++;
goto retry;
}
ADIO_RUN()
{
pfree_ext(anlprefetch.fetchlist1.blocklist);
pfree_ext(anlprefetch.fetchlist2.blocklist);
}
ADIO_END();
ereport(elevel,
(errmsg("ANALYZE INFO : estimate total rows of \"%s\": scanned %u pages of "
"total %u pages with %u retry times, containing %.0f "
"live rows and %.0f dead rows, estimated %.0f total rows",
RelationGetRelationName(onerel),
sampleblock,
totalblocks,
retrycount,
liverows,
deadrows,
*totalrows)));
*totaldeadrows = 0.0;
* we just count lived rows and dead rows, and have never sample tuples to fill
* rows[], so return 0 here to avoid undesirable memory free
*/
return 0;
}
* If we didn't find as many tuples as we wanted then we're done. No sort
* is needed, since they're already in order.
*
* Otherwise we need to sort the collected tuples by position
* (itempointer). It's not worth worrying about corner cases where the
* tuples are already sorted.
*/
if (numrows == targrows) {
qsort((void*)rows, numrows, sizeof(HeapTuple), compare_rows);
}
* Estimate total numbers of live and dead rows in relation, extrapolating
* on the assumption that the average tuple density in pages we didn't
* scan is the same as in the pages we did scan. Since what we scanned is
* a random sample of the pages in the relation, this should be a good
* assumption.
*/
if (bs.m > 0) {
*totalrows = floor((liverows / bs.m) * totalblocks + 0.5);
*totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
} else {
*totalrows = 0.0;
*totaldeadrows = 0.0;
}
ADIO_RUN()
{
pfree_ext(anlprefetch.fetchlist1.blocklist);
pfree_ext(anlprefetch.fetchlist2.blocklist);
}
ADIO_END();
* Emit some interesting relation info, only in datanode
*/
if (IS_PGXC_DATANODE) {
ereport(elevel,
(errmsg("ANALYZE INFO : \"%s\": scanned %d of %u pages, "
"containing %.0f live rows and %.0f dead rows; "
"%ld rows in sample, %.0f estimated total rows",
RelationGetRelationName(onerel),
bs.m,
totalblocks,
liverows,
deadrows,
numrows,
*totalrows)));
}
return numrows;
}
template <int attlen, bool hasNull>
Datum GetValue(CU* cuPtr, int rowIdx)
{
return (Datum)cuPtr->GetValue<attlen, hasNull>(rowIdx);
}
void InitGetValFunc(int attlen, GetValFunc* getValFuncPtr, int col)
{
switch (attlen) {
case sizeof(uint8): {
getValFuncPtr[col][0] = &GetValue<1, false>;
getValFuncPtr[col][1] = &GetValue<1, true>;
break;
}
case sizeof(uint16): {
getValFuncPtr[col][0] = &GetValue<2, false>;
getValFuncPtr[col][1] = &GetValue<2, true>;
break;
}
case sizeof(uint32): {
getValFuncPtr[col][0] = &GetValue<4, false>;
getValFuncPtr[col][1] = &GetValue<4, true>;
break;
}
case sizeof(uint64): {
getValFuncPtr[col][0] = &GetValue<8, false>;
getValFuncPtr[col][1] = &GetValue<8, true>;
break;
}
case 12: {
getValFuncPtr[col][0] = &GetValue<12, false>;
getValFuncPtr[col][1] = &GetValue<12, true>;
break;
}
case 16: {
getValFuncPtr[col][0] = &GetValue<16, false>;
getValFuncPtr[col][1] = &GetValue<16, true>;
break;
}
case -1: {
getValFuncPtr[col][0] = &GetValue<-1, false>;
getValFuncPtr[col][1] = &GetValue<-1, true>;
break;
}
case -2: {
getValFuncPtr[col][0] = &GetValue<-2, false>;
getValFuncPtr[col][1] = &GetValue<-2, true>;
break;
}
default:
ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("unsupported datatype")));
}
}
* @Description: analyze prefetch cu
* @Param[IN] analyzeAttrNum: analyze column num
* @Param[IN] attrSeq: cloumn order list
* @Param[IN] cstoreScan: cstore scan desc
* @Param[IN] cuidList: cu id list for prefetch
* @Param[IN] cuidListCount: cu id list count
* @See also:
*/
void CstoreAnalyzePrefetch(
CStoreScanDesc cstoreScan, BlockNumber* cuidList, int32 cuidListCount, int analyzeAttrNum, int32* attrSeq)
{
AioDispatchCUDesc_t** dList = NULL;
t_thrd.cstore_cxt.InProgressAioCUDispatch =
(AioDispatchCUDesc_t**)palloc(sizeof(AioDispatchCUDesc_t*) * MAX_CU_PREFETCH_REQSIZ);
dList = t_thrd.cstore_cxt.InProgressAioCUDispatch;
t_thrd.cstore_cxt.InProgressAioCUDispatchCount = 0;
File* vfdList = (File*)palloc(sizeof(File) * MAX_CU_PREFETCH_REQSIZ);
errno_t rc =
memset_s((char*)vfdList, sizeof(File) * MAX_CU_PREFETCH_REQSIZ, 0xFF, sizeof(File) * MAX_CU_PREFETCH_REQSIZ);
securec_check(rc, "\0", "\0");
for (int col = 0; col < analyzeAttrNum; col++) {
for (int i = 0; i < cuidListCount; i++) {
CUDesc cuDesc;
int colNum = attrSeq[col];
uint32 cuid = cuidList[i] + FirstCUID + 1;
if (cstoreScan->m_CStore->GetCUDesc(colNum, cuid, &cuDesc, NULL) != true) {
continue;
}
cstoreScan->m_CStore->GetCUDeleteMaskIfNeed(cuid, NULL);
if (cstoreScan->m_CStore->IsTheWholeCuDeleted(cuDesc.row_count)) {
continue;
}
cstoreScan->m_CStore->CUPrefetch(
&cuDesc, colNum, dList, t_thrd.cstore_cxt.InProgressAioCUDispatchCount, vfdList);
}
}
if (t_thrd.cstore_cxt.InProgressAioCUDispatchCount > 0) {
#ifndef ENABLE_LITE_MODE
int tmp_count = t_thrd.cstore_cxt.InProgressAioCUDispatchCount;
HOLD_INTERRUPTS();
FileAsyncCURead(dList, t_thrd.cstore_cxt.InProgressAioCUDispatchCount);
t_thrd.cstore_cxt.InProgressAioCUDispatchCount = 0;
RESUME_INTERRUPTS();
FileAsyncCUClose(vfdList, tmp_count);
#endif
}
pfree_ext(dList);
pfree_ext(vfdList);
t_thrd.cstore_cxt.InProgressAioCUDispatch = NULL;
t_thrd.cstore_cxt.InProgressAioCUDispatchCount = 0;
t_thrd.storage_cxt.InProgressAioType = AioUnkown;
}
#define CSTORE_ANALYZE_PREFETCH_QUANTITY 32
typedef struct sample_tuple_cell_struct {
Datum* values;
bool* nulls;
} sample_tuple_cell;
#define maxAnalyzeUseCstoreBufferSize (g_instance.attr.attr_storage.cstore_buffers * 1024LL / 8)
* AcquireSampleCStoreRows -- acquire a random sample of rows from the table
*
* Selected rows are returned in the caller-allocated array rows[], which
* must have at least targrows entries.
* The actual number of rows selected is returned as the function result.
* We also estimate the total number of live and dead rows in the table,
* and return them into *totalrows and *totaldeadrows, respectively.
*
* To improve the analyze efficiency in the condition of severl attrs are selected,
* we add two parameters which record the information of seleced attrs.The perform
* can only used for column storage table.
*
* The returned list of tuples is in order by physical position in the table.
* (We will rely on this later to derive correlation estimates.)
*/
template <bool estimate_table_rownum>
static int64 AcquireSampleCStoreRows(Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows,
double* totaldeadrows, VacAttrStats** vacattrstats, int analyzeAttrNum)
{
int64 numrows = 0;
int64 delta_numrows = 0;
int32 samplerows = 0;
int64 liverows = 0;
int64 deadrows = 0;
BlockNumber totalblocks = 0;
BlockNumber estBlkNum = 0;
BlockSamplerData bs;
double rstate;
AnlPrefetch anlprefetch;
int cusize_threshold = maxAnalyzeUseCstoreBufferSize;
anlprefetch.blocklist = NULL;
AssertEreport(targrows > 0, MOD_OPT, "target rows must be greater than 0 when sampling");
* all system table is row-orientation, column-orientation table must
* be a user-defined table, so we just return without sample
*/
if (IS_PGXC_COORDINATOR) {
*totalrows = 0;
*totaldeadrows = 0;
return 0;
}
Bitmapset* bms_attnums = es_get_attnums_to_analyze(vacattrstats, analyzeAttrNum);
int num_attnums = bms_num_members(bms_attnums);
int colTotalNum = onerel->rd_att->natts;
int32* attrSeq = NULL;
int16* colIdx = (int16*)palloc0(sizeof(int16) * num_attnums);
int* slotIdList = NULL;
CStoreScanDesc cstoreScanDesc = NULL;
CU** cuPtr = NULL;
Relation deltarel = NULL;
double totalRowCnts = 0;
double deltarows = 0;
double curows = 0;
double cudeadrows = 0;
Assert(OidIsValid(onerel->rd_rel->reldeltarelid));
deltarel = relation_open((onerel->rd_rel->reldeltarelid), AccessShareLock);
delta_numrows = acquire_sample_rows<true>(deltarel, elevel, rows, targrows, &deltarows, totaldeadrows);
for (int i = 0; i < num_attnums; ++i) {
colIdx[i] = bms_first_member(bms_attnums);
}
Snapshot tmpSnapshot = GetActiveSnapshot();
Assert(tmpSnapshot != NULL);
cstoreScanDesc = CStoreBeginScan(onerel, num_attnums, colIdx, tmpSnapshot, false);
totalblocks = CStoreRelGetCUNumByNow(cstoreScanDesc);
curows = (double)cstoreScanDesc->m_CStore->GetLivedRowNumbers(&deadrows);
if (curows <= 0.0) {
relation_close(deltarel, AccessShareLock);
CStoreEndScan(cstoreScanDesc);
pfree_ext(colIdx);
pfree_ext(bms_attnums);
return 0;
}
*totalrows = totalRowCnts = curows + deltarows;
cudeadrows = (double)deadrows;
*totaldeadrows += cudeadrows;
deadrows = 0;
if (estimate_table_rownum && SUPPORT_PRETTY_ANALYZE) {
for (int i = 0; i < delta_numrows; i++) {
if (rows[i]) {
pfree_ext(rows[i]);
rows[i] = NULL;
}
}
relation_close(deltarel, AccessShareLock);
CStoreEndScan(cstoreScanDesc);
pfree_ext(colIdx);
pfree_ext(bms_attnums);
return 0;
}
attrSeq = (int*)palloc0(sizeof(int32) * num_attnums);
slotIdList = (int*)palloc0(sizeof(int) * num_attnums);
cuPtr = (CU**)palloc0(sizeof(CU*) * colTotalNum);
if (!estimate_table_rownum) {
for (int i = 0; i < delta_numrows; i++) {
if (rows[i]) {
pfree_ext(rows[i]);
rows[i] = NULL;
}
}
int64 deltatargrows = 0;
if (deltarows > 0) {
deltatargrows = (int)rint(targrows * deltarows / totalRowCnts);
deltatargrows = Min(deltatargrows, targrows);
if (deltatargrows > 0) {
double trows = 0;
double tdrows = 0;
delta_numrows = acquire_sample_rows<false>(deltarel, elevel, rows, deltatargrows, &trows, &tdrows);
*totalrows = trows + curows;
*totaldeadrows = tdrows + cudeadrows;
}
}
int64 temp_target = (int)rint(targrows * curows / totalRowCnts);
temp_target = Min(temp_target, targrows - deltatargrows);
targrows = temp_target;
* calculate cu number by following formula
* 1. get lived row number
* 2. estimate row size by suppose the width of each column is 4
* 3. estimate how many pages needed if it's row-stored
* 4. evaluate how many CUs needed to sample
*/
int totalwidth = 0;
int relpages = 0;
BlockNumber sampleCUs = 0;
totalwidth = 4 * onerel->rd_att->natts;
relpages = ceil(*totalrows * totalwidth / BLCKSZ);
if (relpages == 0) {
relpages = 1;
}
sampleCUs = ceil((double)targrows / relpages * totalblocks);
sampleCUs = (sampleCUs > totalblocks) ? totalblocks : sampleCUs;
elog(DEBUG2, "ANALYZE INFO : sample %u cu from totoal %u cu", sampleCUs, totalblocks);
BlockSampler_Init(&bs, totalblocks, sampleCUs);
} else {
AssertEreport(!SUPPORT_PRETTY_ANALYZE, MOD_OPT, "");
BlockSampler_Init(&bs, totalblocks, targrows);
elog(DEBUG2, "ANALYZE INFO : sample %ld rows from totoal %u cu", targrows, totalblocks);
}
rstate = anl_init_selection_state(targrows);
CStore* cstore = cstoreScanDesc->m_CStore;
FormData_pg_attribute* attrs = cstore->m_relation->rd_att->attrs;
GetValFunc* getValFuncPtr = (GetValFunc*)palloc(sizeof(GetValFunc) * colTotalNum);
rows += delta_numrows;
for (int col = 0; col < num_attnums; ++col) {
int colSeq = attrSeq[col] = colIdx[col] - 1;
InitGetValFunc(attrs[colSeq].attlen, getValFuncPtr, colSeq);
}
ADIO_RUN()
{
uint32 quantity = (uint32)CSTORE_ANALYZE_PREFETCH_QUANTITY;
anlprefetch.fetchlist1.size = (uint32)((quantity > (totalblocks / 2 + 1)) ? (totalblocks / 2 + 1) : quantity);
anlprefetch.fetchlist1.blocklist = (BlockNumber*)palloc(sizeof(BlockNumber) * anlprefetch.fetchlist1.size);
anlprefetch.fetchlist1.anl_idx = 0;
anlprefetch.fetchlist1.load_count = 0;
anlprefetch.fetchlist2.size = anlprefetch.fetchlist1.size;
anlprefetch.fetchlist2.blocklist = (BlockNumber*)palloc(sizeof(BlockNumber) * anlprefetch.fetchlist2.size);
anlprefetch.fetchlist2.anl_idx = 0;
anlprefetch.fetchlist2.load_count = 0;
anlprefetch.init = false;
ereport(DEBUG1,
(errmodule(MOD_ADIO),
errmsg("analyze prefetch for %s prefetch quantity(%u)",
RelationGetRelationName(onerel),
anlprefetch.fetchlist1.size)));
}
ADIO_END();
MemoryContext sample_context;
MemoryContext old_context;
BlockNumber targblock;
int fstColIdx = CStoreGetfstColIdx(onerel);
Datum* constValues = (Datum*)palloc(sizeof(Datum) * colTotalNum);
bool* nullValues = (bool*)palloc(sizeof(bool) * colTotalNum);
Datum* values = (Datum*)palloc0(sizeof(Datum) * colTotalNum);
bool* nulls = (bool*)palloc0(sizeof(bool) * colTotalNum);
int* funcIdx = (int*)palloc0(sizeof(int) * colTotalNum);
List* sampleTupleInfo = NIL;
sample_tuple_cell* st_cell = NULL;
ListCell* cell1 = NULL;
ListCell* cell2 = NULL;
ListCell* cell3 = NULL;
sample_context = AllocSetContextCreate(CurrentMemoryContext,
"sample cstore rows for analyze",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
while (
InvalidBlockNumber != (targblock = BlockSampler_GetBlock<true>(
cstoreScanDesc, &bs, &anlprefetch, num_attnums, attrSeq, estimate_table_rownum))) {
List* valueLocation = NIL;
List* targoffsetList = NIL;
int location = 0;
CUDesc cuDesc;
uint16 targoffset = 0;
uint16 maxoffset;
int64 total_cusize = 0;
int start_col = -1;
int end_col = -1;
bool first_batch = true;
bool all_in_buffer = true;
targblock = targblock + FirstCUID + 1;
if (cstoreScanDesc->m_CStore->GetCUDesc(fstColIdx, targblock, &cuDesc, tmpSnapshot) != true)
continue;
if (ENABLE_WORKLOAD_CONTROL)
IOSchedulerAndUpdate(IO_TYPE_READ, 1, IO_TYPE_COLUMN);
cstore->GetCUDeleteMaskIfNeed(targblock, tmpSnapshot);
if (cstore->IsTheWholeCuDeleted(cuDesc.row_count))
continue;
maxoffset = cuDesc.row_count;
if (estimate_table_rownum) {
liverows = maxoffset;
estBlkNum++;
if (liverows > 0)
break;
ereport(DEBUG1,
(errmsg("ANALYZE INFO : estimate total rows of \"%s\" - no lived rows in cuid: %u",
RelationGetRelationName(onerel),
targblock)));
totalblocks--;
continue;
}
errno_t rc;
rc = memset_s(nulls, sizeof(bool) * colTotalNum, true, sizeof(bool) * colTotalNum);
securec_check(rc, "\0", "\0");
rc = memset_s(nullValues, sizeof(bool) * colTotalNum, false, sizeof(bool) * colTotalNum);
securec_check(rc, "\0", "\0");
rc = memset_s(constValues, sizeof(Datum) * colTotalNum, 0, sizeof(Datum) * colTotalNum);
securec_check(rc, "\0", "\0");
#define copy_sample_dataum(dest, col_num, offset) \
do { \
old_context = MemoryContextSwitchTo(sample_context); \
Datum dm = getValFuncPtr[colNum][funcIdx[colNum]](cuPtr[colNum], (offset)); \
int16 valueTyplen = attrs[(col_num)].attlen; \
bool valueTypbyval = attrs[(col_num)].attlen == 0 ? false : true; \
if (valueTyplen < 0) \
(dest) = PointerGetDatum(PG_DETOAST_DATUM_COPY(dm)); \
else \
(dest) = datumCopy(dm, valueTypbyval, valueTyplen); \
(void)MemoryContextSwitchTo(old_context); \
} while (0)
* check if special row is selected for analyze
* 1. skip dead rows (deleted rows)
* 2. fetch all rows if fetched-rows < targrows
* 3. random to replace a tuple in random location
*/
#define check_match_tuple(copyTuple) \
do { \
(copyTuple) = false; \
if (cstore->IsDeadRow((uint32)targblock, (uint32)targoffset)) { \
deadrows++; \
} else if (numrows < targrows) { \
(copyTuple) = true; \
location = numrows; \
numrows++; \
samplerows++; \
} else { \
if (0 >= anl_get_next_S(samplerows, targrows, &rstate)) { \
location = (int64)(targrows * anl_random_fract()); \
(copyTuple) = true; \
AssertEreport(location >= 0 && location < targrows, MOD_OPT, ""); \
} \
samplerows++; \
} \
} while (0)
#define load_cu_data(s_col, e_col, values, nulls, copy) \
do { \
for (int col1 = (s_col); col1 <= (e_col); ++col1) { \
int colNum = attrSeq[col1]; \
if (cuPtr[colNum]) { \
if (cuPtr[colNum]->IsNull(targoffset)) { \
(nulls)[colNum] = true; \
(values)[colNum] = 0; \
} else { \
(nulls)[colNum] = false; \
if (copy) \
copy_sample_dataum((values)[colNum], colNum, targoffset); \
else \
(values)[colNum] = getValFuncPtr[colNum][funcIdx[colNum]](cuPtr[colNum], targoffset); \
} \
} else { \
(nulls)[colNum] = nullValues[colNum]; \
(values)[colNum] = constValues[colNum]; \
} \
} \
} while (0)
* try to load all cu to cstore_buffers
* 1. if total cu size < cstore_buffers/8, load all cu into cu buffer
* 2. if total cu size >= cstore_buffers/8, copy sample data to temp
* memory to avoid pining too many cu
*/
for (int col = 0; col < num_attnums; ++col) {
int colNum = attrSeq[col];
cuPtr[colNum] = NULL;
slotIdList[col] = CACHE_BLOCK_INVALID_IDX;
if (-1 == start_col)
start_col = col;
end_col = col;
(void)cstore->GetCUDesc(colNum, targblock, &cuDesc, tmpSnapshot);
if (cuDesc.IsNullCU()) {
nullValues[colNum] = true;
constValues[colNum] = 0;
elog(DEBUG2,
"ANALYZE INFO - table \"%s\": attnum(%d), cuid(%u) is null",
RelationGetRelationName(onerel),
colNum,
targblock);
continue;
} else if (cuDesc.IsSameValCU()) {
bool shoulFree = false;
nullValues[colNum] = false;
old_context = MemoryContextSwitchTo(sample_context);
constValues[colNum] = CStore::CudescTupGetMinMaxDatum(&cuDesc, &attrs[colNum], true, &shoulFree);
(void)MemoryContextSwitchTo(old_context);
elog(DEBUG2,
"ANALYZE INFO - table \"%s\": attnum(%d), cuid(%u) is const value",
RelationGetRelationName(onerel),
colNum,
targblock);
continue;
} else {
nullValues[colNum] = false;
constValues[colNum] = 0;
cuPtr[colNum] = cstore->GetCUData(&cuDesc, colNum, attrs[colNum].attlen, slotIdList[col]);
funcIdx[colNum] = cuPtr[colNum]->HasNullValue() ? 1 : 0;
* reduce the calling number AMAP. */
vacuum_delay_point();
AssertEreport(!cuPtr[colNum]->m_cache_compressed, MOD_OPT, "");
total_cusize += cuPtr[colNum]->GetUncompressBufSize();
* copy sample cu data to temp memory if
* 1. total_cusize >= cusize_threshold
* 2. or it is the last column and some sample cu data have beed loaded to temp memory
*/
if (total_cusize < cusize_threshold && (first_batch || (num_attnums - 1) != col))
continue;
if (NIL == sampleTupleInfo) {
ereport(LOG,
(errmsg(
"ANALYZE INFO - table \"%s\": copy data from CU buffer", RelationGetRelationName(onerel)),
errdetail("enlarge cstore_buffers to avoid copying data")));
}
elog(DEBUG2,
"ANALYZE INFO - table \"%s\": copy [%d, %d] column data from CU buffer",
RelationGetRelationName(onerel),
start_col,
end_col);
}
all_in_buffer = false;
if (first_batch) {
bool copyTuple = false;
cell1 = list_head(sampleTupleInfo) ? list_head(sampleTupleInfo) : NULL;
for (targoffset = FirstOffsetNumber - 1; targoffset < maxoffset; targoffset++) {
check_match_tuple(copyTuple);
if (!copyTuple)
continue;
if (cell1 != NULL) {
errno_t rc;
st_cell = (sample_tuple_cell*)lfirst(cell1);
rc = memset_s(st_cell->nulls, sizeof(bool) * colTotalNum, true, sizeof(bool) * colTotalNum);
securec_check(rc, "\0", "\0");
rc = memset_s(st_cell->values, sizeof(Datum) * colTotalNum, 0, sizeof(Datum) * colTotalNum);
securec_check(rc, "\0", "\0");
} else {
errno_t rc;
st_cell = (sample_tuple_cell*)palloc0(sizeof(sample_tuple_cell));
sampleTupleInfo = lappend(sampleTupleInfo, st_cell);
st_cell->nulls = (bool*)palloc(sizeof(bool) * colTotalNum);
rc = memset_s(st_cell->nulls, sizeof(bool) * colTotalNum, true, sizeof(bool) * colTotalNum);
securec_check(rc, "\0", "\0");
st_cell->values = (Datum*)palloc0(sizeof(Datum) * colTotalNum);
}
valueLocation = lappend_int(valueLocation, location);
targoffsetList = lappend_int(targoffsetList, targoffset);
load_cu_data(start_col, end_col, st_cell->values, st_cell->nulls, true);
cell1 = (cell1 && lnext(cell1)) ? lnext(cell1) : NULL;
}
for (int col1 = start_col; col1 <= end_col; ++col1) {
if (IsValidCacheSlotID(slotIdList[col1]))
CUCache->UnPinDataBlock(slotIdList[col1]);
}
first_batch = false;
AssertEreport(sampleTupleInfo, MOD_OPT, "");
} else {
AssertEreport(sampleTupleInfo, MOD_OPT, "");
forboth(cell1, sampleTupleInfo, cell3, targoffsetList)
{
st_cell = (sample_tuple_cell*)lfirst(cell1);
targoffset = lfirst_int(cell3);
load_cu_data(start_col, end_col, st_cell->values, st_cell->nulls, true);
}
for (int col1 = start_col; col1 <= end_col; ++col1) {
if (IsValidCacheSlotID(slotIdList[col1]))
CUCache->UnPinDataBlock(slotIdList[col1]);
}
}
start_col = -1;
}
if (all_in_buffer) {
bool copyTuple = false;
AssertEreport(0 == start_col && end_col == num_attnums - 1, MOD_OPT, "");
for (targoffset = FirstOffsetNumber - 1; targoffset < maxoffset; targoffset++) {
check_match_tuple(copyTuple);
if (!copyTuple)
continue;
load_cu_data(0, num_attnums - 1, values, nulls, false);
tableam_tops_free_tuple(rows[location]);
rows[location] = (HeapTuple)tableam_tops_form_tuple(onerel->rd_att, values, nulls);
ItemPointerSet(&(rows[location])->t_self, targblock, targoffset + 1);
}
for (int col1 = 0; col1 < num_attnums; ++col1) {
if (IsValidCacheSlotID(slotIdList[col1]))
CUCache->UnPinDataBlock(slotIdList[col1]);
}
} else {
AssertEreport(end_col == num_attnums - 1, MOD_OPT, "");
forthree(cell1, sampleTupleInfo, cell2, valueLocation, cell3, targoffsetList)
{
st_cell = (sample_tuple_cell*)lfirst(cell1);
location = lfirst_int(cell2);
targoffset = lfirst_int(cell3);
tableam_tops_free_tuple(rows[location]);
rows[location] = (HeapTuple)tableam_tops_form_tuple(onerel->rd_att, st_cell->values, st_cell->nulls);
ItemPointerSet(&(rows[location])->t_self, targblock, targoffset + 1);
}
list_free_ext(valueLocation);
list_free_ext(targoffsetList);
}
MemoryContextReset(sample_context);
}
pfree_ext(nullValues);
pfree_ext(constValues);
pfree_ext(nulls);
pfree_ext(values);
pfree_ext(attrSeq);
pfree_ext(colIdx);
pfree_ext(funcIdx);
pfree_ext(cuPtr);
pfree_ext(slotIdList);
pfree_ext(getValFuncPtr);
list_free_ext(sampleTupleInfo);
pfree_ext(bms_attnums);
MemoryContextDelete(sample_context);
ADIO_RUN()
{
pfree_ext(anlprefetch.fetchlist1.blocklist);
pfree_ext(anlprefetch.fetchlist2.blocklist);
}
ADIO_END();
relation_close(deltarel, AccessShareLock);
CStoreEndScan(cstoreScanDesc);
if (estimate_table_rownum) {
AssertEreport(!SUPPORT_PRETTY_ANALYZE, MOD_OPT, "");
if (totalblocks == 0 || (totalblocks > 0 && liverows > 0)) {
*totalrows = liverows * totalblocks + deltarows;
} else {
*totalrows = INVALID_ESTTOTALROWS + targrows;
}
ereport(elevel,
(errmsg("ANALYZE INFO : \"%s\": scanned %u of %u cus, target cuid: %u, "
"containing %ld live rows, %ld dead rows, %.0f estimated total rows",
RelationGetRelationName(onerel),
estBlkNum,
totalblocks,
targblock,
liverows,
deadrows,
*totalrows)));
return numrows + delta_numrows;
}
* Step 6: sort rows and estimate reltuples
*
* If we didn't find as many tuples as we wanted then we're done. No sort
* is needed, since they're already in order.
*
* Otherwise we need to sort the collected tuples by position
* (itempointer). It's not worth worrying about corner cases where the
* tuples are already sorted.
*/
if (numrows == targrows)
qsort((void*)rows, numrows, sizeof(HeapTuple), compare_rows);
if (IS_PGXC_DATANODE) {
ereport(elevel,
(errmsg("ANALYZE INFO : \"%s\": scanned %d of %u cus, sample %ld rows, estimated total %.0f rows",
RelationGetRelationName(onerel),
bs.m,
totalblocks,
numrows,
*totalrows)));
}
return numrows + delta_numrows;
}
double anl_random_fract(void)
{
return ((double)random() + 1) / ((double)MAX_RANDOM_VALUE + 2);
}
* These two routines embody Algorithm Z from "Random sampling with a
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
* of the count S of records to skip before processing another record.
* It is computed primarily based on t, the number of records already read.
* The only extra state needed between calls is W, a random state variable.
*
* anl_init_selection_state computes the initial W value.
*
* Given that we've already read t records (t >= n), anl_get_next_S
* determines the number of records to skip before the next record is
* processed.
*/
double anl_init_selection_state(int n)
{
if (unlikely(n == 0)) {
ereport(ERROR,
(errcode(ERRCODE_DIVISION_BY_ZERO),
errmsg("records n should not be zero")));
}
return exp(-log(anl_random_fract()) / n);
}
double anl_get_next_S(double t, int n, double* stateptr)
{
double S;
if (t <= (22.0 * n)) {
double V, quot;
V = anl_random_fract();
S = 0;
t += 1;
quot = (t - (double)n) / t;
while (quot > V) {
S += 1;
t += 1;
quot *= (t - (double)n) / t;
}
} else {
double W = *stateptr;
double term = t - (double)n + 1;
for (;;) {
double numer, numer_lim, denom;
double U, X, lhs, rhs, y, tmp;
U = anl_random_fract();
X = t * (W - 1.0);
S = floor(X);
tmp = (t + 1) / term;
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
rhs = (((t + X) / (term + S)) * term) / t;
if (lhs <= rhs) {
W = rhs / lhs;
break;
}
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
if ((double)n < S) {
denom = t;
numer_lim = term + S;
} else {
denom = t - (double)n + S;
numer_lim = t + 1;
}
for (numer = t + S; numer >= numer_lim; numer -= 1) {
y *= numer / denom;
denom -= 1;
}
W = exp(-log(anl_random_fract()) / n);
if (exp(log(y) / n) <= (t + X) / t)
break;
}
*stateptr = W;
}
return S;
}
* qsort comparator for sorting rows[] array
*/
int compare_rows(const void* a, const void* b)
{
HeapTuple ha = *(const HeapTuple*)a;
HeapTuple hb = *(const HeapTuple*)b;
BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
if (ba < bb)
return -1;
if (ba > bb)
return 1;
if (oa < ob)
return -1;
if (oa > ob)
return 1;
return 0;
}
* acquire_inherited_sample_rows -- acquire sample rows from inheritance tree
*
* This has the same API as acquire_sample_rows, except that rows are
* collected from all inheritance children as well as the specified table.
* We fail and return zero if there are no inheritance children.
*/
template <bool estimate_table_rownum>
static int64 acquire_inherited_sample_rows(
Relation onerel, int elevel, HeapTuple* rows, int64 targrows, double* totalrows, double* totaldeadrows)
{
List* tableOIDs = NIL;
Relation* rels = NULL;
double* relblocks = NULL;
double totalblocks = 0;
int64 numrows = 0;
int64 nrels = 0;
int64 i = 0;
ListCell* lc = NULL;
* Find all members of inheritance set. We only need AccessShareLock on
* the children.
*/
tableOIDs = find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL);
* Check that there's at least one descendant, else fail. This could
* happen despite analyze_rel's relhassubclass check, if table once had a
* child but no longer does. In that case, we can clear the
* relhassubclass field so as not to make the same mistake again later.
* (This is safe because we hold ShareUpdateExclusiveLock.)
*/
if (list_length(tableOIDs) < 2) {
CommandCounterIncrement();
SetRelationHasSubclass(RelationGetRelid(onerel), false);
return 0;
}
* Count the blocks in all the relations. The result could overflow
* BlockNumber, so we use double arithmetic.
*/
rels = (Relation*)palloc(list_length(tableOIDs) * sizeof(Relation));
relblocks = (double*)palloc(list_length(tableOIDs) * sizeof(double));
totalblocks = 0;
nrels = 0;
foreach (lc, tableOIDs) {
Oid childOID = lfirst_oid(lc);
Relation childrel;
childrel = heap_open(childOID, NoLock);
if (RELATION_IS_OTHER_TEMP(childrel)) {
AssertEreport(childrel != onerel, MOD_OPT, "");
heap_close(childrel, AccessShareLock);
continue;
}
rels[nrels] = childrel;
relblocks[nrels] = (double)RelationGetNumberOfBlocks(childrel);
totalblocks += relblocks[nrels];
nrels++;
}
* Now sample rows from each relation, proportionally to its fraction of
* the total block count. (This might be less than desirable if the child
* rels have radically different free-space percentages, but it's not
* clear that it's worth working harder.)
*/
numrows = 0;
*totalrows = 0;
*totaldeadrows = 0;
for (i = 0; i < nrels; i++) {
Relation childrel = rels[i];
double childblocks = relblocks[i];
if (childblocks > 0) {
int64 childtargrows;
childtargrows = (int)rint(targrows * childblocks / totalblocks);
childtargrows = Min(childtargrows, targrows - numrows);
if (childtargrows > 0) {
int64 childrows;
double trows, tdrows;
childrows = acquire_sample_rows<estimate_table_rownum>(
childrel, elevel, rows + numrows, childtargrows, &trows, &tdrows);
if (childrows > 0 && !equalTupleDescs(RelationGetDescr(childrel), RelationGetDescr(onerel))) {
TupleConversionMap* map = NULL;
map = convert_tuples_by_name(RelationGetDescr(childrel),
RelationGetDescr(onerel),
gettext_noop("could not convert row type"));
if (map != NULL) {
int64 j;
for (j = 0; j < childrows; j++) {
HeapTuple newtup;
newtup = do_convert_tuple(rows[numrows + j], map);
tableam_tops_free_tuple(rows[numrows + j]);
rows[numrows + j] = newtup;
}
free_conversion_map(map);
}
}
numrows += childrows;
*totalrows += trows;
*totaldeadrows += tdrows;
}
}
* Note: we cannot release the child-table locks, since we may have
* pointers to their TOAST tables in the sampled rows.
*/
heap_close(childrel, NoLock);
}
return numrows;
}
void PartitionGePartRelationList(Relation onerel, Relation partRel, List** partList)
{
if (RelationIsSubPartitioned(onerel)) {
List *subPartList = relationGetPartitionList(partRel, NoLock);
ListCell *lc = NULL;
foreach (lc, subPartList) {
Partition subPart = (Partition)lfirst(lc);
Relation subPartRel = partitionGetRelation(partRel, subPart);
*partList = lappend(*partList, subPartRel);
}
releasePartitionList(partRel, &subPartList, NoLock);
releaseDummyRelation(&partRel);
} else {
*partList = lappend(*partList, partRel);
}
}
* @@GaussDB@@
* Target : data partition
* Brief : acquire sample rows from partitions for a special partitioned
* : table
* Description : This has the same API as acquire_sample_rows, except that
* : rows are collected from all partitions.
* Notes :
*/
template <bool estimate_table_rownum>
static int64 acquirePartitionedSampleRows(Relation onerel, VacuumStmt* vacstmt, int elevel, HeapTuple* rows,
int64 targrows, double* totalrows, double* totaldeadrows, VacAttrStats** vacattrstats, int attrAnalyzeNum)
{
List* partList = NIL;
double* partBlocks = NULL;
double totalBlocks = 0;
int64 numRows = 0;
int nParts = 0;
ListCell* partCell = NULL;
int counter = 0;
Partition part = NULL;
Relation partRel = NULL;
MemoryContext old_context = NULL;
MemoryContext col_partition_analyze_context = NULL;
#ifdef ENABLE_MULTIPLE_NODES
if (RelationIsTsStore(onerel)) {
*totalrows = get_total_row_count(onerel);
return numRows;
}
#endif
if (RELATION_OWN_BUCKET(onerel)) {
* for table with hash buckets, we pretend each bucket
* as a parition and flatten all buckets into partList
*/
if (RelationIsPartitioned(onerel)) {
ListCell* partCell = NULL;
* for each range partion get the underlying bucket
* and save it in the flattened rpatList
*/
foreach (partCell, vacstmt->partList) {
List* partbuckets = NIL;
Partition temp_part = (Partition)lfirst(partCell);
if (RelationIsSubPartitioned(onerel)) {
partRel = partitionGetRelation(onerel, temp_part);
List *subPartList = relationGetPartitionList(partRel, NoLock);
ListCell *lc = NULL;
foreach (lc, subPartList) {
Partition subPart = (Partition)lfirst(lc);
partbuckets = relationGetBucketRelList(partRel, subPart);
partList = list_concat(partList, partbuckets);
}
releasePartitionList(partRel, &subPartList, NoLock);
releaseDummyRelation(&partRel);
} else {
partbuckets = relationGetBucketRelList(onerel, temp_part);
partList = list_concat(partList, partbuckets);
}
}
if (vacstmt->issubpartition) {
List* partbuckets = relationGetBucketRelList(vacstmt->onepartrel, vacstmt->onepart);
partList = list_concat(partList, partbuckets);
}
} else {
* for non-range parition table, only need to
* get the underlying hash bucket of the main table
*/
partList = relationGetBucketRelList(onerel, NULL);
}
} else {
foreach (partCell, vacstmt->partList) {
Assert(!vacstmt->issubpartition);
part = (Partition)lfirst(partCell);
partRel = partitionGetRelation(onerel, part);
PartitionGePartRelationList(onerel, partRel, &partList);
}
if (vacstmt->issubpartition) {
Assert(NIL == vacstmt->partList);
partList = lappend(partList, partitionGetRelation(vacstmt->onepartrel, vacstmt->onepart));
}
}
* Count the blocks in all the partitions. The result could overflow
* BlockNumber, so we use double arithmetic.
*/
partBlocks = (double*)palloc(list_length(partList) * sizeof(double));
totalBlocks = 0;
nParts = 0;
if (RelationIsColStore(onerel)) {
* If there are many partitions of column store table, cstorescan will palloc
* memory for VectorBatch, so we should create new memcontext for cstorescan,
* and reset for each partition.
*/
col_partition_analyze_context = AllocSetContextCreate(u_sess->analyze_cxt.analyze_context,
"ColPartitionAnalyze",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
old_context = MemoryContextSwitchTo(col_partition_analyze_context);
}
foreach (partCell, partList) {
partRel = (Relation)lfirst(partCell);
* We already got the needed lock
*/
if (!RelationIsColStore(onerel))
partBlocks[nParts] = (double)RelationGetNumberOfBlocks(partRel);
else {
Relation deltaRel = relation_open(partRel->rd_rel->reldeltarelid, AccessShareLock);
int16* colIdx = (int16*)palloc(sizeof(int16) * attrAnalyzeNum);
for (int i = 0; i < attrAnalyzeNum; ++i)
colIdx[i] = onerel->rd_att->attrs[vacattrstats[i]->tupattnum - 1].attnum;
CStoreScanDesc cstoreScanDesc = CStoreBeginScan(partRel, attrAnalyzeNum, colIdx, NULL, false);
partBlocks[nParts] = CStoreRelGetCUNumByNow(cstoreScanDesc);
CStoreEndScan(cstoreScanDesc);
partBlocks[nParts] += (double)RelationGetNumberOfBlocks(deltaRel);
relation_close(deltaRel, AccessShareLock);
MemoryContextReset(col_partition_analyze_context);
}
totalBlocks += partBlocks[nParts];
nParts++;
}
if (RelationIsColStore(onerel)) {
(void)MemoryContextSwitchTo(old_context);
MemoryContextDelete(col_partition_analyze_context);
}
* Now sample rows from each partition, proportionally to its fraction of
* the total block count. (This might be less than desirable if the
* partition have radically different free-space percentages, but it's not
* clear that it's worth working harder.)
*/
*totalrows = 0;
*totaldeadrows = 0;
partCell = list_head(partList);
for (counter = 0; counter < nParts; counter++) {
double partBlock = 0;
partBlock = partBlocks[counter];
partRel = (Relation)lfirst(partCell);
if (partBlock > 0) {
int64 parttargrows = (int64)ceil(targrows * partBlock / totalBlocks);
* Make sure we don't overrun due to roundoff error
*/
parttargrows = Min(parttargrows, targrows - numRows);
AssertEreport(parttargrows >= 0, MOD_OPT, "");
if (parttargrows > 0) {
int64 partrows = 0;
double trows = 0;
double tdrows = 0;
if (!RelationIsColStore(onerel))
partrows = acquire_sample_rows<estimate_table_rownum>(
partRel, elevel, rows + numRows, parttargrows, &trows, &tdrows);
else
partrows = AcquireSampleCStoreRows<estimate_table_rownum>(
partRel, elevel, rows + numRows, parttargrows, &trows, &tdrows, vacattrstats, attrAnalyzeNum);
numRows += partrows;
*totalrows += trows;
*totaldeadrows += tdrows;
* Remember to report sampled rows to stats collector. Otherwise, stats
* collector will not set new state for this partition and the partition
* ill always in a "need to be analyzed" state.
*/
if (!estimate_table_rownum)
pgstat_report_analyze(partRel, trows, tdrows);
}
}
partCell = lnext(partCell);
}
* clean up
*/
pfree_ext(partBlocks);
foreach (partCell, partList) {
partRel = (Relation)lfirst(partCell);
releaseDummyRelation(&partRel);
}
list_free_ext(partList);
return numRows;
}
void releaseSourceAfterDelteOrUpdateAttStats(
ResourceOwner asOwner, ResourceOwner oldOwner1, Relation pgstat, Relation pgstat_ext)
{
if (pgstat != NULL) {
heap_close(pgstat, RowExclusiveLock);
}
if (pgstat_ext != NULL) {
heap_close(pgstat_ext, RowExclusiveLock);
}
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, false);
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_LOCKS, false, false);
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_AFTER_LOCKS, false, false);
t_thrd.utils_cxt.CurrentResourceOwner = oldOwner1;
ResourceOwnerDelete(asOwner);
}
* update_attstats() -- update attribute statistics for one relation
*
* Statistics are stored in several places: the pg_class row for the
* relation has stats about the whole relation, and there is a
* pg_statistic row for each (non-system) attribute that has ever
* been analyzed. The pg_class values are updated by VACUUM, not here.
*
* pg_statistic rows are just added or updated normally. This means
* that pg_statistic will probably contain some deleted rows at the
* completion of a vacuum cycle, unless it happens to get vacuumed last.
*
* To keep things simple, we punt for pg_statistic, and don't try
* to compute or store rows for pg_statistic itself in pg_statistic.
* This could possibly be made to work, but it's not worth the trouble.
* Note analyze_rel() has seen to it that we won't come here when
* vacuuming pg_statistic itself.
*
* Note: there would be a race condition here if two backends could
* ANALYZE the same table concurrently. Presently, we lock that out
* by taking a self-exclusive lock on the relation in analyze_rel().
*
* Note: both single-column and extended statistics are process/inserted in this
* function
*/
void update_attstats(Oid relid, char relkind, bool inh, int natts, VacAttrStats** vacattrstats, char relpersistence)
{
Relation pgstat = NULL;
Relation pgstat_ext = NULL;
int attno;
ResourceOwner asOwner, oldOwner1;
MemoryContext oldcontext = CurrentMemoryContext;
if (natts <= 0) {
return;
}
* Create a resource owner to keep track of resources
* in order to release resources when catch the exception.
*/
asOwner = ResourceOwnerCreate(t_thrd.utils_cxt.CurrentResourceOwner, "update_stats",
THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_OPTIMIZER));
oldOwner1 = t_thrd.utils_cxt.CurrentResourceOwner;
t_thrd.utils_cxt.CurrentResourceOwner = asOwner;
for (attno = 0; attno < natts; attno++) {
VacAttrStats* stats = vacattrstats[attno];
if (!stats->stats_valid)
continue;
if (stats->num_attrs > 1) {
if (!pgstat_ext) {
pgstat_ext = heap_open(StatisticExtRelationId, RowExclusiveLock);
}
update_stats_catalog<false>(pgstat_ext, oldcontext, relid, relkind, inh, stats, natts, relpersistence);
} else {
if (!pgstat) {
pgstat = heap_open(StatisticRelationId, RowExclusiveLock);
}
update_stats_catalog<true>(pgstat, oldcontext, relid, relkind, inh, stats, natts, relpersistence);
if (t_thrd.proc->workingVersionNum >= STATISTIC_HISTORY_VERSION_NUMBER) {
InsertColumnStatisticHistory(relid, relkind, inh, stats);
}
}
}
releaseSourceAfterDelteOrUpdateAttStats(asOwner, oldOwner1, pgstat, pgstat_ext);
}
static ArrayType* es_construct_extended_statistics(VacAttrStats* stats, int k)
{
ArrayType *array = NULL;
if (STATISTIC_KIND_MCV == stats->stakind[k] || STATISTIC_KIND_NULL_MCV == stats->stakind[k]) {
array = es_construct_mcv_value_array(stats, k);
#ifndef ENABLE_MULTIPLE_NODES
} else if (STATISTIC_EXT_DEPENDENCIES == stats->stakind[k]) {
array = es_construct_dependency_value_array(stats, k);
#endif
} else {
array = NULL;
}
return array;
}
* update_single_column_attrstats --- insert a single-column statistic entry into pg_statistic
*
* @param(in) pgstat
* relation handler for pg_statistic
* @param(in) oldcontext
* memory context for caller function
* @param(in) relid
* relation oid for being analyzed relation
* @param(in) relkind
* relation kind for being analyzed relation
* @param(in) inh
* relation's inh value for being anlyzed relation
* @param(in) stats
* column-level statistic that will be insert into pg_statistic
*
* @return: void
*/
template <bool isSingleColumn>
static void update_stats_catalog(
Relation pgstat, MemoryContext oldcontext, Oid relid, char relkind, bool inh, VacAttrStats* stats, int natts,
char relpersistence)
{
HeapTuple stup, oldtup;
int i, k, n;
Assert(Natts_pg_statistic >= Natts_pg_statistic_ext);
Datum values[Natts_pg_statistic];
bool nulls[Natts_pg_statistic];
bool replaces[Natts_pg_statistic];
* Construct a new pg_statistic(_ext) tuple
*/
for (i = 0; i < Natts_pg_statistic; ++i) {
nulls[i] = false;
replaces[i] = true;
}
AttrNumber attnum = 0;
if (isSingleColumn) {
attnum = stats->attrs[0]->attnum;
values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(relid);
values[Anum_pg_statistic_starelkind - 1] = CharGetDatum(relkind);
values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(attnum);
values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inh);
values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
values[Anum_pg_statistic_stadndistinct - 1] = Float4GetDatum(stats->stadndistinct);
values[Anum_pg_statistic_staextinfo - 1] = 0;
nulls[Anum_pg_statistic_staextinfo - 1] = true;
} else {
values[Anum_pg_statistic_ext_starelid - 1] = ObjectIdGetDatum(relid);
values[Anum_pg_statistic_ext_starelkind - 1] = CharGetDatum(relkind);
values[Anum_pg_statistic_ext_stainherit - 1] = BoolGetDatum(inh);
values[Anum_pg_statistic_ext_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
values[Anum_pg_statistic_ext_stawidth - 1] = Int32GetDatum(stats->stawidth);
values[Anum_pg_statistic_ext_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
values[Anum_pg_statistic_ext_stadndistinct - 1] = Float4GetDatum(stats->stadndistinct);
values[Anum_pg_statistic_ext_staexprs - 1] = (Datum)0;
nulls[Anum_pg_statistic_ext_staexprs - 1] = true;
}
Assert(Anum_pg_statistic_stakind1 == Anum_pg_statistic_ext_stakind1);
i = Anum_pg_statistic_stakind1 - 1;
for (k = 0; k < STATISTIC_NUM_SLOTS; k++) {
values[i++] = Int16GetDatum(stats->stakind[k]);
}
Assert(Anum_pg_statistic_staop1 == Anum_pg_statistic_ext_staop1);
i = Anum_pg_statistic_staop1 - 1;
for (k = 0; k < STATISTIC_NUM_SLOTS; k++) {
values[i++] = ObjectIdGetDatum(stats->staop[k]);
}
if (isSingleColumn) {
i = Anum_pg_statistic_stanumbers1 - 1;
} else {
i = Anum_pg_statistic_ext_stanumbers1 - 1;
}
for (k = 0; k < STATISTIC_NUM_SLOTS; k++) {
int nnum = stats->numnumbers[k];
if (nnum > 0) {
Datum* numdatums = (Datum*)palloc(nnum * sizeof(Datum));
ArrayType* arry = NULL;
for (n = 0; n < nnum; n++)
numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
arry = construct_array(numdatums, nnum, FLOAT4OID, sizeof(float4), FLOAT4PASSBYVAL, 'i');
values[i++] = PointerGetDatum(arry);
} else {
nulls[i] = true;
values[i++] = (Datum)0;
}
}
if (isSingleColumn) {
i = Anum_pg_statistic_stavalues1 - 1;
} else {
i = Anum_pg_statistic_ext_stavalues1 - 1;
}
for (k = 0; k < STATISTIC_NUM_SLOTS; k++) {
if (stats->numvalues[k] > 0) {
if (isSingleColumn) {
ArrayType* array = construct_array(stats->stavalues[k],
stats->numvalues[k],
stats->statypid[k],
stats->statyplen[k],
stats->statypbyval[k],
stats->statypalign[k]);
values[i++] = PointerGetDatum(array);
} else {
ArrayType* array = es_construct_extended_statistics(stats, k);
if (array) {
values[i++] = PointerGetDatum(array);
} else {
nulls[i] = true;
values[i++] = (Datum)0;
ereport(ERROR,
(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
errmsg("Multi-columns statistic does not support hist/corr/mcelem/dechist")));
}
}
} else {
nulls[i] = true;
values[i++] = (Datum)0;
}
}
int2vector* stakey = NULL;
if (isSingleColumn == false) {
Assert(stats->num_attrs > 1);
stakey = buildint2vector(NULL, stats->num_attrs);
for (unsigned i = 0; i < stats->num_attrs; i++) {
stakey->values[i] = stats->attrs[i]->attnum;
}
values[Anum_pg_statistic_ext_stakey - 1] = PointerGetDatum(stakey);
}
if (relpersistence == RELPERSISTENCE_GLOBAL_TEMP) {
up_gtt_att_statistic(relid,
attnum,
natts,
RelationGetDescr(pgstat),
values,
nulls);
return;
}
PG_TRY();
{
if (isSingleColumn) {
oldtup = SearchSysCache4(STATRELKINDATTINH,
ObjectIdGetDatum(relid),
CharGetDatum(relkind),
Int16GetDatum(attnum),
BoolGetDatum(inh));
} else {
oldtup = SearchSysCache4(STATRELKINDKEYINH,
ObjectIdGetDatum(relid),
CharGetDatum(relkind),
BoolGetDatum(inh),
values[Anum_pg_statistic_ext_stakey - 1]);
}
if (HeapTupleIsValid(oldtup)) {
stup = heap_modify_tuple(oldtup, RelationGetDescr(pgstat), values, nulls, replaces);
ReleaseSysCache(oldtup);
simple_heap_update(pgstat, &stup->t_self, stup);
} else {
stup = heap_form_tuple(RelationGetDescr(pgstat), values, nulls);
(void)simple_heap_insert(pgstat, stup);
}
CatalogUpdateIndexes(pgstat, stup);
}
PG_CATCH();
{
if (isSingleColumn) {
analyze_concurrency_process(relid, attnum, oldcontext, PG_FUNCNAME_MACRO);
} else {
analyze_concurrency_process(relid, ES_MULTI_COLUMN_STATS_ATTNUM, oldcontext, PG_FUNCNAME_MACRO);
}
}
PG_END_TRY();
if (isSingleColumn == false) {
pfree_ext(stakey);
}
tableam_tops_free_tuple(stup);
}
* delete_attstats() -- delete attribute statistics for one relation if
* there has no reltuples now and there has reltuples before.
* @in relid - the relation id which we will do analyze
* @in relkind - 'c': starelid ref pg_class.oid; 'p': starelid ref pg_partition.oid
* @in inh - true if inheritance children are included
* @in natts - how many column the relation have
* @in vacattrstats - the statistic info for all the attributes of the relation
* @return: void
*/
void delete_attstats(
Oid relid, char relkind, bool inh, int natts, VacAttrStats** vacattrstats, unsigned int delete_stats_option)
{
int attno;
ResourceOwner asOwner, oldOwner1;
MemoryContext oldcontext = CurrentMemoryContext;
Relation pgstat = NULL;
Relation pgstat_ext = NULL;
if (natts <= 0) {
return;
}
* Create a resource owner to keep track of resources
* in order to release resources when catch the exception.
*/
asOwner = ResourceOwnerCreate(t_thrd.utils_cxt.CurrentResourceOwner, "delete_stats",
THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_OPTIMIZER));
oldOwner1 = t_thrd.utils_cxt.CurrentResourceOwner;
t_thrd.utils_cxt.CurrentResourceOwner = asOwner;
for (attno = 0; attno < natts; attno++) {
VacAttrStats* stats = vacattrstats[attno];
bool multicol_stats = false;
if ((!(delete_stats_option & DELETE_STATS_MULTI)) && (stats->num_attrs > 1)) {
continue;
}
int16 attnum = stats->attrs[0]->attnum;
if (stats->num_attrs > 1) {
attnum = ES_MULTI_COLUMN_STATS_ATTNUM;
multicol_stats = true;
}
HeapTuple oldtup = NULL;
if (multicol_stats) {
if (!pgstat_ext) {
pgstat_ext = heap_open(StatisticExtRelationId, RowExclusiveLock);
}
int2vector* stakey = NULL;
stakey = buildint2vector(NULL, stats->num_attrs);
for (unsigned i = 0; i < stats->num_attrs; i++) {
stakey->values[i] = stats->attrs[i]->attnum;
}
oldtup = SearchSysCache4(STATRELKINDKEYINH,
ObjectIdGetDatum(relid),
CharGetDatum(relkind),
BoolGetDatum(inh),
PointerGetDatum(stakey));
if (stats->num_attrs > 0) {
Bitmapset* bms_attnums = bms_make_singleton(stats->attrs[0]->attnum);
for (unsigned int i = 1; i < stats->num_attrs; i++) {
bms_attnums = bms_add_member(bms_attnums, stats->attrs[i]->attnum);
}
const char* model_name_base = "BAYESNET_MODEL_STATS";
StringInfoData model_name;
initStringInfo(&model_name);
appendStringInfo(&model_name, "%s_%d_%d", model_name_base, relid, bms_hash_value(bms_attnums));
remove_model_from_cache_and_disk(model_name.data);
pfree_ext(model_name.data);
}
pfree_ext(stakey);
} else {
if (!pgstat) {
pgstat = heap_open(StatisticRelationId, RowExclusiveLock);
}
oldtup = SearchSysCache4(STATRELKINDATTINH,
ObjectIdGetDatum(relid),
CharGetDatum(relkind),
Int16GetDatum(attnum),
BoolGetDatum(inh));
}
if (HeapTupleIsValid(oldtup)) {
Relation sd = NULL;
if (multicol_stats) {
sd = pgstat_ext;
} else {
if (!pgstat) {
pgstat = heap_open(StatisticRelationId, RowExclusiveLock);
}
sd = pgstat;
}
PG_TRY();
{
simple_heap_delete(sd, &oldtup->t_self);
}
PG_CATCH();
{
analyze_concurrency_process(relid, attnum, oldcontext, PG_FUNCNAME_MACRO);
}
PG_END_TRY();
ReleaseSysCache(oldtup);
}
}
releaseSourceAfterDelteOrUpdateAttStats(asOwner, oldOwner1, pgstat, pgstat_ext);
}
* delete_attstats_replication() -- delete attribute statistics for one replication if
* there has no reltuples now and there has reltuples before.
* @in relid - the relation id which we will do analyze
* @in stmt - the statment for analyze or vacuum command
* @return: void
*/
void delete_attstats_replication(Oid relid, VacuumStmt* stmt)
{
int attr_cnt = 0;
int nindexes = 0;
AnlIndexData* indexdata = NULL;
bool hasindex = false;
VacAttrStats** vacattrstats = NULL;
Relation* Irel = NULL;
Relation rel = NULL;
rel = relation_open(relid, AccessShareLock);
vacattrstats = get_vacattrstats_by_vacstmt(rel, stmt, &attr_cnt, &nindexes, &indexdata, &hasindex, false, &Irel);
* Emit the completed stats rows into pg_statistic, deleting any
* previous statistics for the target columns.
*/
delete_attstats(relid, STARELKIND_CLASS, false, attr_cnt, vacattrstats);
* we should delete statistic of complex table
* if statistic of hdfs has deleted and all the table's totalrows are 0.
*/
if ((stmt->pstGlobalStatEx[ANALYZEDELTA - 1].totalRowCnts) == 0 &&
(rel->rd_rel->relhassubclass || RelationIsPAXFormat(rel)))
delete_attstats(relid, STARELKIND_CLASS, true, attr_cnt, vacattrstats);
for (int i = 0; i < nindexes; i++) {
AnlIndexData* thisdata = &indexdata[i];
delete_attstats(RelationGetRelid(Irel[i]), STARELKIND_CLASS, false, thisdata->attr_cnt, thisdata->vacattrstats);
}
vac_close_indexes(nindexes, Irel, NoLock);
relation_close(rel, AccessShareLock);
}
* Standard fetch function for use by compute_stats subroutines.
*
* This exists to provide some insulation between compute_stats routines
* and the actual storage of the sample data.
*/
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool* isNull, Relation rel)
{
int attnum = stats->tupattnum;
HeapTuple tuple = stats->rows[rownum];
TupleDesc tupDesc = rel->rd_att;
return tableam_tops_tuple_getattr(tuple, attnum, tupDesc, isNull);
}
* Fetch function for analyzing index expressions.
*
* We have not bothered to construct index tuples, instead the data is
* just in Datum arrays.
*/
static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool* isNull, Relation rel)
{
int i;
i = rownum * stats->rowstride;
*isNull = stats->exprnulls[i];
return stats->exprvals[i];
}
*
* Code below this point represents the "standard" type-specific statistics
* analysis algorithms. This code can be replaced on a per-data-type basis
* by setting a nonzero value in pg_type.typanalyze.
*
*==========================================================================
*/
* To avoid consuming too much memory during analysis and/or too much space
* in the resulting pg_statistic rows, we ignore varlena datums that are wider
* than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
* and distinct-value calculations since a wide value is unlikely to be
* duplicated at all, much less be a most-common value. For the same reason,
* ignoring wide values will not affect our estimates of histogram bin
* boundaries very much.
*/
#define WIDTH_THRESHOLD 1024
#define swapInt(a, b) \
do { \
int _tmp; \
_tmp = a; \
a = b; \
b = _tmp; \
} while (0)
#define swapDatum(a, b) \
do { \
Datum _tmp; \
_tmp = a; \
a = b; \
b = _tmp; \
} while (0)
* Extra information used by the default analysis routines
*/
typedef struct {
Oid eqopr;
Oid eqfunc;
Oid ltopr;
} StdAnalyzeData;
typedef struct {
Datum value;
int tupno;
} ScalarItem;
typedef struct {
int count;
int first;
} ScalarMCVItem;
typedef struct {
SortSupport ssup;
int* tupnoLink;
} CompareScalarsContext;
static void compute_minimal_stats(
VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows, Relation rel);
static void compute_scalar_stats(
VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows, Relation rel);
static int compare_scalars(const void* a, const void* b, void* arg);
static int compare_mcvs(const void* a, const void* b);
static void analyze_compute_ndistinct(
int64 nmultiple, double ndistinct, int64 toowide_cnt, int64 samplerows, double totalrows, VacAttrStatsP stats);
static void analyze_compute_correlation(
int64 values_cnt, double corr_xysum, double corrs, VacAttrStatsP stats, int slot_idx);
static void analyze_compute_mcv(
int* slot_idx, const char* tableName, AnalyzeSampleTableSpecInfo* spec, bool process_null = false);
static double compute_mcv_mincount(int64 samplerows, double ndistinct, double num_bins);
* @Description: Compute target number of statistics datapoints to collect
* during VACUUM ANALYZE of this column according to default_statistics_target.
*
* @in attr - a tuple with the format of pg_attribute relation.
* @return - int (target number of statistics)
*/
int compute_attr_target(Form_pg_attribute attr)
{
int att_target = attr->attstattarget;
* If the attstattarget column is -1, use the default value;
* If the attstattarget column is less -1 or default_statistics_target is negtive, use the percent value.
* NB: it is okay to scribble on stats->attr since it's a copy.
*/
if (IS_PGXC_DATANODE) {
if (attr->attstattarget < 0 && default_statistics_target > 0) {
if (attr->attstattarget == -1)
attr->attstattarget = default_statistics_target;
att_target = default_statistics_target;
} else if (attr->attstattarget < 0 && default_statistics_target <= 0)
att_target = DEFAULT_ATTR_TARGET;
else
att_target = attr->attstattarget;
} else if (attr->attstattarget < 0) {
if (default_statistics_target < 0)
attr->attstattarget = -default_statistics_target;
else
attr->attstattarget = default_statistics_target;
* default_statistics_target < 0: We have give a specific mcv attstattarget is 100,
* and histgram attstattarget is 301 for normal table. For text search type or index data,
* attstattarget is -default_statistics_target may be few, so we should adjust it.
*/
attr->attstattarget = Max(attr->attstattarget, DEFAULT_ATTR_TARGET);
att_target = attr->attstattarget;
}
return att_target;
}
* std_typanalyze -- the default type-specific typanalyze function
*/
bool std_typanalyze(VacAttrStats* stats)
{
Form_pg_attribute attr = stats->attrs[0];
Oid ltopr;
Oid eqopr;
StdAnalyzeData* mystats = NULL;
int att_target;
att_target = compute_attr_target(attr);
get_sort_group_operators(stats->attrtypid[0], false, false, false, <opr, &eqopr, NULL, NULL);
if (!OidIsValid(eqopr))
return false;
mystats = (StdAnalyzeData*)palloc(sizeof(StdAnalyzeData));
mystats->eqopr = eqopr;
mystats->eqfunc = get_opcode(eqopr);
mystats->ltopr = ltopr;
stats->extra_data = mystats;
* Determine which standard statistics algorithm to use
*/
if (OidIsValid(ltopr)) {
stats->compute_stats = compute_scalar_stats;
* The following choice of minrows is based on the paper
* "Random sampling for histogram construction: how much is enough?"
* by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
* Proceedings of ACM SIGMOD International Conference on Management
* of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
* says that for table size n, histogram size k, maximum relative
* error in bin size f, and error probability gamma, the minimum
* random sample size is
* r = 4 * k * ln(2*n/gamma) / f^2
* Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
* r = 305.82 * k
* Note that because of the log function, the dependence on n is
* quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
* bin size error with probability 0.99. So there's no real need to
* scale for n, which is a good thing because we don't necessarily
* know it at this point.
* --------------------
*/
stats->minrows = DEFAULT_EST_TARGET_ROWS * att_target;
} else {
stats->compute_stats = compute_minimal_stats;
stats->minrows = DEFAULT_EST_TARGET_ROWS * att_target;
}
return true;
}
* compute_minimal_stats() -- compute minimal column statistics
*
* We use this when we can find only an "=" operator for the datatype.
*
* We determine the fraction of non-null rows, the average width, the
* most common values, and the (estimated) number of distinct values.
*
* The most common values are determined by brute force: we keep a list
* of previously seen values, ordered by number of times seen, as we scan
* the samples. A newly seen value is inserted just after the last
* multiply-seen value, causing the bottommost (oldest) singly-seen value
* to drop off the list. The accuracy of this method, and also its cost,
* depend mainly on the length of the list we are willing to keep.
*/
static void compute_minimal_stats(
VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows, Relation rel)
{
int i;
int null_cnt = 0;
int nonnull_cnt = 0;
int toowide_cnt = 0;
double total_width = 0;
bool is_varlena = (!stats->attrtype[0]->typbyval && stats->attrtype[0]->typlen == -1);
bool is_varwidth = (!stats->attrtype[0]->typbyval && stats->attrtype[0]->typlen < 0);
FmgrInfo f_cmpeq;
typedef struct {
Datum value;
int count;
} TrackItem;
TrackItem* track = NULL;
int track_cnt, track_max;
int num_mcv = stats->attrs[0]->attstattarget;
StdAnalyzeData* mystats = (StdAnalyzeData*)stats->extra_data;
* We track up to 2*n values for an n-element MCV list; but at least 10
*/
track_max = 2 * num_mcv;
if (track_max < 10) {
track_max = 10;
}
track = (TrackItem*)palloc(track_max * sizeof(TrackItem));
track_cnt = 0;
fmgr_info(mystats->eqfunc, &f_cmpeq);
for (i = 0; i < samplerows; i++) {
Datum value;
bool isnull = false;
bool match = false;
int firstcount1, j;
vacuum_delay_point();
value = fetchfunc(stats, i, &isnull, rel);
if (isnull) {
null_cnt++;
continue;
}
nonnull_cnt++;
* If it's a variable-width field, add up widths for average width
* calculation. Note that if the value is toasted, we use the toasted
* width. We don't bother with this calculation if it's a fixed-width
* type.
*/
if (is_varlena) {
total_width += VARSIZE_ANY(DatumGetPointer(value));
* If the value is toasted, we want to detoast it just once to
* avoid repeated detoastings and resultant excess memory usage
* during the comparisons. Also, check to see if the value is
* excessively wide, and if so don't detoast at all --- just
* ignore the value.
*/
if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) {
toowide_cnt++;
continue;
}
value = PointerGetDatum(PG_DETOAST_DATUM(value));
} else if (is_varwidth) {
total_width += strlen(DatumGetCString(value)) + 1;
}
* See if the value matches anything we're already tracking.
*/
match = false;
firstcount1 = track_cnt;
for (j = 0; j < track_cnt; j++) {
if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, DEFAULT_COLLATION_OID, value, track[j].value))) {
match = true;
break;
}
if (j < firstcount1 && track[j].count == 1) {
firstcount1 = j;
}
}
if (match) {
track[j].count++;
while (j > 0 && track[j].count > track[j - 1].count) {
swapDatum(track[j].value, track[j - 1].value);
swapInt(track[j].count, track[j - 1].count);
j--;
}
} else {
if (track_cnt < track_max) {
track_cnt++;
}
for (j = track_cnt - 1; j > firstcount1; j--) {
track[j].value = track[j - 1].value;
track[j].count = track[j - 1].count;
}
if (firstcount1 < track_cnt) {
track[firstcount1].value = value;
track[firstcount1].count = 1;
}
}
}
if (nonnull_cnt > 0) {
int nmultiple, summultiple;
stats->stats_valid = true;
stats->stanullfrac = (double)null_cnt / (double)samplerows;
if (is_varwidth) {
stats->stawidth = total_width / (double)nonnull_cnt;
} else {
stats->stawidth = stats->attrtype[0]->typlen;
}
summultiple = 0;
for (nmultiple = 0; nmultiple < track_cnt; nmultiple++) {
if (track[nmultiple].count == 1) {
break;
}
summultiple += track[nmultiple].count;
}
if (nmultiple == 0) {
* If we found no repeated non-null values, assume it's a unique
* column; but be sure to discount for any nulls we found.
*/
stats->stadistinct = -1.0 * (1 - stats->stanullfrac);
} else if (track_cnt < track_max && toowide_cnt == 0 && nmultiple == track_cnt) {
* Our track list includes every value in the sample, and every
* value appeared more than once. Assume the column has just
* these values.
*/
stats->stadistinct = track_cnt;
} else {
* Estimate the number of distinct values using the estimator
* proposed by Haas and Stokes in IBM Research Report RJ 10025:
* n*d / (n - f1 + f1*n/N)
* where f1 is the number of distinct values that occurred
* exactly once in our sample of n rows (from a total of N),
* and d is the total number of distinct values in the sample.
* This is their Duj1 estimator; the other estimators they
* recommend are considerably more complex, and are numerically
* very unstable when n is much smaller than N.
*
* We assume (not very reliably!) that all the multiply-occurring
* values are reflected in the final track[] list, and the other
* nonnull values all appeared but once. (XXX this usually
* results in a drastic overestimate of ndistinct. Can we do
* any better?)
* ----------
*/
int f1 = nonnull_cnt - summultiple;
int d = f1 + nmultiple;
double numer, denom, stadistinct;
numer = (double)samplerows * (double)d;
denom = (double)(samplerows - f1) + (double)f1 * (double)samplerows / totalrows;
stadistinct = numer / denom;
if (stadistinct < (double)d) {
stadistinct = (double)d;
}
if (stadistinct > totalrows) {
stadistinct = totalrows;
}
stats->stadistinct = floor(stadistinct + 0.5);
}
* If we estimated the number of distinct values at more than 10% of
* the total row count (a very arbitrary limit), then assume that
* stadistinct should scale with the row count rather than be a fixed
* value.
*/
if (stats->stadistinct > 0.1 * totalrows)
stats->stadistinct = -(stats->stadistinct / totalrows);
* Decide how many values are worth storing as most-common values. If
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
* significantly more common than the (estimated) average. We set the
* threshold rather arbitrarily at 25% more than average, with at
* least 2 instances in the sample.
*/
if (track_cnt < track_max && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) {
num_mcv = track_cnt;
} else {
double ndistinct = stats->stadistinct;
double avgcount, mincount;
if (ndistinct < 0) {
ndistinct = -ndistinct * totalrows;
}
avgcount = (double)samplerows / ndistinct;
mincount = avgcount * 1.25;
if (mincount < 2) {
mincount = 2;
}
if (num_mcv > track_cnt) {
num_mcv = track_cnt;
}
for (i = 0; i < num_mcv; i++) {
if (track[i].count < mincount) {
num_mcv = i;
break;
}
}
}
if (num_mcv > 0) {
MemoryContext old_context;
Datum* mcv_values = NULL;
float4* mcv_freqs = NULL;
old_context = MemoryContextSwitchTo(stats->anl_context);
mcv_values = (Datum*)palloc(num_mcv * sizeof(Datum));
mcv_freqs = (float4*)palloc(num_mcv * sizeof(float4));
for (i = 0; i < num_mcv; i++) {
mcv_values[i] = datumCopy(track[i].value, stats->attrtype[0]->typbyval, stats->attrtype[0]->typlen);
mcv_freqs[i] = (double)track[i].count / (double)samplerows;
}
(void)MemoryContextSwitchTo(old_context);
stats->stakind[0] = STATISTIC_KIND_MCV;
stats->staop[0] = mystats->eqopr;
stats->stanumbers[0] = mcv_freqs;
stats->numnumbers[0] = num_mcv;
stats->stavalues[0] = mcv_values;
stats->numvalues[0] = num_mcv;
* Accept the defaults for stats->statypid and others. They have
* been set before we were called (see vacuum.h)
*/
}
} else if (null_cnt > 0) {
stats->stats_valid = true;
stats->stanullfrac = 1.0;
if (is_varwidth)
stats->stawidth = 0;
else
stats->stawidth = stats->attrtype[0]->typlen;
stats->stadistinct = 0.0;
}
}
* compute_scalar_stats() -- compute column statistics
*
* We use this when we can find "=" and "<" operators for the datatype.
*
* We determine the fraction of non-null rows, the average width, the
* most common values, the (estimated) number of distinct values, the
* distribution histogram, and the correlation of physical to logical order.
*
* The desired stats can be determined fairly easily after sorting the
* data values into order.
*/
static void compute_scalar_stats(
VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows, Relation rel)
{
int i;
int null_cnt = 0;
int nonnull_cnt = 0;
int toowide_cnt = 0;
double total_width = 0;
bool is_varlena = (!stats->attrtype[0]->typbyval && stats->attrtype[0]->typlen == -1);
bool is_varwidth = (!stats->attrtype[0]->typbyval && stats->attrtype[0]->typlen < 0);
double corr_xysum;
SortSupportData ssup;
ScalarItem* values = NULL;
int values_cnt = 0;
int* tupnoLink = NULL;
ScalarMCVItem* track = NULL;
int track_cnt = 0;
int num_mcv = Min(stats->attrs[0]->attstattarget, MAX_ATTR_MCV_STAT_TARGET);
int num_bins = Min(stats->attrs[0]->attstattarget, MAX_ATTR_HIST_STAT_TARGET);
StdAnalyzeData* mystats = (StdAnalyzeData*)stats->extra_data;
values = (ScalarItem*)palloc(samplerows * sizeof(ScalarItem));
tupnoLink = (int*)palloc(samplerows * sizeof(int));
track = (ScalarMCVItem*)palloc(num_mcv * sizeof(ScalarMCVItem));
errno_t rc = memset_s(&ssup, sizeof(ssup), 0, sizeof(ssup));
securec_check(rc, "", "");
ssup.ssup_cxt = CurrentMemoryContext;
ssup.ssup_collation = DEFAULT_COLLATION_OID;
ssup.ssup_nulls_first = false;
* For now, don't perform abbreviated key conversion, because full values
* are required for MCV slot generation. Supporting that optimization
* would necessitate teaching compare_scalars() to call a tie-breaker.
*/
ssup.abbreviate = false;
PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
for (i = 0; i < samplerows; i++) {
Datum value;
bool isnull = false;
vacuum_delay_point();
value = fetchfunc(stats, i, &isnull, rel);
if (isnull) {
null_cnt++;
continue;
}
nonnull_cnt++;
* If it's a variable-width field, add up widths for average width
* calculation. Note that if the value is toasted, we use the toasted
* width. We don't bother with this calculation if it's a fixed-width
* type.
*/
if (is_varlena) {
total_width += VARSIZE_ANY(DatumGetPointer(value));
* If the value is toasted, we want to detoast it just once to
* avoid repeated detoastings and resultant excess memory usage
* during the comparisons. Also, check to see if the value is
* excessively wide, and if so don't detoast at all --- just
* ignore the value.
*/
if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) {
toowide_cnt++;
continue;
}
value = PointerGetDatum(PG_DETOAST_DATUM(value));
} else if (is_varwidth) {
total_width += strlen(DatumGetCString(value)) + 1;
}
values[values_cnt].value = value;
values[values_cnt].tupno = values_cnt;
tupnoLink[values_cnt] = values_cnt;
values_cnt++;
}
if (values_cnt > 0) {
int ndistinct,
nmultiple,
num_hist, dups_cnt;
int slot_idx = 0;
CompareScalarsContext cxt;
cxt.ssup = &ssup;
cxt.tupnoLink = tupnoLink;
qsort_arg((void*)values, values_cnt, sizeof(ScalarItem), compare_scalars, (void*)&cxt);
* Now scan the values in order, find the most common ones, and also
* accumulate ordering-correlation statistics.
*
* To determine which are most common, we first have to count the
* number of duplicates of each value. The duplicates are adjacent in
* the sorted list, so a brute-force approach is to compare successive
* datum values until we find two that are not equal. However, that
* requires N-1 invocations of the datum comparison routine, which are
* completely redundant with work that was done during the sort. (The
* sort algorithm must at some point have compared each pair of items
* that are adjacent in the sorted order; otherwise it could not know
* that it's ordered the pair correctly.) We exploit this by having
* compare_scalars remember the highest tupno index that each
* ScalarItem has been found equal to. At the end of the sort, a
* ScalarItem's tupnoLink will still point to itself if and only if it
* is the last item of its group of duplicates (since the group will
* be ordered by tupno).
*/
corr_xysum = 0;
ndistinct = 0;
nmultiple = 0;
dups_cnt = 0;
for (i = 0; i < values_cnt; i++) {
int tupno = values[i].tupno;
corr_xysum += ((double)i) * ((double)tupno);
dups_cnt++;
if (tupnoLink[tupno] == tupno) {
ndistinct++;
if (dups_cnt > 1) {
nmultiple++;
if (track_cnt < num_mcv || dups_cnt > track[track_cnt - 1].count) {
* Found a new item for the mcv list; find its
* position, bubbling down old items if needed. Loop
* invariant is that j points at an empty/ replaceable
* slot.
*/
int j;
if (track_cnt < num_mcv) {
track_cnt++;
}
for (j = track_cnt - 1; j > 0; j--) {
if (dups_cnt <= track[j - 1].count) {
break;
}
track[j].count = track[j - 1].count;
track[j].first = track[j - 1].first;
}
track[j].count = dups_cnt;
track[j].first = i + 1 - dups_cnt;
}
}
dups_cnt = 0;
}
}
stats->stats_valid = true;
stats->stanullfrac = (double)null_cnt / (double)samplerows;
if (is_varwidth) {
stats->stawidth = total_width / (double)nonnull_cnt;
} else {
stats->stawidth = stats->attrtype[0]->typlen;
}
analyze_compute_ndistinct(nmultiple, ndistinct, toowide_cnt, samplerows, totalrows, stats);
* Decide how many values are worth storing as most-common values. If
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
* significantly more common than the (estimated) average. We set the
* threshold rather arbitrarily at 25% more than average, with at
* least 2 instances in the sample. Also, we won't suppress values
* that have a frequency of at least 1/K where K is the intended
* number of histogram bins; such values might otherwise cause us to
* emit duplicate histogram bin boundaries. (We might end up with
* duplicate histogram entries anyway, if the distribution is skewed;
* but we prefer to treat such values as MCVs if at all possible.)
*/
if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 && track_cnt <= num_mcv) {
num_mcv = track_cnt;
} else {
double stadistinct = stats->stadistinct;
double mincount;
if (stadistinct < 0) {
stadistinct = -stadistinct * totalrows;
}
mincount = compute_mcv_mincount(samplerows, stadistinct, num_bins);
if (num_mcv > track_cnt) {
num_mcv = track_cnt;
}
for (i = 0; i < num_mcv; i++) {
if (track[i].count < mincount) {
num_mcv = i;
break;
}
}
}
if (num_mcv > 0) {
MemoryContext old_context;
Datum* mcv_values = NULL;
float4* mcv_freqs = NULL;
old_context = MemoryContextSwitchTo(stats->anl_context);
mcv_values = (Datum*)palloc(num_mcv * sizeof(Datum));
mcv_freqs = (float4*)palloc(num_mcv * sizeof(float4));
for (i = 0; i < num_mcv; i++) {
mcv_values[i] =
datumCopy(values[track[i].first].value, stats->attrtype[0]->typbyval, stats->attrtype[0]->typlen);
mcv_freqs[i] = (double)track[i].count / (double)samplerows;
}
(void)MemoryContextSwitchTo(old_context);
stats->stakind[slot_idx] = STATISTIC_KIND_MCV;
stats->staop[slot_idx] = mystats->eqopr;
stats->stanumbers[slot_idx] = mcv_freqs;
stats->numnumbers[slot_idx] = num_mcv;
stats->stavalues[slot_idx] = mcv_values;
stats->numvalues[slot_idx] = num_mcv;
* Accept the defaults for stats->statypid and others. They have
* been set before we were called (see vacuum.h)
*/
slot_idx++;
}
* Generate a histogram slot entry if there are at least two distinct
* values not accounted for in the MCV list. (This ensures the
* histogram won't collapse to empty or a singleton.)
*/
num_hist = ndistinct - num_mcv;
if (num_hist > num_bins) {
num_hist = num_bins + 1;
}
if (num_hist >= 2) {
MemoryContext old_context;
Datum* hist_values = NULL;
int nvals;
int pos, posfrac, delta, deltafrac;
qsort((void*)track, num_mcv, sizeof(ScalarMCVItem), compare_mcvs);
* Collapse out the MCV items from the values[] array.
*
* Note we destroy the values[] array here... but we don't need it
* for anything more. We do, however, still need values_cnt.
* nvals will be the number of remaining entries in values[].
*/
if (num_mcv > 0) {
int src, dest;
int j;
src = dest = 0;
j = 0;
while (src < values_cnt) {
int ncopy;
if (j < num_mcv) {
int first = track[j].first;
if (src >= first) {
src = first + track[j].count;
j++;
continue;
}
ncopy = first - src;
} else {
ncopy = values_cnt - src;
}
rc = memmove_s(&values[dest], ncopy * sizeof(ScalarItem), &values[src], ncopy * sizeof(ScalarItem));
securec_check(rc, "", "");
src += ncopy;
dest += ncopy;
}
nvals = dest;
} else {
nvals = values_cnt;
}
AssertEreport(nvals >= num_hist, MOD_OPT, "");
old_context = MemoryContextSwitchTo(stats->anl_context);
hist_values = (Datum*)palloc(num_hist * sizeof(Datum));
* The object of this loop is to copy the first and last values[]
* entries along with evenly-spaced values in between. So the
* i'th value is values[(i * (nvals - 1)) / (num_hist - 1)]. But
* computing that subscript directly risks integer overflow when
* the stats target is more than a couple thousand. Instead we
* add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
* the integral and fractional parts of the sum separately.
*/
delta = (nvals - 1) / (num_hist - 1);
deltafrac = (nvals - 1) % (num_hist - 1);
pos = posfrac = 0;
for (i = 0; i < num_hist; i++) {
hist_values[i] = datumCopy(values[pos].value, stats->attrtype[0]->typbyval, stats->attrtype[0]->typlen);
pos += delta;
posfrac += deltafrac;
if (posfrac >= (num_hist - 1)) {
pos++;
posfrac -= (num_hist - 1);
}
}
(void)MemoryContextSwitchTo(old_context);
stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;
stats->staop[slot_idx] = mystats->ltopr;
stats->stavalues[slot_idx] = hist_values;
stats->numvalues[slot_idx] = num_hist;
* Accept the defaults for stats->statypid and others. They have
* been set before we were called (see vacuum.h)
*/
slot_idx++;
}
if (values_cnt > 1) {
analyze_compute_correlation(values_cnt, corr_xysum, 0, stats, slot_idx);
slot_idx++;
}
} else if (nonnull_cnt > 0) {
Assert(nonnull_cnt == toowide_cnt);
stats->stats_valid = true;
stats->stanullfrac = (double) null_cnt / (double) samplerows;
if (is_varwidth)
stats->stawidth = total_width / (double) nonnull_cnt;
else
stats->stawidth = stats->attrtype[0]->typlen;
stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
} else if (null_cnt > 0) {
stats->stats_valid = true;
stats->stanullfrac = 1.0;
if (is_varwidth) {
stats->stawidth = 0;
} else {
stats->stawidth = stats->attrtype[0]->typlen;
}
stats->stadistinct = 0.0;
}
}
* qsort_arg comparator for sorting ScalarItems
*
* Aside from sorting the items, we update the tupnoLink[] array
* whenever two ScalarItems are found to contain equal datums. The array
* is indexed by tupno; for each ScalarItem, it contains the highest
* tupno that that item's datum has been found to be equal to. This allows
* us to avoid additional comparisons in compute_scalar_stats().
*/
static int compare_scalars(const void* a, const void* b, void* arg)
{
Datum da = ((const ScalarItem*)a)->value;
int ta = ((const ScalarItem*)a)->tupno;
Datum db = ((const ScalarItem*)b)->value;
int tb = ((const ScalarItem*)b)->tupno;
CompareScalarsContext* cxt = (CompareScalarsContext*)arg;
int compare;
compare = ApplySortComparator(da, false, db, false, cxt->ssup);
if (compare != 0) {
return compare;
}
* The two datums are equal, so update cxt->tupnoLink[].
*/
if (cxt->tupnoLink[ta] < tb) {
cxt->tupnoLink[ta] = tb;
}
if (cxt->tupnoLink[tb] < ta) {
cxt->tupnoLink[tb] = ta;
}
* For equal datums, sort by tupno
*/
return ta - tb;
}
* @global stats
* @Description: get sample flag the second times from the sample rows come which the first sampling.
* @out require_samp - decide which block we need to get sample rows
* @in totalSamples - the total sample row num of the first sampling
* @in secSampSize - the second sample size
* @return: void
*/
void get_sampling_rows(bool** require_samp, int totalSamples, int secSampSize)
{
BlockSamplerData bs;
*require_samp = (bool*)palloc0(totalSamples * sizeof(bool));
BlockSampler_Init(&bs, totalSamples, secSampSize);
while (BlockSampler_HasMore(&bs)) {
BlockNumber targblock = BlockSampler_Next(&bs);
(*require_samp)[targblock] = true;
}
return;
}
* @global_stats
* compute sample rate for CN and DN.
*
* Input Param:
* relid: the relation oid for analyze
* vacstmt: the statment for analyze command
* num_samples: identify CN or DN compute sample size
*
* Output Param:
* require_samp: decide which block we need to get sample rows
*/
int compute_sample_size(VacuumStmt* vacstmt, int num_samples, bool** require_samp, Oid relid, int tableidx)
{
double com_size;
* The following choice of minrows is based on the paper
* "Random sampling for histogram construction: how much is enough?"
* by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
* Proceedings of ACM SIGMOD International Conference on Management
* of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
* says that for table size n, histogram size k, maximum relative
* error fraction, and error probability gamma, the minimum
* random sample size is
* r = 4 * k * ln(2*n/gamma) / error_fraction^2
* We use gamma=0.01 in our calculations.
* --------------------
*/
if (num_samples <= 0) {
Relation rel = NULL;
vacstmt->sampleTableRequired = (default_statistics_target >= 0) ? false : true;
if (vacstmt->pstGlobalStatEx[tableidx].totalRowCnts <= 0) {
if (vacstmt->pstGlobalStatEx[tableidx].totalRowCnts == 0) {
vacstmt->pstGlobalStatEx[tableidx].sampleRate = 0;
} else {
* If estimate total rows are dead rows, may be exist lived rows,
* now we cann't compute sample rate, so we should set sampleRate as 2%.
*/
vacstmt->pstGlobalStatEx[tableidx].sampleRate = DEFAULT_SAMPLERATE;
vacstmt->pstGlobalStatEx[tableidx].totalRowCnts = 0;
}
return 0;
}
rel = relation_open(relid, AccessShareLock);
com_size = compute_com_size(vacstmt, rel, tableidx);
relation_close(rel, AccessShareLock);
vacstmt->pstGlobalStatEx[tableidx].sampleRate = com_size / vacstmt->pstGlobalStatEx[tableidx].totalRowCnts;
return 0;
} else {
* Compute num of sample rows as roundoff in all datanodes,
* because avoid of exceeding to the num which compute in coordinator.
*/
int secSize =
(int)floor(vacstmt->pstGlobalStatEx[tableidx].totalRowCnts * vacstmt->pstGlobalStatEx[tableidx].sampleRate);
secSize = Min(secSize, num_samples);
get_sampling_rows(require_samp, num_samples, secSize);
return secSize;
}
}
* qsort comparator for sorting ScalarMCVItems by position
*/
static int compare_mcvs(const void* a, const void* b)
{
int da = ((const ScalarMCVItem*)a)->first;
int db = ((const ScalarMCVItem*)b)->first;
return da - db;
}
* @global stats
* @Description: send estimate or real total row count to cn.
* @in vacstmt - the statment for analyze or vacuum command
* @in analyzemode - identify which type the table will send total rows to cn
* @in totalrows - the total row num
* @return: void
*/
static void send_totalrowcnt_to_cn(VacuumStmt* vacstmt, AnalyzeMode analyzemode, double totalrows)
{
StringInfoData buf;
pq_beginmessage(&buf, 'P');
if (analyzemode == ANALYZENORMAL) {
pq_sendint64(&buf, totalrows);
pq_sendint64(&buf, vacstmt->pstGlobalStatEx[ANALYZENORMAL].topMemSize);
}
pq_endmessage(&buf);
}
bool do_analyze_sampling_begin(Relation onerel, VacuumStmt* vacstmt, AnalyzeMode analyzemode, Relation& tmpRelation,
Oid& tmpRelid, TupleConversionMap*& tupmap, TupleDesc& tupdesc, TupOutputState*& tstate)
{
analyze_tmptbl_debug_dn(
DebugStage_Begin, analyzemode, vacstmt->tmpSampleTblNameList, &tmpRelid, &tmpRelation, NULL, NULL);
if (tmpRelation) {
tupmap = check_tmptbl_tupledesc(onerel, tmpRelation->rd_att);
if (tupmap == NULL) {
* Don't insert tuple into temp table if doing ddl confilict with doing analyze.
*/
analyze_tmptbl_debug_dn(
DebugStage_End, ANALYZENORMAL, vacstmt->tmpSampleTblNameList, &tmpRelid, &tmpRelation, NULL, NULL);
if (default_statistics_target < 0)
return false;
}
}
* Don't send sample rows to coordinator and only saved sample rows into
* temp table for coordinator to select if relative-estimation.
*/
if (!vacstmt->sampleTableRequired) {
tupdesc = CreateTupleDescCopy(onerel->rd_att);
tstate = begin_tup_output_tupdesc(vacstmt->dest, tupdesc);
}
return true;
}
* @global stats
* @Description: Send the sample rows to cn generate for the second sampling and third sampling.
* Save sample rows to temp table for coordinator scan sample and compute statistics later.
* @in onerel - the relation for analyze
* @in vacstmt - the statment for analyze or vacuum command
* @in numrows - the first sample row num
* @in require_samp - the second sample flag
* @in rows - the first sample rows
* @return: void
*/
static void send_sample_to_cn(
Relation onerel, VacuumStmt* vacstmt, int64 numrows, const bool* require_samp, HeapTuple* rows)
{
TupOutputState* tstate = NULL;
TupleDesc tupdesc = NULL;
Oid tmpRelid = InvalidOid;
Relation tmpRelation = NULL;
TupleConversionMap* tupmap = NULL;
bool ret =
do_analyze_sampling_begin(onerel, vacstmt, ANALYZENORMAL, tmpRelation, tmpRelid, tupmap, tupdesc, tstate);
if (!ret) {
return;
}
vacstmt->dest->forAnalyzeSampleTuple = true;
for (int64 i = 0; i < numrows; i++) {
if (require_samp[i]) {
if (!vacstmt->sampleTableRequired) {
(void)ExecStoreTuple(rows[i], tstate->slot, InvalidBuffer, false);
(vacstmt->dest->receiveSlot)(tstate->slot, vacstmt->dest);
(void)ExecClearTuple(tstate->slot);
}
analyze_tmptbl_debug_dn(DebugStage_Execute,
ANALYZENORMAL,
vacstmt->tmpSampleTblNameList,
&tmpRelid,
&tmpRelation,
rows[i],
tupmap);
}
}
vacstmt->dest->forAnalyzeSampleTuple = false;
analyze_tmptbl_debug_dn(
DebugStage_End, ANALYZENORMAL, vacstmt->tmpSampleTblNameList, &tmpRelid, &tmpRelation, NULL, NULL);
if (!vacstmt->sampleTableRequired) {
end_tup_output(tstate);
}
}
* @global stats
* @Description: do sampling for normal table the second times.
* @in onerel - the relation for analyze
* @in vacstmt - the statment for analyze or vacuum command
* @in numrows - the first sample row num
* @in totalrows - the total row num which using for comput the second sample num
* @in rows - the first sample rows
* @return: void
*/
static void do_sampling_normal_second(
Relation onerel, VacuumStmt* vacstmt, int64 numrows, double totalrows, HeapTuple* rows)
{
bool* require_samp = NULL;
vacstmt->tableidx = ANALYZENORMAL;
vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts = totalrows;
vacstmt->num_samples = compute_sample_size(vacstmt, numrows, &require_samp, onerel->rd_id, ANALYZENORMAL);
if (vacstmt->num_samples > 0) {
send_sample_to_cn(onerel, vacstmt, numrows, require_samp, rows);
}
elog(DEBUG1,
"%s insert %lf sample rows for normal table into temp table for queryid:%lu. "
"detail: samplerows[%ld], sampleRate[%lf], secsamplerows[%d]",
g_instance.attr.attr_common.PGXCNodeName,
totalrows,
u_sess->debug_query_id,
numrows,
vacstmt->pstGlobalStatEx[ANALYZENORMAL].sampleRate,
vacstmt->num_samples);
return;
}
* equalTupleDescAttrsTypeid: check two tuples descriptor's typeid are diffirent or not.
*
* Parameters:
* @in tupdesc1: one tuple descriptor
* @in tupdesc2: another tuple descriptor
*
* Returns: bool
*/
static bool equalTupleDescAttrsTypeid(TupleDesc tupdesc1, TupleDesc tupdesc2)
{
if ((NULL == tupdesc1) || (NULL == tupdesc2))
return true;
if (tupdesc1->natts != tupdesc2->natts)
return false;
for (int i = 0; i < tupdesc1->natts; i++) {
Form_pg_attribute attr1 = &tupdesc1->attrs[i];
Form_pg_attribute attr2 = &tupdesc2->attrs[i];
if ((!attr1->attisdropped && !attr2->attisdropped) &&
((attr1->atttypid != attr2->atttypid) || (attr1->attlen != attr2->attlen)))
return false;
}
return true;
}
static BlockNumber GetOneRelNBlocks(
const Relation onerel, const Relation Irel, const VacuumStmt* vacstmt, double totalindexrows)
{
BlockNumber nblocks = 0;
if (RelationIsColStore(onerel)) {
nblocks = estimate_psort_index_blocks(Irel->rd_att, totalindexrows);
} else if (RelationIsPartitioned(onerel) && !RelationIsGlobalIndex(Irel)) {
ListCell* partCell = NULL;
Partition part = NULL;
Oid indexOid = InvalidOid;
Oid partOid = InvalidOid;
Oid partIndexOid = InvalidOid;
Partition indexPart = NULL;
Relation partRel = NULL;
indexOid = RelationGetRelid(Irel);
foreach (partCell, vacstmt->partList) {
part = (Partition)lfirst(partCell);
partOid = PartitionGetPartid(part);
if (RelationIsSubPartitioned(onerel)) {
partRel = partitionGetRelation(onerel, part);
List *subPartOidList = relationGetPartitionOidList(partRel);
ListCell *lc = NULL;
foreach (lc, subPartOidList) {
Oid subPartOid = (Oid)lfirst_oid(lc);
partIndexOid = getPartitionIndexOid(indexOid, subPartOid);
}
releasePartitionOidList(&subPartOidList);
releaseDummyRelation(&partRel);
} else {
partIndexOid = getPartitionIndexOid(indexOid, partOid);
}
indexPart = partitionOpen(Irel, partIndexOid, AccessShareLock);
nblocks += PartitionGetNumberOfBlocks(Irel, indexPart);
partitionClose(Irel, indexPart, AccessShareLock);
}
if (vacstmt->issubpartition) {
partIndexOid = getPartitionIndexOid(indexOid, PartitionGetPartid(vacstmt->onepart));
indexPart = partitionOpen(Irel, partIndexOid, AccessShareLock);
nblocks = PartitionGetNumberOfBlocks(Irel, indexPart);
partitionClose(Irel, indexPart, AccessShareLock);
}
} else {
nblocks = RelationGetNumberOfBlocks(Irel);
}
return nblocks;
}
* estimate_index_blocks
* This function estimates the number of blocks(pages) of the index.
* We assume that all non column stored relations are being stored as B-trees.
*/
static BlockNumber estimate_index_blocks(Relation rel, double totalTuples, double table_factor)
{
BlockNumber nblocks = 0;
if (RelationIsColStore(rel)) {
nblocks = estimate_psort_index_blocks(rel->rd_att, totalTuples);
} else {
nblocks = estimate_btree_index_blocks(rel->rd_att, totalTuples, table_factor);
}
return nblocks;
}
* update_pages_and_tuples_pgclass: Update pages and tuples of the relation for analyze to pg_class.
*
* Parameters:
* @in onerel: the relation for analyze or vacuum
* @in vacstmt: the statment for analyze or vacuum command
* @in attr_cnt: the count of attributes for analyze
* @in vacattrstats: statistic structure for all attributes or index by the statment for analyze
* using for comput and update pg_statistic.
* @in hasindex: are there have index or not for analyze
* @in nindexes: the num of index for analyze
* @in indexdata: per-index data for analyze
* @in Irel: malloc a new index relation for analyze
* @in relpages: relation pages
* @in totalrows: total rows for the relation
* @in numrows: the num of sample rows
* @in inh: is inherit table for analzye or not
*
* Returns: void
*/
static void update_pages_and_tuples_pgclass(Relation onerel, VacuumStmt* vacstmt, int attr_cnt,
VacAttrStats** vacattrstats, bool hasindex, int nindexes, AnlIndexData* indexdata, Relation* Irel,
BlockNumber relpages, double totalrows, double totaldeadrows, int64 numrows, bool inh)
{
BlockNumber updrelpages = relpages;
* we have updated pg_class in function ReceivePageAndTuple when we receive pg_class from dn1,
* so, we only update pg_class on datanodes this place.
*/
if (IS_PGXC_DATANODE) {
BlockNumber mapCont = 0;
if (RelationIsColStore(onerel)) {
* for CU format and PAX format, relpages is 0 since RelationGetNumberOfBlocks
* return 0, so we have to generate relpages from totalrows
*
* The best way is use all rows(alive and dead rows), however to ensure forward
* compatibility, we can only process scenes that all rows are dead
*/
double allrows = totalrows > 0 ? totalrows : totaldeadrows;
if (RelationIsPAXFormat(onerel)) {
updrelpages = estimate_cstore_blocks(onerel, vacattrstats, attr_cnt, numrows, allrows, true);
} else {
updrelpages = estimate_cstore_blocks(onerel, vacattrstats, attr_cnt, numrows, allrows, false);
}
}
#ifdef ENABLE_MULTIPLE_NODES
else if (RelationIsTsStore(onerel)) {
updrelpages = estimate_tsstore_blocks(onerel, attr_cnt, totalrows);
}
#endif
if (RelationIsPartitioned(onerel)) {
Relation partRel = NULL;
ListCell* partCell = NULL;
Partition part = NULL;
foreach (partCell, vacstmt->partList) {
part = (Partition)lfirst(partCell);
partRel = partitionGetRelation(onerel, part);
mapCont += visibilitymap_count(onerel, part);
releaseDummyRelation(&partRel);
}
if (vacstmt->issubpartition) {
mapCont += visibilitymap_count(onerel, vacstmt->onepart);
}
} else {
mapCont = visibilitymap_count(onerel, NULL);
}
if (!vacuumPartition(vacstmt->flags)) {
Relation classRel = heap_open(RelationRelationId, RowExclusiveLock);
vac_update_relstats(onerel, classRel, updrelpages, totalrows, mapCont,
hasindex, InvalidTransactionId, InvalidMultiXactId);
heap_close(classRel, RowExclusiveLock);
} else {
Partition part = (Partition)((vacstmt->issubpartition) ? vacstmt->onepart : linitial(vacstmt->partList));
vac_update_partstats(part, updrelpages, totalrows, mapCont, InvalidTransactionId, InvalidMultiXactId);
}
}
double table_factor = 1.0;
bool isCompute = ENABLE_SQL_BETA_FEATURE(PARAM_PATH_OPT) && onerel && onerel->rd_rel;
if (isCompute) {
BlockNumber pages = onerel->rd_rel->relpages;
double estimated_relpages = (double)estimate_index_blocks(onerel, totalrows, table_factor);
table_factor = (double)pages / estimated_relpages;
table_factor = table_factor > 1.0 ? table_factor : 1.0;
}
* Same for indexes. Vacuum always scans all indexes, so if we're part of
* VACUUM ANALYZE, don't overwrite the accurate count already inserted by
* VACUUM.
*/
if (!((unsigned int)vacstmt->options & VACOPT_VACUUM) || ((unsigned int)vacstmt->options & VACOPT_ANALYZE)) {
Partition part = NULL;
if (vacuumPartition(vacstmt->flags)) {
if (vacstmt->issubpartition) {
part = vacstmt->onepart;
} else {
part = (Partition)linitial(vacstmt->partList);
}
}
for (int ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
double totalindexrows;
BlockNumber nblocks = 0;
if (RelationIsUstoreFormat(onerel)) {
totalindexrows = totalrows;
} else {
totalindexrows = ceil(thisdata->tupleFract * totalrows);
}
* @global stats
* Update pg_class with global relpages and global reltuples for indexs the CN where analyze is issued.
*/
if (IS_PGXC_COORDINATOR && ((unsigned int)vacstmt->options & VACOPT_ANALYZE) &&
(0 != vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts)) {
nblocks = estimate_index_blocks(Irel[ind], totalindexrows, table_factor);
if (part) {
Oid partIndexOid = getPartitionIndexOid(RelationGetRelid(Irel[ind]), PartitionGetPartid(part));
Partition partIndex = partitionOpen(Irel[ind], partIndexOid, AccessShareLock);
vac_update_partstats(partIndex, nblocks, totalindexrows, 0,
BootstrapTransactionId, InvalidMultiXactId);
partitionClose(Irel[ind], partIndex, AccessShareLock);
} else {
Relation classRel = heap_open(RelationRelationId, RowExclusiveLock);
vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, false,
BootstrapTransactionId, InvalidMultiXactId);
heap_close(classRel, RowExclusiveLock);
}
continue;
}
if ((unsigned int)vacstmt->options & VACOPT_VACUUM)
break;
nblocks = GetOneRelNBlocks(onerel, Irel[ind], vacstmt, totalindexrows);
if (part) {
Oid partIndexOid = getPartitionIndexOid(RelationGetRelid(Irel[ind]), PartitionGetPartid(part));
Partition partIndex = partitionOpen(Irel[ind], partIndexOid, AccessShareLock);
vac_update_partstats(partIndex, nblocks, totalindexrows, 0,
BootstrapTransactionId, InvalidMultiXactId);
partitionClose(Irel[ind], partIndex, AccessShareLock);
} else {
Relation classRel = heap_open(RelationRelationId, RowExclusiveLock);
vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, false,
BootstrapTransactionId, InvalidMultiXactId);
heap_close(classRel, RowExclusiveLock);
}
}
}
}
void switch_memory_context(MemoryContext& col_context, MemoryContext& old_context,
AnalyzeMode analyzemode)
{
if (ANALYZENORMAL == analyzemode) {
col_context = AllocSetContextCreate(u_sess->analyze_cxt.analyze_context,
"Analyze Column",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
old_context = MemoryContextSwitchTo(col_context);
}
}
* do_analyze_samplerows: Compute statistics and update to pg_statistic if using sample rows.
*
* Parameters:
* @in onerel: the relation for analyze or vacuum
* @in vacstmt: the statment for analyze or vacuum command
* @in attr_cnt: the count of attributes for analyze
* @in vacattrstats: statistic structure for all attributes or index by the statment for analyze
* using for comput and update pg_statistic.
* @in hasindex: are there have index or not for analyze
* @in nindexes: the num of index for analyze
* @in indexdata: per-index data for analyze
* @in tableidx: identify the relation type for analyze
* @in inh: is inherit table for analzye or not
* @in Irel: malloc a new index relation for analyze
* @in totalrows: total rows for the relation
* @in numrows: the num of sample rows
* @in rows: sample tuples for compute statistics
* @in analyzemode: identify the relation type for analyze
* @in caller_context: analyze memory context
* @in save_userid: saved current userId using for finalize to restore
* @in save_sec_context: saved for SecurityRestrictionContext
* @in save_nestlevel: saved for GUCNestLevel
*
* Returns: void
*/
static bool do_analyze_samplerows(Relation onerel, VacuumStmt* vacstmt, int attr_cnt, VacAttrStats** vacattrstats,
bool hasindex, int nindexes, AnlIndexData* indexdata, bool inh, Relation* Irel, double* totalrows, int64* numrows,
HeapTuple* rows, AnalyzeMode analyzemode, MemoryContext caller_context,
Oid save_userid, int save_sec_context, int save_nestlevel)
{
int64 num_samplerows = *numrows;
double num_total_rows = *totalrows;
HeapTuple* samplerows = rows;
int tableidx = vacstmt->tableidx;
* If sampleRate more than 0, it means DN need send sample rows and total row count to CN.
* We should send real total row count back to CN If sampleRate of either
* dfs or delta more than 0, and current table is dfs.
* because if dfs's rows is 0, we should send delta's rows to CN.
*/
if (NEED_DO_SECOND_ANALYZE(analyzemode, vacstmt)) {
switch (analyzemode) {
case ANALYZENORMAL:
do_sampling_normal_second(onerel, vacstmt, num_samplerows, num_total_rows, rows);
break;
default:
break;
}
if (NEED_SEND_TOTALROWCNT_TO_CN(analyzemode, vacstmt, num_total_rows)) {
send_totalrowcnt_to_cn(vacstmt, analyzemode, num_total_rows);
}
}
* @global stats: totalRowCnts = 0 indicates it's another CN trying to sync global stats with
* the CN where analyze issued,
* and we don't want other CNs to get the sample rows and compute stats again.
*/
if (IS_PGXC_COORDINATOR && (vacstmt->pstGlobalStatEx[tableidx].totalRowCnts != 0)) {
if (analyzemode == ANALYZENORMAL) {
samplerows = vacstmt->sampleRows;
num_samplerows = vacstmt->num_samples;
} else {
samplerows = vacstmt->pstGlobalStatEx[tableidx].sampleRows;
num_samplerows = vacstmt->pstGlobalStatEx[tableidx].num_samples;
}
num_total_rows = vacstmt->pstGlobalStatEx[tableidx].totalRowCnts;
*numrows = num_samplerows;
*totalrows = num_total_rows;
* There is a condition: coordinator has received sample rows from all datanodes,
* then the relation's attribute type has modified by other client.
* if we compute stats using relation's attribute in local, it different with the sample tuples,
* so we should check the relation's attribute in local is different with attribute
* received from datanodes or not.
*/
if (!IsConnFromCoord() && !vacstmt->pstGlobalStatEx[tableidx].isReplication) {
TupleDesc rd_att =
(analyzemode == ANALYZENORMAL) ? vacstmt->tupleDesc : vacstmt->pstGlobalStatEx[tableidx].tupleDesc;
* It identify the relation's attribute have modified in local under analyzing
* if they are different, we don't compute stats for the more.
*/
if (!equalTupleDescAttrsTypeid(rd_att, onerel->rd_att)) {
if (!inh)
vac_close_indexes(nindexes, Irel, NoLock);
do_analyze_finalize(caller_context, save_userid, save_sec_context, save_nestlevel, analyzemode);
* If vacstmt->relation is NULL, it means that the delta table is processed here.
* The table information in the warning will be given when processing the main table.
*/
if (vacstmt->relation)
ereport(WARNING,
(errmsg("The tupleDesc analyzed in coordinator is different from tupleDesc which received from "
"datanode."),
errhint("Do analyze again for the relation: %s.", vacstmt->relation->relname)));
else
ereport(WARNING,
(errmsg("The tupleDesc analyzed in coordinator is different from tupleDesc which received from "
"datanode.")));
return false;
}
}
}
if (num_samplerows <= 0) {
return true;
}
int i;
MemoryContext col_context, old_context;
switch_memory_context(col_context, old_context, analyzemode);
for (i = 0; i < attr_cnt; i++) {
VacAttrStats* stats = vacattrstats[i];
AttributeOpts* aopt = NULL;
if (stats->num_attrs > 1) {
continue;
}
stats->rows = samplerows;
stats->tupDesc = onerel->rd_att;
Assert(onerel->rd_att->natts == stats->tupDesc->natts);
DEBUG_START_TIMER;
(*stats->compute_stats)(stats, std_fetch_func, num_samplerows, num_total_rows, onerel);
DEBUG_STOP_TIMER("Compute statistic for attr: %s", NameStr(stats->attrs[0]->attname));
* If the appropriate flavor of the n_distinct option is
* specified, override with the corresponding value.
*/
aopt = get_attribute_options(onerel->rd_id, stats->attrs[0]->attnum);
if (aopt != NULL) {
float8 n_distinct;
n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
if (fabs(n_distinct) > EPSILON)
stats->stadistinct = n_distinct;
}
set_stats_dndistinct(stats, vacstmt, tableidx);
MemoryContextResetAndDeleteChildren(col_context);
}
if (hasindex)
compute_index_stats(onerel, num_total_rows, indexdata, nindexes, samplerows, num_samplerows, col_context);
(void)MemoryContextSwitchTo(old_context);
MemoryContextDelete(col_context);
* Emit the completed stats rows into pg_statistic, replacing any
* previous statistics for the target columns. (If there are stats in
* pg_statistic for columns we didn't process, we leave them alone.)
*/
char kind = STARELKIND_CLASS;
Oid targetTabOid = RelationGetRelid(onerel);
Partition part = NULL;
if (vacuumPartition(vacstmt->flags)) {
kind = STARELKIND_PARTITION;
if (vacstmt->issubpartition) {
part = vacstmt->onepart;
} else {
part = (Partition)linitial(vacstmt->partList);
}
targetTabOid = PartitionGetPartid(part);
}
update_attstats(targetTabOid, kind, inh, attr_cnt, vacattrstats, RelationGetRelPersistence(onerel));
for (i = 0; i < nindexes; i++) {
AnlIndexData* thisdata = &indexdata[i];
Oid indexOid = RelationGetRelid(Irel[i]);
if (NULL != part) {
indexOid = getPartitionIndexOid(RelationGetRelid(Irel[i]), PartitionGetPartid(part));
}
update_attstats(indexOid, kind, false, thisdata->attr_cnt, thisdata->vacattrstats,
RelationGetRelPersistence(Irel[i]));
}
return true;
}
* Description: Initialize special information for analyze one attributes of sample table.
*
* Parameters:
* @in spec: the sample info of special attribute for compute statistic
* @in stats: statistic info of one attribute for analyze
* @in numTotalRows: total rows for the relation
* @in numSampleRows: the num of sample rows
*
* Returns: void
*/
static void init_sample_table_spec_info(
AnalyzeSampleTableSpecInfo* spec, VacAttrStats* stats, double numTotalRows, int64 numSampleRows)
{
errno_t rc;
AssertEreport(stats != NULL, MOD_OPT, "");
AssertEreport(numSampleRows > 0.0, MOD_OPT, "");
stats->stadistinct = 0.0;
stats->stanullfrac = 0.0;
stats->stawidth = 0;
spec->stats = stats;
spec->is_varwidth = es_is_variable_width(stats);
spec->nonnull_cnt = 0;
spec->null_cnt = 0;
spec->nmultiple = 0;
spec->ndistinct = 0;
spec->samplerows = numSampleRows;
spec->totalrows = numTotalRows;
spec->v_alias = (char**)palloc0(sizeof(char*) * stats->num_attrs);
for (unsigned int i = 0; i < stats->num_attrs; ++i) {
spec->v_alias[i] = (char*)palloc0(sizeof(char) * NAMEDATALEN);
}
rc = memset_s(&spec->mcv_list, sizeof(McvInfo), 0, sizeof(McvInfo));
securec_check(rc, "\0", "\0");
spec->mcv_list.stattarget = MAX_ATTR_MCV_STAT_TARGET;
rc = memset_s(&spec->hist_list, sizeof(HistgramInfo), 0, sizeof(HistgramInfo));
securec_check(rc, "\0", "\0");
spec->hist_list.stattarget = MAX_ATTR_HIST_STAT_TARGET;
spec->hist_list.histitem = (SampleItem*)palloc0(spec->hist_list.stattarget * sizeof(SampleItem));
}
* do_analyze_sampletable: analyze attributes with execute query, compute statistics
* and update to pg_statistic if using sample rows.
*
* Parameters:
* @in onerel: the relation for analyze or vacuum
* @in vacstmt: the statment for analyze or vacuum command
* @in attr_cnt: the count of attributes for analyze
* @in vacattrstats: statistic structure for all attributes or index by the statment for analyze
* using for comput and update pg_statistic.
* @in hasindex: are there have index or not for analyze
* @in nindexes: the num of index for analyze
* @in indexdata: per-index data for analyze
* @in inh: is inherit table for analzye or not
* @in Irel: malloc a new index relation for analyze
* @in totalrows: total rows for the relation
* @in numrows: the num of sample rows
* @in analyzemode: identify the relation type for analyze
*
* Returns: void
*/
static void do_analyze_sampletable(Relation onerel, VacuumStmt* vacstmt, int attr_cnt, VacAttrStats** vacattrstats,
bool hasindex, int nindexes, AnlIndexData* indexdata, bool inh, Relation* Irel, double totalrows, int64 numrows,
AnalyzeMode analyzemode)
{
int i;
MemoryContext col_context, old_context;
int64 num_sample = 0;
HeapTuple* samplerows = NULL;
switch_memory_context(col_context, old_context, analyzemode);
for (i = 0; i < attr_cnt; i++) {
VacAttrStats* stats = vacattrstats[i];
AttributeOpts* aopt = NULL;
stats->tupDesc = onerel->rd_att;
if (stats->num_attrs > 1 || (!OidIsValid(stats->attrtype[0]->typanalyze))) {
do_analyze_compute_attr_stats(inh, onerel, vacstmt, stats, totalrows, numrows, analyzemode);
} else {
* If specific the typanalyze of function, we should get sample rows from all datanodes,
* and if there are more than one specific typanalyze of function, we want to get sample only once.
*/
if (0 == num_sample) {
get_sample_rows_for_query(
u_sess->analyze_cxt.analyze_context, onerel, vacstmt, &num_sample, &samplerows);
}
if (0 != num_sample) {
stats->rows = samplerows;
(*stats->compute_stats)(stats, std_fetch_func, num_sample, totalrows, onerel);
}
}
* If the appropriate flavor of the n_distinct option is
* specified, override with the corresponding value.
*/
aopt = get_attribute_options(onerel->rd_id, stats->attrs[0]->attnum);
if (aopt != NULL) {
float8 n_distinct;
n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
if (fabs(n_distinct) > EPSILON)
stats->stadistinct = n_distinct;
}
set_stats_dndistinct(stats, vacstmt, vacstmt->tableidx);
MemoryContextResetAndDeleteChildren(col_context);
}
* Emit the completed stats rows into pg_statistic, replacing any
* previous statistics for the target columns. (If there are stats in
* pg_statistic for columns we didn't process, we leave them alone.)
*/
char kind = STARELKIND_CLASS;
Oid targetTabOid = RelationGetRelid(onerel);
Partition part = NULL;
if (vacuumPartition(vacstmt->flags)) {
kind = STARELKIND_PARTITION;
if (vacstmt->issubpartition) {
part = vacstmt->onepart;
} else {
part = (Partition)linitial(vacstmt->partList);
}
targetTabOid = PartitionGetPartid(part);
}
update_attstats(targetTabOid, kind, inh, attr_cnt, vacattrstats,
RelationGetRelPersistence(onerel));
* If exist index and there is no sample rows, we don't need compute statistic for index any more.
* Otherwise, we shoulde compute index statistics.
*/
if (hasindex &&
analyze_index_sample_rows(
u_sess->analyze_cxt.analyze_context, onerel, vacstmt, nindexes, indexdata, &num_sample, &samplerows)) {
compute_index_stats(onerel, totalrows, indexdata, nindexes, samplerows, num_sample, col_context);
for (i = 0; i < nindexes; i++) {
AnlIndexData* thisdata = &indexdata[i];
Oid indexOid = RelationGetRelid(Irel[i]);
if (NULL != part) {
indexOid = getPartitionIndexOid(RelationGetRelid(Irel[i]), PartitionGetPartid(part));
}
update_attstats(indexOid, kind, false, thisdata->attr_cnt, thisdata->vacattrstats,
RelationGetRelPersistence(Irel[i]));
}
}
(void)MemoryContextSwitchTo(old_context);
MemoryContextDelete(col_context);
}
* analyze_compute_ndistinct: estimate the number of distinct values using the estimator.
*
* Parameters:
* @in nmultiple: duplicate num of distinct values more than 1
* @in ndistinct: num of distinct for sample rows
* @in toowide_cnt: how many rows that value is toasted
* @in samplerows: num of sample rows
* @in totalrows: num of total rows
* @in stats: statistics for one attribute
*
* Returns: void
*/
static void analyze_compute_ndistinct(
int64 nmultiple, double ndistinct, int64 toowide_cnt, int64 samplerows, double totalrows, VacAttrStatsP stats)
{
if (nmultiple == 0) {
* If we found no repeated non-null values, assume it's a unique
* column; but be sure to discount for any nulls we found.
*/
stats->stadistinct = -1.0 * (1 - stats->stanullfrac);
} else if (toowide_cnt == 0 && (nmultiple - ndistinct == 0)) {
* Every value in the sample appeared more than once. Assume the
* column has just these values.
*/
stats->stadistinct = ndistinct;
} else {
* Estimate the number of distinct values using the estimator
* proposed by Haas and Stokes in IBM Research Report RJ 10025:
* n*d / (n - f1 + f1*n/N)
* where f1 is the number of distinct values that occurred
* exactly once in our sample of n rows (from a total of N),
* and d is the total number of distinct values in the sample.
* This is their Duj1 estimator; the other estimators they
* recommend are considerably more complex, and are numerically
* very unstable when n is much smaller than N.
*
* Overwidth values are assumed to have been distinct.
* ----------
*/
double f1 = ndistinct - nmultiple + toowide_cnt;
double d = f1 + nmultiple;
double numer = 0;
double denom = 0;
double stadistinct = 0;
numer = (double)samplerows * d;
denom = (double)(samplerows - f1) + (double)f1 * (double)samplerows / totalrows;
stadistinct = numer / denom;
if (stadistinct < (double)d) {
stadistinct = (double)d;
}
if (stadistinct > totalrows) {
stadistinct = totalrows;
}
stats->stadistinct = floor(stadistinct + 0.5);
}
* If we estimated the number of distinct values at more than 10% of
* the total row count (a very arbitrary limit), then assume that
* stadistinct should scale with the row count rather than be a fixed
* value.
*/
if (stats->stadistinct > 0.1 * totalrows) {
stats->stadistinct = -(stats->stadistinct / totalrows);
}
if (stats->stadistinct >= 0 && stats->stadndistinct >= 0 && stats->stadndistinct > stats->stadistinct) {
stats->stadistinct = stats->stadndistinct;
}
}
* analyze_compute_correlation: estimate correlation.
*
* Parameters:
* @in values_cnt: count of non-null values for all samples
* @in corr_xysum: correlation of the sum
* @in correlations: the correlations of dn1 if non zero for sampletable, others compute by corr_xysum
* @in stats: statistics for one attribute
* @in slot_idx: indentify index of statistic kind for correlation
*
* Returns: void
*/
static void analyze_compute_correlation(
int64 values_cnt, double corr_xysum, double correlations, VacAttrStatsP stats, int slot_idx)
{
StdAnalyzeData* mystats = (StdAnalyzeData*)stats->extra_data;
MemoryContext old_context;
float4* corrs = NULL;
double corr_xsum, corr_x2sum;
old_context = MemoryContextSwitchTo(stats->anl_context);
corrs = (float4*)palloc(sizeof(float4));
(void)MemoryContextSwitchTo(old_context);
* Since we know the x and y value sets are both
* 0, 1, ..., values_cnt-1
* we have sum(x) = sum(y) =
* (values_cnt-1)*values_cnt / 2
* and sum(x^2) = sum(y^2) =
* (values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.
* ----------
*/
if (correlations == 0) {
corr_xsum = ((double)(values_cnt - 1)) * ((double)values_cnt) / 2.0;
corr_x2sum = ((double)(values_cnt - 1)) * ((double)values_cnt) * (double)(2 * values_cnt - 1) / 6.0;
corrs[0] =
(values_cnt * corr_xysum - corr_xsum * corr_xsum) / (values_cnt * corr_x2sum - corr_xsum * corr_xsum);
} else {
*corrs = correlations;
}
stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;
stats->staop[slot_idx] = mystats->ltopr;
stats->stanumbers[slot_idx] = corrs;
stats->numnumbers[slot_idx] = 1;
}
* compute_mcv_mincount: Compute the min count of mcv.
*
* Parameters:
* @in samplerows: num of all the samples
* @in ndistinct: how many distinct value of all the samples
* @in attstattarget: the bound for the final mcv list.
*
* Returns: double
*/
static double compute_mcv_mincount(int64 samplerows, double ndistinct, double attstattarget)
{
#define MIN_MCV_THRESHOLD 1.25
double avgcount, mincount, maxmincount;
avgcount = (double)samplerows / ndistinct;
mincount = avgcount * 1.25;
if (mincount < 2) {
mincount = 2;
}
maxmincount = (double)samplerows / attstattarget;
if (mincount > maxmincount) {
mincount = maxmincount;
}
return mincount;
}
* spi_callback_get_sample_rows: A callback function for use to copy each tuple for index sample rows.
*
* Parameters:
* @in clientData: argument to call back function (usually pointer to data-structure
* that the callback function populates).
*
* Returns: void
*/
static void spi_callback_get_sample_rows(void* clientData)
{
MemoryContext oldctx;
AnalyzeSampleRowsSpecInfo* spec = (AnalyzeSampleRowsSpecInfo*)clientData;
bool* nulls = NULL;
Datum* values = NULL;
AssertEreport(SPI_tuptable != NULL, MOD_OPT, "");
AssertEreport(SPI_tuptable->tupdesc, MOD_OPT, "");
if (0 == SPI_processed)
return;
nulls = (bool*)palloc0(spec->tupDesc->natts * sizeof(bool));
values = (Datum*)palloc0(spec->tupDesc->natts * sizeof(Datum));
oldctx = MemoryContextSwitchTo(spec->memorycontext);
spec->num_sample = SPI_processed;
spec->samplerows = (HeapTuple*)palloc0(SPI_processed * sizeof(HeapTuple));
if (spec->tupDesc->natts == SPI_tuptable->tupdesc->natts) {
for (uint32 i = 0; i < SPI_processed; i++) {
spec->samplerows[i] = (HeapTuple)tableam_tops_copy_tuple(SPI_tuptable->vals[i]);
}
} else {
* Rebuild tuple value by using the tuple receive by spi interface and relation's tuple desc.
* Because if there is dropped column, the tuple desc of statistics include it and
* the tuple desc receive from datanode exclude it, so we should keep consistent
* between tuple desc and tuple values.
*/
for (uint32 i = 0; i < SPI_processed; i++) {
int dropped_num = 0;
for (int j = 0; j < spec->tupDesc->natts; j++) {
Form_pg_attribute attr1 = &spec->tupDesc->attrs[j];
if (attr1->attisdropped) {
nulls[j] = true;
dropped_num++;
continue;
}
values[j] = tableam_tops_tuple_getattr(SPI_tuptable->vals[i], j - dropped_num + 1, SPI_tuptable->tupdesc, &nulls[j]);
}
AssertEreport(spec->tupDesc->natts == (SPI_tuptable->tupdesc->natts + dropped_num), MOD_OPT, "");
spec->samplerows[i] = (HeapTuple)tableam_tops_form_tuple(spec->tupDesc, values, nulls);
}
}
(void)MemoryContextSwitchTo(oldctx);
}
static void get_sample_rows_utility(
MemoryContext speccontext, Relation onerel, double sampleRate, int64* num_sample_rows, HeapTuple** samplerows)
{
AnalyzeSampleRowsSpecInfo spec;
const char* sampleSchemaName = NULL;
const char* sampleTableName = NULL;
StringInfoData str;
initStringInfo(&str);
sampleSchemaName = get_namespace_name(get_rel_namespace(onerel->rd_id));
sampleTableName = get_rel_name(onerel->rd_id);
* This query orders the table by rank on the value and chooses values that occur every
* 'bucketSize' interval. It also chooses the maximum value from the relation. It then
* removes duplicate values and orders the values in ascending order. This corresponds to
* the histogram.
*/
appendStringInfo(&str,
"select * from %s.%s where pg_catalog.random() <= %.2f",
quote_identifier(sampleSchemaName),
quote_identifier(sampleTableName),
sampleRate);
spec.memorycontext = speccontext;
spec.tupDesc = onerel->rd_att;
spec.num_sample = 0;
u_sess->analyze_cxt.is_under_analyze = true;
spi_exec_with_callback(DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_sample_rows, &spec);
u_sess->analyze_cxt.is_under_analyze = false;
*num_sample_rows = spec.num_sample;
*samplerows = spec.samplerows;
pfree_ext(str.data);
pfree_ext(sampleTableName);
pfree_ext(sampleSchemaName);
}
* @Description: Construct and execute query to get sample rows
* for index columns or tsvetor column.
*
* @in speccontext: memory context for get sample rows
* @in tableidx: identify the relation type for analyze
* @in onerel: the relation for current table
* @in vacstmt: the statment for analyze or vacuum command.
* @out num_sample_rows: the result num of sample rows
* @out samplerows: all the result sample rows
* @return - void
*/
static void get_sample_rows_for_query(
MemoryContext speccontext, Relation onerel, VacuumStmt* vacstmt, int64* num_sample_rows, HeapTuple** samplerows)
{
int save_default_statistics_target;
double save_sampleRate;
save_default_statistics_target = default_statistics_target;
save_sampleRate = vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate;
default_statistics_target = 100;
(void)compute_sample_size(vacstmt, 0, NULL, onerel->rd_id, vacstmt->tableidx);
default_statistics_target = save_default_statistics_target;
get_sample_rows_utility(
speccontext, onerel, vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate, num_sample_rows, samplerows);
vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate = save_sampleRate;
}
* analyze_index_sample_rows: decide if need sample rows for index or not. construct and
* execute query to get sample rows if there are index columns.
*
* Parameters:
* @in speccontext: memory context for each column
* @in rel: the relation for analyze
* @in vacstmt: the statment for analyze or vacuum command.
* @in nindexes: the num of index for analyze
* @in indexdata: per-index data for analyze
* @out num_sample_rows: the result num of sample rows
* @out samplerows: all the result sample rows
*
* Returns: bool(need sample row for index or not)
*/
static bool analyze_index_sample_rows(MemoryContext speccontext, Relation rel, VacuumStmt* vacstmt, int nindexes,
AnlIndexData* indexdata, int64* num_sample_rows, HeapTuple** samplerows)
{
bool need_sample = false;
for (int ind = 0; ind < nindexes; ind++) {
AnlIndexData* thisdata = &indexdata[ind];
IndexInfo* indexInfo = thisdata->indexInfo;
int attr_cnt = thisdata->attr_cnt;
if (attr_cnt != 0 || indexInfo->ii_Predicate != NIL) {
need_sample = true;
break;
}
}
* If there are specific attrs in current relation, it have got sample rows before,
* so we should not get sample rows again.
*/
if (need_sample && *num_sample_rows == 0) {
get_sample_rows_for_query(speccontext, rel, vacstmt, num_sample_rows, samplerows);
if (*num_sample_rows == 0)
need_sample = false;
}
return need_sample;
}
* set_stats_dndistinct: compute dndistinct value and set to statistic.
*
* Parameters:
* @in stats: statistic info of one attribute for analyze
* @in vacstmt: the statment for analyze or vacuum command
* @in tableidx: identify the relation type for analyze
*
* Returns: bool(need sample row for index or not)
*/
static void set_stats_dndistinct(VacAttrStats* stats, VacuumStmt* vacstmt, int tableidx)
{
double distinct, dndistinct, dntotalrows;
Oid reloid = stats->attrs[0]->attrelid;
char relkind = get_rel_relkind(reloid);
unsigned int num_datanodes = ng_get_baserel_num_data_nodes(reloid, relkind);
if (!(IS_PGXC_COORDINATOR && !IsConnFromCoord()) || (vacstmt->pstGlobalStatEx[tableidx].totalRowCnts <= 0)) {
return;
}
distinct = (stats->stadistinct < 0.0) ? (-stats->stadistinct * vacstmt->pstGlobalStatEx[tableidx].totalRowCnts)
: stats->stadistinct;
* Estimate dndistinct with poisson for
* (1) foreign table
* (2) extended statistics (when distribute key is not contained in multi-column)
*/
if (vacstmt->isForeignTables ||
(stats->num_attrs > 1 && (!es_is_distributekey_contained_in_multi_column(reloid, stats)))) {
vacstmt->pstGlobalStatEx[tableidx].dndistinct[stats->attrs[0]->attnum - 1] =
Min(ceil(NUM_DISTINCT_GTL_FOR_POISSON(
distinct, vacstmt->pstGlobalStatEx[tableidx].totalRowCnts, num_datanodes, 1)),
distinct);
}
if (vacstmt->pstGlobalStatEx[tableidx].attnum < stats->attrs[0]->attnum || stats->attrs[0]->attnum <= 0) {
ereport(ERROR,
(errmodule(MOD_OPT),
errcode(ERRCODE_DATA_CORRUPTED),
errmsg("The column info is changed, Do analyze again for the relation: %s.",
vacstmt->relation->relname)));
}
if (vacstmt->pstGlobalStatEx[tableidx].dndistinct[stats->attrs[0]->attnum - 1] == 0.0) {
return;
}
dndistinct = vacstmt->pstGlobalStatEx[tableidx].dndistinct[stats->attrs[0]->attnum - 1];
if (vacstmt->pstGlobalStatEx[tableidx].dn1totalRowCnts == 0) {
stats->stadndistinct = dndistinct;
return;
}
if (dndistinct < 0.0) {
dndistinct = -dndistinct * vacstmt->pstGlobalStatEx[tableidx].dn1totalRowCnts;
}
* The dndistinct may be inaccurately if default_statistics_target is negtive
* and target rows limited by work_mem, so minimume of dndistinct
* can't less than stadistinct / num_datanodes.
*/
stats->stadndistinct = Max(dndistinct, ceil(distinct / num_datanodes));
dntotalrows = (dndistinct > ceil(distinct / num_datanodes))
? vacstmt->pstGlobalStatEx[tableidx].dn1totalRowCnts
: ceil(vacstmt->pstGlobalStatEx[tableidx].totalRowCnts / num_datanodes);
if (stats->stadndistinct > 0.1 * dntotalrows) {
stats->stadndistinct = -(stats->stadndistinct / dntotalrows);
if (stats->stadndistinct < -1.0) {
stats->stadndistinct = -1.0;
}
}
}
* Description: Generates a table name for the auxiliary temp table that may be created during ANALYZE.
*
* Parameters:
* @in relationOid: relation oid
* @in attname: attribute name.
*
* Returns:
* sample table name. This must be pfree'd by the caller.
*/
static char* temporarySampleTableName(Oid relationOid, AnalyzeSampleTableSpecInfo* spec)
{
char tmpname[NAMEDATALEN];
errno_t ret;
if (spec != NULL) {
for (unsigned int i = 0; i < spec->stats->num_attrs; ++i) {
ret = snprintf_s(spec->v_alias[i],
NAMEDATALEN,
NAMEDATALEN - 1,
"v_%u_%lu_%ld_%d",
relationOid,
u_sess->debug_query_id,
GetCurrentTimestamp(),
spec->stats->attrs[i]->attnum);
securec_check_ss(ret, "\0", "\0");
}
if (spec->stats->num_attrs == 1) {
ret = snprintf_s(tmpname,
NAMEDATALEN,
NAMEDATALEN - 1,
"%s_%u_%lu_%ld_s%d",
ANALYZE_TEMP_TABLE_PREFIX,
relationOid,
u_sess->debug_query_id,
GetCurrentTimestamp(),
spec->stats->attrs[0]->attnum);
} else {
ret = snprintf_s(tmpname,
NAMEDATALEN,
NAMEDATALEN - 1,
"%s_%u_%lu_%ld_m%d_m%d",
ANALYZE_TEMP_TABLE_PREFIX,
relationOid,
u_sess->debug_query_id,
GetCurrentTimestamp(),
spec->stats->attrs[0]->attnum,
spec->stats->attrs[1]->attnum);
}
} else {
ret = snprintf_s(tmpname,
NAMEDATALEN,
NAMEDATALEN - 1,
"%s_%u_%lu_%ld",
ANALYZE_TEMP_TABLE_PREFIX,
relationOid,
u_sess->debug_query_id,
GetCurrentTimestamp());
}
securec_check_ss(ret, "\0", "\0");
return pstrdup(tmpname);
}
* Description: This method builds a sampled version of the given relation. The sampled
* version is created in a temp namespace. Note that ANALYZE can be
* executed only by the database or table user. It is safe to assume that the database
* user has permissions to create temp tables. The sampling is done by
* using the random() built-in function.
*
* Parameters:
* @in relid: relation to be sampled
* @in spec: the sample info of special attribute for compute statistic
* @in indexes: indexes of columns
* @in twocol_tmptable_name: temp table name of P(x1, x2) table
* @in left_tmptable_name: temp table name of P(x1) table
* @in right_tmptable_name: temp table name of P(x2) table
*
* Returns: temp table name (char*)
*/
char* build_temptable_sample_agg_joined(Oid relid, AnalyzeSampleTableSpecInfo* spec,
const uint32 *indexes, const char *twocol_tmptable_name, const char *leftcol_tmptable_name,
const char *rightcol_tmptable_name, int num_index)
{
StringInfoData str;
initStringInfo(&str);
char* sampleTableName = NULL;
sampleTableName = temporarySampleTableName(relid, NULL);
#ifdef ENABLE_MULTIPLE_NODES
appendStringInfo(&str,
"set query_dop = 4; create temp table %s ",
quote_identifier(sampleTableName));
#else
appendStringInfo(&str,
" create temp table %s ",
quote_identifier(sampleTableName));
#endif
if (es_is_type_supported_by_cstore(spec->stats)) {
if (g_instance.attr.attr_storage.enable_mix_replication || (!IS_DN_MULTI_STANDYS_MODE())) {
appendStringInfo(&str, "with(orientation=column) ");
}
}
appendStringInfo(&str, "as (select ");
appendStringInfo(&str,
"%s.v_count x1x2, %s.v_count x1, %s.v_count x2 from %s, %s, %s where %s.%s=%s.%s and %s.%s=%s.%s);",
quote_identifier(twocol_tmptable_name),
quote_identifier(leftcol_tmptable_name),
quote_identifier(rightcol_tmptable_name),
quote_identifier(twocol_tmptable_name),
quote_identifier(leftcol_tmptable_name),
quote_identifier(rightcol_tmptable_name),
quote_identifier(twocol_tmptable_name),
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes, 1),
quote_identifier(leftcol_tmptable_name),
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes, 1),
quote_identifier(twocol_tmptable_name),
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes+1, 1),
quote_identifier(rightcol_tmptable_name),
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes+1, 1));
#ifdef ENABLE_MULTIPLE_NODES
int saved_query_dop = u_sess->opt_cxt.query_dop;
appendStringInfo(&str, "set query_dop = %d", saved_query_dop);
#endif
elog(ES_LOGLEVEL, "[Sample Table Query] %s", str.data);
spi_exec_with_callback(DestSPI, str.data, false, 0, false, (void (*)(void*))NULL, NULL);
pfree_ext(str.data);
return sampleTableName;
}
* Description: This method builds a sampled version of the given relation. The sampled
* version is created in a temp namespace. Note that ANALYZE can be
* executed only by the database or table user. It is safe to assume that the database
* user has permissions to create temp tables. The sampling is done by
* using the random() built-in function.
*
* Parameters:
* @in relid: relation to be sampled
* @in analyzemode: the table type which collect statistics information
* @in inh: is inherit table or not
* @in vacstmt: the statment for analyze or vacuum command
* @in spec: the sample info of special attribute for compute statistic
* @in indexes: indexes of columns
* @in num_index: number of columns
*
* Returns: temp table name (char*)
*/
char* build_temptable_sample_agg(Oid relid, AnalyzeMode analyzemode, bool inh,
VacuumStmt* vacstmt, AnalyzeSampleTableSpecInfo* spec, const uint32 *indexes,
int num_index)
{
StringInfoData str;
const char* schemaName = NULL;
const char* tableName = NULL;
char* sampleTableName = NULL;
Relation onerel = NULL;
Relation mianrel = NULL;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
onerel = relation_open(relid, AccessShareLock);
schemaName = get_namespace_name(get_rel_namespace(relid));
tableName = get_rel_name(relid);
initStringInfo(&str);
sampleTableName = temporarySampleTableName(relid, NULL);
* This query orders the table by rank on the value and chooses values that occur every
* 'bucketSize' interval. It also chooses the maximum value from the relation. It then
* removes duplicate values and orders the values in ascending order. This corresponds to
* the histogram.
* We should set hashagg_table_size before query because distinct value of current attribute
* may be not accurate which result in the performance of query degread.
*/
#ifdef ENABLE_MULTIPLE_NODES
appendStringInfo(&str,
"set query_dop = 4; create temp table %s ",
quote_identifier(sampleTableName));
#else
appendStringInfo(&str,
" create temp table %s ",
quote_identifier(sampleTableName));
#endif
if (es_is_type_supported_by_cstore(spec->stats)) {
if (g_instance.attr.attr_storage.enable_mix_replication || (!IS_DN_MULTI_STANDYS_MODE())) {
appendStringInfo(&str, "with(orientation=column) ");
}
}
#ifdef ENABLE_MULTIPLE_NODES
if (es_is_type_distributable(spec->stats)) {
appendStringInfo(
&str, "distribute by hash(%s) as (select ",
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes, num_index));
} else {
appendStringInfo(&str, "distribute by hash(i) as (select (random()*32767)::int2 as i, ");
}
#else
appendStringInfo(&str, "as (select ");
#endif
if (vacstmt->tmpSampleTblNameList) {
Relation temp_relation;
const char* tmpSampleTableName = get_sample_tblname(analyzemode, vacstmt->tmpSampleTblNameList);
temp_relation = relation_open(RelnameGetRelid(tmpSampleTableName), AccessShareLock);
appendStringInfo(&str,
"%s, count(*) as v_count "
"from %s %s group by %s);",
get_column_name_alias(spec, ES_COLUMN_NAME | ES_COLUMN_ALIAS, indexes, num_index),
(!inh && RelationIsPAXFormat(onerel)) ? "only" : "",
quote_identifier(tmpSampleTableName),
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes, num_index));
SetUserIdAndSecContext(temp_relation->rd_rel->relowner, 0);
relation_close(temp_relation, AccessShareLock);
} else {
appendStringInfo(&str,
"%s, count(*) as v_count "
"from %s %s.%s where random() <= %.2f group by %s);",
get_column_name_alias(spec, ES_COLUMN_NAME | ES_COLUMN_ALIAS, indexes, num_index),
(!inh && RelationIsPAXFormat(onerel)) ? "only" : "",
quote_identifier(schemaName),
quote_identifier(tableName),
vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate,
get_column_name_alias(spec, ES_COLUMN_ALIAS, indexes, num_index));
SetUserIdAndSecContext(onerel->rd_rel->relowner, 0);
}
#ifdef ENABLE_MULTIPLE_NODES
int saved_query_dop = u_sess->opt_cxt.query_dop;
appendStringInfo(&str, "set query_dop = %d", saved_query_dop);
#endif
if (mianrel)
relation_close(mianrel, AccessShareLock);
relation_close(onerel, AccessShareLock);
elog(ES_LOGLEVEL, "[Sample Table Query] %s", str.data);
spi_exec_with_callback(DestSPI, str.data, false, 0, false, (void (*)(void*))NULL, NULL);
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Created sample table %s success. querystring: %s", sampleTableName, str.data);
pfree_ext(str.data);
pfree_ext(tableName);
pfree_ext(schemaName);
return sampleTableName;
}
* Description: This method builds a sampled version of the given relation. The sampled
* version is created in a temp namespace. Note that ANALYZE can be
* executed only by the database or table user. It is safe to assume that the database
* user has permissions to create temp tables. The sampling is done by
* using the random() built-in function.
*
* Parameters:
* @in relid: relation to be sampled
* @in type: identify create temp table for attribute or table
* @in analyzemode: the table type which collect statistics information
* @in inh: is inherit table or not
* @in vacstmt: the statment for analyze or vacuum command
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: temp table name (char*)
*/
char* buildTempSampleTable(Oid relid, Oid mian_relid, TempSmpleTblType type, AnalyzeMode analyzemode, bool inh,
VacuumStmt* vacstmt, AnalyzeSampleTableSpecInfo* spec)
{
StringInfoData str;
const char* schemaName = NULL;
const char* tableName = NULL;
char* sampleTableName = NULL;
Relation onerel = NULL;
Relation mianrel = NULL;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
onerel = relation_open(relid, AccessShareLock);
if (OidIsValid(mian_relid)) {
AssertEreport(TempSmpleTblType_Table == type, MOD_OPT, "");
mianrel = relation_open(mian_relid, AccessShareLock);
schemaName = get_namespace_name(get_rel_namespace(mian_relid));
tableName = get_rel_name(mian_relid);
} else {
schemaName = get_namespace_name(get_rel_namespace(relid));
tableName = get_rel_name(relid);
}
initStringInfo(&str);
if (TempSmpleTblType_Attrbute == type) {
int saved_query_dop = u_sess->opt_cxt.query_dop;
sampleTableName = temporarySampleTableName(relid, spec);
* This query orders the table by rank on the value and chooses values that occur every
* 'bucketSize' interval. It also chooses the maximum value from the relation. It then
* removes duplicate values and orders the values in ascending order. This corresponds to
* the histogram.
* We should set hashagg_table_size before query because distinct value of current attribute
* may be not accurate which result in the performance of query degread.
*/
appendStringInfo(&str,
"%s create temp table %s ",
IS_SINGLE_NODE ? "" : "set query_dop = 4;",
quote_identifier(sampleTableName));
if (es_is_type_supported_by_cstore(spec->stats)) {
if (g_instance.attr.attr_storage.enable_mix_replication || (!IS_DN_MULTI_STANDYS_MODE())) {
appendStringInfo(&str, "with(orientation=column) ");
}
}
if (IS_SINGLE_NODE) {
appendStringInfo(&str, "as (select ");
} else if (es_is_type_distributable(spec->stats)) {
appendStringInfo(
&str, "distribute by hash(%s) as (select ", es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
} else {
appendStringInfo(&str, "distribute by hash(i) as (select (pg_catalog.random()*32767)::int2 as i, ");
}
if (vacstmt->tmpSampleTblNameList) {
Relation temp_relation;
const char* tmpSampleTableName = get_sample_tblname(analyzemode, vacstmt->tmpSampleTblNameList);
temp_relation = relation_open(RelnameGetRelid(tmpSampleTableName), AccessShareLock);
appendStringInfo(&str,
"%s, count(*) as v_count "
"from %s %s group by %s);",
es_get_column_name_alias(spec, ES_COLUMN_NAME | ES_COLUMN_ALIAS),
(!inh && RelationIsPAXFormat(onerel)) ? "only" : "",
quote_identifier(tmpSampleTableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
SetUserIdAndSecContext(temp_relation->rd_rel->relowner, 0);
relation_close(temp_relation, AccessShareLock);
} else {
appendStringInfo(&str,
"%s, pg_catalog.count(*) as v_count "
"from %s %s.%s where pg_catalog.random() <= %.2f group by %s);",
es_get_column_name_alias(spec, ES_COLUMN_NAME | ES_COLUMN_ALIAS),
(!inh && RelationIsPAXFormat(onerel)) ? "only" : "",
quote_identifier(schemaName),
quote_identifier(tableName),
vacstmt->pstGlobalStatEx[vacstmt->tableidx].sampleRate,
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
SetUserIdAndSecContext(onerel->rd_rel->relowner, 0);
}
appendStringInfo(&str, "set query_dop = %d", saved_query_dop);
} else {
char* group_name = get_pgxc_groupname(get_pgxc_class_groupoid(relid));
const char* redistribution_group_name = PgxcGroupGetInRedistributionGroup();
* Unable to create temp table on old installation group while
* in cluster-resizing under analyzing.
*/
if (redistribution_group_name != NULL && group_name != NULL &&
strcmp(group_name, redistribution_group_name) == 0) {
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC,
"Unable to create temp table on old installation group \"%s\" while in cluster-resizing "
"under analyzing for table \"%s\".",
group_name,
quote_identifier(tableName));
relation_close(onerel, AccessShareLock);
pfree_ext(tableName);
pfree_ext(schemaName);
return NULL;
}
sampleTableName = temporarySampleTableName(relid, NULL);
if (NULL == group_name) {
appendStringInfo(&str,
"create temp table %s (like %s.%s INCLUDING DISTRIBUTION);",
quote_identifier(sampleTableName),
quote_identifier(schemaName),
quote_identifier(tableName));
} else {
appendStringInfo(&str,
"create temp table %s (like %s.%s INCLUDING DISTRIBUTION) to group %s;",
quote_identifier(sampleTableName),
quote_identifier(schemaName),
quote_identifier(tableName),
quote_identifier(group_name));
pfree_ext(group_name);
}
}
if (mianrel)
relation_close(mianrel, AccessShareLock);
relation_close(onerel, AccessShareLock);
elog(ES_LOGLEVEL, "[Sample Table Query] %s", str.data);
u_sess->analyze_cxt.is_under_analyze = true;
spi_exec_with_callback(DestSPI, str.data, false, 0, false, (void (*)(void*))NULL, NULL);
u_sess->analyze_cxt.is_under_analyze = false;
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Created sample table %s success. querystring: %s", sampleTableName, str.data);
pfree_ext(str.data);
pfree_ext(tableName);
pfree_ext(schemaName);
return sampleTableName;
}
* Description: Drop the temp table created during ANALYZE.
*
* Parameters:
* @in tableName: temp table name
*
* Returns: void
*/
void dropSampleTable(const char* tableName)
{
StringInfoData str;
initStringInfo(&str);
appendStringInfo(&str, "drop table %s", quote_identifier(tableName));
spi_exec_with_callback(DestSPI, str.data, false, 0, false, (void (*)(void*))NULL, NULL);
pfree_ext(str.data);
elog(DEBUG1, "ANALYZE dropping sample table: %s", tableName);
}
* Description: This routine creates an array from one of the result attributes after an SPI call.
* It allocates this array in the specified context. Note that, in general, the allocation
* context must not the SPI context because that is likely to get cleaned out soon.
*
* Parameters:
* @in attrno: attribute to flatten into an array (1 based)
* @in memory_context: memory context for statistic of column
*
* Returns: array of attribute type
*/
static ArrayBuildState* spi_get_result_array(int attrno, MemoryContext memory_context)
{
ArrayBuildState* result = NULL;
Form_pg_attribute attribute = &SPI_tuptable->tupdesc->attrs[attrno];
AssertEreport(attribute, MOD_OPT, "");
for (uint32 i = 0; i < SPI_processed; i++) {
Datum dValue = 0;
bool isnull = false;
dValue = tableam_tops_tuple_getattr(SPI_tuptable->vals[i], attrno + 1, SPI_tuptable->tupdesc, &isnull);
* Add this value to the result array.
*/
result = accumArrayResult(result, dValue, isnull, attribute->atttypid, memory_context);
}
return result;
}
* Description: A callback function for use with spiExecuteWithCallback.
* Reads each column of output into an array The number of arrays,
* and the output location are determined by treating *clientData.
*
* Parameters:
* @in clientData: as a EachResultColumnAsArraySpec
*
* Returns: void
*/
static void spi_callback_get_multicolarray(void* clientData)
{
AnalyzeResultMultiColAsArraySpecInfo* spec = (AnalyzeResultMultiColAsArraySpecInfo*)clientData;
ArrayBuildState** out = spec->output;
AssertEreport(SPI_tuptable != NULL, MOD_OPT, "");
AssertEreport(SPI_tuptable->tupdesc, MOD_OPT, "");
* Save the copy of tuple desc for check whether the attribute of received from datanode
* is the same with local attribute for analyzed.
*/
if (spec->num_rows == 0) {
MemoryContext oldcontext = MemoryContextSwitchTo(spec->memoryContext);
spec->spi_tupDesc = CreateTupleDescCopy(SPI_tuptable->tupdesc);
(void)MemoryContextSwitchTo(oldcontext);
}
if (SPI_processed == 0)
return;
if (spec->compute_histgram) {
compute_histgram_internal(spec);
spec->num_rows += SPI_processed;
return;
}
for (int i = 0; i < spec->num_cols; i++) {
*out = spi_get_result_array(i, spec->memoryContext);
AssertEreport(*out, MOD_OPT, "");
out++;
}
spec->num_rows += SPI_processed;
}
* Description: A callback function for use with spiExecuteWithCallback.
* Asserts that exactly one row was returned.
* Gets the row's first column as a float, using 0.0 if the value is null.
*
* Parameters:
* @in clientData: as a EachResultColumnAsArraySpec
*
* Returns: void
*/
static void spi_callback_get_singlerow(void* clientData)
{
Datum datum_f;
bool isnull = false;
int64* out = (int64*)clientData;
if (0 == SPI_processed) {
*out = 0;
} else {
AssertEreport(SPI_tuptable != NULL, MOD_OPT, "must have result");
AssertEreport(SPI_processed == 1, MOD_OPT, "we expect only one tuple");
AssertEreport(SPI_tuptable->tupdesc->attrs[0].atttypid == INT8OID, MOD_OPT, "");
datum_f = tableam_tops_tuple_getattr(SPI_tuptable->vals[0], 1, SPI_tuptable->tupdesc, &isnull);
if (isnull) {
*out = 0;
} else {
*out = DatumGetInt64(datum_f);
}
}
}
* Description: Computes the total rows in sample table.
*
* Parameters:
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_compute_samplerows(const char* tableName, AnalyzeSampleTableSpecInfo* spec)
{
int64 samplerows = 0;
StringInfoData str;
initStringInfo(&str);
appendStringInfo(&str, "select pg_catalog.sum(v_count)::int8 as samplerows from %s;", quote_identifier(tableName));
elog(ES_LOGLEVEL, "[Query to compute samplerows] : %s", str.data);
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
spi_exec_with_callback(DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_singlerow, &samplerows);
pfree_ext(str.data);
spec->samplerows = samplerows;
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Compute samplerows = %ld for table %s success.", spec->samplerows, tableName);
}
* Description: Computes the absolute number of NULLs in the sample table.
*
* Parameters:
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_compute_nullcount(const char* tableName, AnalyzeSampleTableSpecInfo* spec)
{
int64 null_count = 0;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
if (es_is_not_null(spec)) {
spec->stats->stanullfrac = 0.0;
} else {
StringInfoData str;
initStringInfo(&str);
* For multi-column statistics, the definition of composit NULL values are all
* elements are NULL, and also use this result to compute nullfrac
*/
appendStringInfo(&str,
"select pg_catalog.sum(v_count)::int8 from %s where %s limit 1;",
quote_identifier(tableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is null "));
elog(ES_LOGLEVEL, "[Query to compute null count] : %s", str.data);
spi_exec_with_callback(
DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_singlerow, &null_count);
pfree_ext(str.data);
spec->stats->stanullfrac = Min((null_count * 1.0) / spec->samplerows, 1.0);
}
spec->null_cnt = null_count;
spec->nonnull_cnt = spec->samplerows - spec->null_cnt;
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Compute nullfrac = %.6f for table %s success"
"null_count = %ld, samplerows = %ld.",
spec->stats->stanullfrac,
tableName,
null_count,
spec->samplerows);
}
* Description: Computes average width of an attribute. This uses the built-in pg_column_size.
*
* Parameters:
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_compute_avgwidth(const char* tableName, AnalyzeSampleTableSpecInfo* spec)
{
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
if (spec->is_varwidth) {
if (spec->nonnull_cnt == 0 && spec->null_cnt > 0) {
spec->stats->stawidth = 0;
} else {
int64 avgwidth = 0;
StringInfoData str;
initStringInfo(&str);
appendStringInfo(&str,
"select (pg_catalog.sum((%s) * v_count)::int8 / pg_catalog.sum(v_count)::int8)::int8 "
"from %s where %s;",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " + ", "pg_catalog.pg_column_size(", ")"),
quote_identifier(tableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is not null "));
elog(ES_LOGLEVEL, "[Query to compute avgwidth] : %s", str.data);
spi_exec_with_callback(
DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_singlerow, &avgwidth);
pfree_ext(str.data);
spec->stats->stawidth = avgwidth;
}
} else {
spec->stats->stawidth = 0;
for (unsigned int i = 0; i < spec->stats->num_attrs; ++i) {
spec->stats->stawidth += spec->stats->attrs[i]->attlen;
}
}
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Compute avgwidth = %d for table %s success.", spec->stats->stawidth, tableName);
}
* Description: Compute NDistinct of a column using sample table.
*
* Parameters:
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_distinct(const char* tableName, AnalyzeSampleTableSpecInfo* spec)
{
StringInfoData str;
ArrayBuildState* spiResult[DEFAULT_COLUMN_NUM];
AnalyzeResultMultiColAsArraySpecInfo result;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
initStringInfo(&str);
appendStringInfo(&str,
"select pg_catalog.count(*) as ndistinct, "
"(select pg_catalog.count(*) from %s where v_count > 1) as nmultiple "
"from %s where NOT (%s);",
quote_identifier(tableName),
quote_identifier(tableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is null "));
elog(ES_LOGLEVEL, "[Query to compute distinct] : %s", str.data);
result.num_cols = DEFAULT_COLUMN_NUM;
result.output = spiResult;
result.memoryContext = CurrentMemoryContext;
result.compute_histgram = false;
result.num_rows = 0;
spi_exec_with_callback(DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_multicolarray, &result);
pfree_ext(str.data);
if (result.num_rows > 0) {
spec->ndistinct = spiResult[0]->dvalues[0];
spec->nmultiple = spiResult[1]->dvalues[0];
if (spec->nonnull_cnt > 0) {
analyze_compute_ndistinct(
spec->nmultiple, spec->ndistinct, 0, spec->samplerows, spec->totalrows, spec->stats);
} else {
spec->stats->stadistinct = 0.0;
}
}
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC,
"Compute global_ndistinct = %.0f for table %s success, "
"sample_ndistinct = %.0f, nmultiple = %ld, nonnull_cnt = %ld, "
"samplerows = %ld, totalrows = %.0f.",
spec->stats->stadistinct,
tableName,
spec->ndistinct,
spec->nmultiple,
spec->nonnull_cnt,
spec->samplerows,
spec->totalrows);
}
* Description: Computes the most common values and their frequencies for relation.
*
* Parameters:
* @in slot_idx: index of statistic struct
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_compute_mcv(
int* slot_idx, const char* tableName, AnalyzeSampleTableSpecInfo* spec, bool process_null)
{
if (process_null) {
if (*slot_idx >= STATISTIC_NUM_SLOTS) {
elog(WARNING, "statistics read max stavalue slot");
return;
}
* Check if the to-be analyzed multi-column is defined with NOT-NULL constraint so
* that we can optimize out the null-mcv steps
*/
bool mcv_not_null = true;
for (int i = 0; i < (int)spec->stats->num_attrs; i++) {
if (!spec->stats->attrs[i]->attnotnull) {
mcv_not_null = false;
break;
}
}
if (mcv_not_null) {
elog(DEBUG3, "not need to collect null-mcv as each attributes is NOT NULL");
return;
}
}
double mincount;
ArrayBuildState** spiResult = (ArrayBuildState**)palloc0(sizeof(ArrayBuildState*) * (spec->stats->num_attrs + 1));
StringInfoData str;
AnalyzeResultMultiColAsArraySpecInfo result;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
mincount = compute_mcv_mincount(spec->samplerows, spec->ndistinct, spec->mcv_list.stattarget);
mincount = Max(mincount, MIN_MCV_COUNT);
initStringInfo(&str);
if (process_null) {
appendStringInfo(&str,
"select %s, v_count from %s where not (%s) and not (%s)"
"and v_count >= %.0f order by v_count desc, %s limit 100;",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS),
quote_identifier(tableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is null "),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is not null "),
mincount,
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
} else {
appendStringInfo(&str,
"select %s, v_count from %s where %s"
"and v_count >= %.0f order by v_count desc, %s limit 100;",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS),
quote_identifier(tableName),
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is not null "),
mincount,
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
}
elog(ES_LOGLEVEL, "[Query to compute MCV] : %s", str.data);
result.num_cols = spec->stats->num_attrs + 1;
result.output = spiResult;
result.memoryContext = CurrentMemoryContext;
result.compute_histgram = false;
result.num_rows = 0;
result.spi_tupDesc = NULL;
spi_exec_with_callback(DestSPI, str.data, false, 0, true, (void (*)(void*))spi_callback_get_multicolarray, &result);
pfree_ext(str.data);
spec->mcv_list.num_mcv = result.num_rows;
* Check whether the attribute of received from datanode
* is the same with local attribute for analyzed or not.
*/
if (spec->stats->num_attrs == 1 && result.spi_tupDesc &&
(result.spi_tupDesc->attrs[0].atttypid != spec->stats->attrs[0]->atttypid ||
result.spi_tupDesc->attrs[0].atttypmod != spec->stats->attrs[0]->atttypmod ||
result.spi_tupDesc->attrs[0].attlen != spec->stats->attrs[0]->attlen)) {
ereport(WARNING,
(errmsg("The tupleDesc analyzed on %s is different from tupleDesc which received from datanode "
"when computing mcv.",
g_instance.attr.attr_common.PGXCNodeName),
errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.",
NameStr(spec->stats->attrs[0]->attname),
format_type_be(spec->stats->attrs[0]->atttypid),
format_type_be(result.spi_tupDesc->attrs[0].atttypid))));
FreeTupleDesc(result.spi_tupDesc);
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC, "Compute MCV for table %s failed.", tableName);
return;
}
if (result.num_rows > 0) {
Datum* mcv_values = NULL;
bool* mcv_nulls = NULL;
float4* mcv_freqs = NULL;
MemoryContext old_context;
old_context = MemoryContextSwitchTo(spec->stats->anl_context);
mcv_values = (Datum*)palloc((result.num_rows * spec->stats->num_attrs) * sizeof(Datum));
mcv_nulls = (bool*)palloc((result.num_rows * spec->stats->num_attrs) * sizeof(bool));
mcv_freqs = (float4*)palloc(result.num_rows * sizeof(float4));
for (int64 i = 0; i < result.num_rows; i++) {
for (unsigned int j = 0; j < spec->stats->num_attrs; ++j) {
int index = i + j * result.num_rows;
mcv_values[index] = datumCopy(
spiResult[j]->dvalues[i], spec->stats->attrtype[j]->typbyval, spec->stats->attrtype[j]->typlen);
mcv_nulls[index] = spiResult[j]->dnulls[i];
}
mcv_freqs[i] =
(double)DatumGetInt64(spiResult[spec->stats->num_attrs]->dvalues[i]) / (double)spec->samplerows;
spec->mcv_list.rows_mcv += DatumGetInt64(spiResult[spec->stats->num_attrs]->dvalues[i]);
}
(void)MemoryContextSwitchTo(old_context);
if (process_null) {
spec->stats->stakind[*slot_idx] = STATISTIC_KIND_NULL_MCV;
} else {
spec->stats->stakind[*slot_idx] = STATISTIC_KIND_MCV;
}
spec->stats->staop[*slot_idx] =
(1 == spec->stats->num_attrs) ? ((StdAnalyzeData*)spec->stats->extra_data)->eqopr : 0;
spec->stats->stanumbers[*slot_idx] = mcv_freqs;
spec->stats->numnumbers[*slot_idx] = result.num_rows;
spec->stats->stavalues[*slot_idx] = mcv_values;
spec->stats->stanulls[*slot_idx] = mcv_nulls;
spec->stats->numvalues[*slot_idx] = result.num_rows * spec->stats->num_attrs;
* Accept the defaults for stats->statypid and others. They have
* been set before we were called (see vacuum.h)
*/
(*slot_idx)++;
* Save the last mcv as the start of the histgram value.
* We only collect histgram in sigle column scenario
*/
spec->hist_list.start_value = datumCopy(spiResult[0]->dvalues[result.num_rows - 1],
spec->stats->attrtype[0]->typbyval,
spec->stats->attrtype[0]->typlen);
spec->hist_list.start_value_count = spiResult[1]->dvalues[result.num_rows - 1];
}
pfree_ext(spiResult);
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC,
"Compute %d rows of MCV for table %s success.",
spec->mcv_list.num_mcv, tableName);
}
* Description: Get each tuples received from datanodes,
* saved in histogram list if the sum of count reach bucketsize.
*
* Parameters:
* @in spec: the sample info of special attribute for compute histogram
*
* Returns: void
*/
static void compute_histgram_internal(AnalyzeResultMultiColAsArraySpecInfo* spec)
{
#define VALUE_COLUMN 1
#define ROWCOUNT_COLUMN 2
MemoryContext oldContext;
int64 vcount;
Datum histgramValue;
TupleDesc tupDesc = spec->spi_tupDesc;
bool isnull = false;
int16 valueTyplen = 0;
bool valueTypbyval = false;
char valueTypalign = 0;
#define GET_DETOAST_HISTGRAM_VALUE(val) \
do { \
(val) = tableam_tops_tuple_getattr(SPI_tuptable->vals[i], VALUE_COLUMN, tupDesc, &isnull); \
if (!isnull && !valueTypbyval) { \
oldContext = MemoryContextSwitchTo(u_sess->analyze_cxt.analyze_context); \
if (valueTyplen < 0) \
(val) = PointerGetDatum(PG_DETOAST_DATUM_COPY((val))); \
else \
(val) = datumCopy((val), valueTypbyval, valueTyplen); \
(void)MemoryContextSwitchTo(oldContext); \
} \
} while (0)
get_typlenbyvalalign(tupDesc->attrs[VALUE_COLUMN - 1].atttypid, &valueTyplen, &valueTypbyval, &valueTypalign);
for (uint32 i = 0; i < SPI_processed; i++) {
vcount = tableam_tops_tuple_getattr(SPI_tuptable->vals[i], ROWCOUNT_COLUMN, tupDesc, &isnull);
spec->hist_list.sum_count += vcount;
spec->hist_list.cur_mcv_idx++;
* Save the value to histgram list either the sum of count exceed
* the bucketSize or the current value is the first or the last value.
*/
if (spec->hist_list.num_hist < spec->hist_list.stattarget &&
(spec->hist_list.num_hist == 0 || spec->hist_list.cur_mcv_idx == spec->non_mcv_num ||
spec->hist_list.sum_count >= spec->hist_list.bucketSize)) {
GET_DETOAST_HISTGRAM_VALUE(histgramValue);
spec->hist_list.histitem[spec->hist_list.num_hist].value = histgramValue;
spec->hist_list.histitem[spec->hist_list.num_hist].count = vcount;
spec->hist_list.rows_hist += vcount;
spec->hist_list.num_hist++;
* compute remainder sum of count current value in order to
* compute next histgram value.
*/
spec->hist_list.sum_count = 0;
}
* The histgram value is the last value(as the max value),
* save it to the last location of hist list.
*/
if (spec->hist_list.num_hist >= spec->hist_list.stattarget &&
spec->hist_list.cur_mcv_idx == spec->non_mcv_num) {
GET_DETOAST_HISTGRAM_VALUE(histgramValue);
spec->hist_list.histitem[spec->hist_list.num_hist - 1].value = histgramValue;
spec->hist_list.histitem[spec->hist_list.num_hist - 1].count = vcount;
spec->hist_list.rows_hist += vcount;
}
}
}
* Description: Save all the values in histogram list to statistic struct.
*
* Parameters:
* @in slot_idx: index of statistic struct
* @in hist_list: histogram list
* @in stats: statistic info of one attribute for analyze
*
* Returns: void
*/
static void compute_histgram_final(int* slot_idx, HistgramInfo* hist_list, VacAttrStats* stats)
{
if (hist_list->num_hist < 2) {
return;
}
Datum* hist_values = NULL;
MemoryContext old_context;
old_context = MemoryContextSwitchTo(stats->anl_context);
hist_values = (Datum*)palloc(MAX_ATTR_HIST_STAT_TARGET * sizeof(Datum));
* We should restrict and adjust the num of histgram
* if there are more than 100 histgram values in hist_list.
*/
for (int i = 0; i < hist_list->num_hist; i++) {
hist_values[i] =
datumCopy(hist_list->histitem[i].value, stats->attrtype[0]->typbyval, stats->attrtype[0]->typlen);
}
(void)MemoryContextSwitchTo(old_context);
stats->stakind[*slot_idx] = STATISTIC_KIND_HISTOGRAM;
stats->staop[*slot_idx] = ((StdAnalyzeData*)stats->extra_data)->ltopr;
stats->stavalues[*slot_idx] = hist_values;
stats->numvalues[*slot_idx] = hist_list->num_hist;
* Accept the defaults for stats->statypid and others. They have
* been set before we were called (see vacuum.h)
*/
(*slot_idx)++;
}
* Description: Compute histogram entries for relation.
*
* Parameters:
* @in slot_idx: index of statistic struct
* @in tableName: temp table name
* @in spec: the sample info of special attribute for compute statistic
*
* Returns: void
*/
static void analyze_compute_histgram(int* slot_idx, const char* tableName, AnalyzeSampleTableSpecInfo* spec)
{
#define ANAYLZE_MAX_STAWIDTH 20
* Only save the length of 20 for histgram value if the type of attribute is
* varchar/char/bpchar and the max width more than 20.
*/
#define CHAR_WIDTH_EXCEED_20(spec) \
(((VARCHAROID == (spec)->stats->attrs[0]->atttypid) || (CHAROID == (spec)->stats->attrs[0]->atttypid) || \
(BPCHAROID == (spec)->stats->attrs[0]->atttypid)) && \
(spec)->stats->stawidth >= ANAYLZE_MAX_STAWIDTH)
int64 non_mcv_num = 0;
int64 non_mcv_rows = 0;
StringInfoData str;
ArrayBuildState* spiResult[2];
AnalyzeResultMultiColAsArraySpecInfo result;
bool is_attr_diff = false;
DEBUG_MOD_START_TIMER(MOD_AUTOVAC);
non_mcv_rows = spec->nonnull_cnt - spec->mcv_list.rows_mcv;
non_mcv_num = spec->ndistinct - spec->mcv_list.num_mcv;
if (non_mcv_num > 2) {
if (non_mcv_num > spec->hist_list.stattarget) {
spec->hist_list.bucketSize = rint(non_mcv_rows / spec->hist_list.stattarget);
} else {
spec->hist_list.bucketSize = 1;
}
initStringInfo(&str);
if (!CHAR_WIDTH_EXCEED_20(spec)) {
appendStringInfo(&str,
"select %s, v_count from %s ",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS),
quote_identifier(tableName));
} else {
appendStringInfo(&str,
"select pg_catalog.substr(%s,1,20) as sub_v, v_count from %s ",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS),
quote_identifier(tableName));
}
if (spec->mcv_list.num_mcv > 0) {
bool typisvarlena = false;
Oid foutoid;
char* start_value = NULL;
getTypeOutputInfo(spec->stats->attrtypid[0], &foutoid, &typisvarlena);
start_value = OidOutputFunctionCall(foutoid, spec->hist_list.start_value);
appendStringInfo(&str,
"where NOT(%s) and "
"(v_count < %ld or (v_count = %ld and %s > %s)) %s order by %s;",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " AND ", " ", " is null "),
spec->hist_list.start_value_count,
spec->hist_list.start_value_count,
es_get_column_name_alias(spec, ES_COLUMN_ALIAS),
quote_literal_cstr(start_value),
!CHAR_WIDTH_EXCEED_20(spec) ? "" : "and sub_v is not null",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
} else {
appendStringInfo(&str,
"where NOT (%s) %s order by %s;",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS, " and ", "", " is null "),
!CHAR_WIDTH_EXCEED_20(spec) ? "" : "and sub_v is not null",
es_get_column_name_alias(spec, ES_COLUMN_ALIAS));
}
elog(ES_LOGLEVEL, "[Query to compute histgram] %s", str.data);
result.num_cols = 2;
result.output = spiResult;
result.memoryContext = CurrentMemoryContext;
result.compute_histgram = true;
result.non_mcv_num = non_mcv_num;
result.hist_list = spec->hist_list;
result.num_rows = 0;
result.spi_tupDesc = NULL;
spi_exec_with_callback(
DestSPITupleAnalyze, str.data, false, 0, false, (void (*)(void*))spi_callback_get_multicolarray, &result);
pfree_ext(str.data);
if (!CHAR_WIDTH_EXCEED_20(spec)) {
is_attr_diff =
result.spi_tupDesc && (result.spi_tupDesc->attrs[0].atttypid != spec->stats->attrs[0]->atttypid ||
result.spi_tupDesc->attrs[0].atttypmod != spec->stats->attrs[0]->atttypmod ||
result.spi_tupDesc->attrs[0].attlen != spec->stats->attrs[0]->attlen);
} else {
is_attr_diff = result.spi_tupDesc && (result.spi_tupDesc->attrs[0].atttypid != TEXTOID ||
result.spi_tupDesc->attrs[0].atttypmod != -1 ||
result.spi_tupDesc->attrs[0].attlen != -1);
}
* Check whether the attribute of received from datanode
* is the same with local attribute for analyzed or not.
*/
if (is_attr_diff) {
ereport(WARNING,
(errmsg("The tupleDesc analyzed on %s is different from tupleDesc which received from datanode "
"when computing histgram.",
g_instance.attr.attr_common.PGXCNodeName),
errdetail("Attribute \"%s\" of type %s does not match corresponding attribute of type %s.",
NameStr(spec->stats->attrs[0]->attname),
format_type_be(spec->stats->attrs[0]->atttypid),
format_type_be(result.spi_tupDesc->attrs[0].atttypid))));
FreeTupleDesc(result.spi_tupDesc);
return;
}
if (result.num_rows > 0) {
compute_histgram_final(slot_idx, &result.hist_list, spec->stats);
}
}
DEBUG_MOD_STOP_TIMER(MOD_AUTOVAC,
"Compute %d rows of Histgram for table %s success.",
result.hist_list.num_hist, tableName);
}
* Description: Analyze one attribute with execute query,
* and compute distinct/mcv/histgram with sample tuple.
*
* Parameters:
* @in inh: is inherit table for analzye or not
* @in onerel: the relation for analyze or vacuum
* @in vacstmt: the statment for analyze or vacuum command
* @in stats: statistics for one attribute
* @in numTotalRows: total rows for the relation
* @in numSampleRows: the num of sample rows
* @in analyzemode: identify the relation type for analyze
*
* Returns: void
*/
static void do_analyze_compute_attr_stats(bool inh, Relation onerel, VacuumStmt* vacstmt, VacAttrStats* stats,
double numTotalRows, int64 numSampleRows, AnalyzeMode analyzemode)
{
int slot_idx = 0;
bool computeDistinct = true;
bool computeMCV = true;
bool computeHist = true;
Oid ltopr;
Oid save_userid;
int save_sec_context;
AnalyzeSampleTableSpecInfo spec;
char* tableName = NULL;
init_sample_table_spec_info(&spec, stats, numTotalRows, numSampleRows);
GetUserIdAndSecContext(&save_userid, &save_sec_context);
tableName =
buildTempSampleTable(onerel->rd_id, InvalidOid, TempSmpleTblType_Attrbute, analyzemode, inh, vacstmt, &spec);
u_sess->analyze_cxt.is_under_analyze = true;
analyze_compute_samplerows(tableName, &spec);
analyze_compute_nullcount(tableName, &spec);
if (ceil(spec.null_cnt) >= ceil(spec.samplerows)) {
* All values are null. no point computing other statistics.
*/
computeDistinct = false;
computeMCV = false;
computeHist = false;
}
if (spec.stats->attrs[0]->atttypid == BOOLOID) {
spec.stats->stadistinct = 2;
computeDistinct = false;
computeHist = false;
}
if (computeDistinct) {
analyze_distinct(tableName, &spec);
if (fabs(spec.stats->stadistinct + 1.0) < EPSILON) {
computeMCV = false;
}
}
analyze_compute_avgwidth(tableName, &spec);
if (spec.stats->stawidth > WIDTH_THRESHOLD) {
* Extremely wide columns are considered to be fully distinct. See comments
* against WIDTH_THRESHOLD
*/
computeMCV = false;
computeHist = false;
}
if (computeMCV && !(u_sess->attr.attr_sql.enable_ai_stats &&
u_sess->attr.attr_sql.multi_stats_type == BAYESNET_OPTION)) {
analyze_compute_mcv(&slot_idx, tableName, &spec);
}
if (computeHist) {
if (spec.stats->num_attrs > 1) {
computeHist = false;
}
get_sort_group_operators(stats->attrtypid[0], false, false, false, <opr, NULL, NULL, NULL);
if (!OidIsValid(ltopr)) {
computeHist = false;
}
}
if (computeHist) {
analyze_compute_histgram(&slot_idx, tableName, &spec);
}
if (spec.stats->num_attrs == 1 && IS_PGXC_COORDINATOR &&
vacstmt->pstGlobalStatEx[vacstmt->tableidx].correlations[stats->attrs[0]->attnum - 1] != 0) {
analyze_compute_correlation(spec.nonnull_cnt,
0,
vacstmt->pstGlobalStatEx[vacstmt->tableidx].correlations[stats->attrs[0]->attnum - 1],
stats,
slot_idx);
}
* In multi-column statistics case, the first MCV is created with no-null values,
* in order to support better selectivity estimation of NULL-filtering with other
* predicates, we need collect another type of MCV that created with null values.
*/
if (computeMCV && spec.stats->num_attrs > 1 && !(u_sess->attr.attr_sql.enable_ai_stats &&
u_sess->attr.attr_sql.multi_stats_type == BAYESNET_OPTION)) {
analyze_compute_mcv(&slot_idx, tableName, &spec, true);
}
if (spec.stats->num_attrs > 1 && u_sess->attr.attr_sql.enable_ai_stats &&
u_sess->attr.attr_sql.multi_stats_type != MCV_OPTION) {
bool bayesnet_success = analyze_compute_bayesnet(&slot_idx, onerel, analyzemode, inh,
vacstmt, &spec, tableName);
if (!bayesnet_success && computeMCV &&
u_sess->attr.attr_sql.multi_stats_type != (int)ALL_OPTION) {
ereport(WARNING, (errmodule(MOD_AUTOVAC),
errmsg("Only NDV of multi-column statistics is created.")));
}
}
#ifndef ENABLE_MULTIPLE_NODES
if (u_sess->attr.attr_sql.enable_functional_dependency && spec.stats->num_attrs > 1) {
analyze_compute_dependencies(onerel, &slot_idx, tableName, &spec, stats);
}
#endif
if (log_min_messages > DEBUG1) {
dropSampleTable(tableName);
}
u_sess->analyze_cxt.is_under_analyze = false;
stats = spec.stats;
stats->stats_valid = true;
pfree_ext(tableName);
SetUserIdAndSecContext(save_userid, save_sec_context);
ereport(DEBUG1,(errmodule(MOD_AUTOVAC),
errmsg("ANALYZE computing statistics on attribute %s, total column number: %u.",
NameStr(stats->attrs[0]->attname),
stats->num_attrs)));
}
* Description: Get temp sample table name for normal or hdfs table.
*
* Parameters:
* @in analyzemode: the table type which collect statistics information
* @in tmpSampleTblNameList: list of temp table name
*
* Returns: const char *
*/
const char* get_sample_tblname(AnalyzeMode analyzemode, List* tmpSampleTblNameList)
{
const char* tmpSampleTableName = NULL;
if (analyzemode == ANALYZENORMAL) {
tmpSampleTableName = ((Value*)linitial(tmpSampleTblNameList))->val.str;
} else {
tmpSampleTableName = ((Value*)lsecond(tmpSampleTblNameList))->val.str;
}
return tmpSampleTableName;
}
* Description: Insert sample tuple into temp table on datanode under debugging for analyze.
*
* Parameters:
* @in type: the stage for datanode send sample to coordinator under debugging
* @in analyzemode: the table type which collect statistics information
* @in tmpSampleTblNameList: list of temp table name
* @out sampleTableOid: temp table oid
* @in tmpRelation: temp table relation
* @in tuple: sample tuple which datanode send to coordinator
*
* Returns: void
*/
static void analyze_tmptbl_debug_dn(AnalyzeTempTblDebugStage type, AnalyzeMode analyzemode, List* tmpSampleTblNameList,
Oid* sampleTableOid, Relation* tmpRelation, HeapTuple tuple, TupleConversionMap* tupmap)
{
if ((sampleTableOid == NULL) || (tmpRelation == NULL) || (tmpSampleTblNameList == NIL) ||
(default_statistics_target >= 0 && log_min_messages > DEBUG1)) {
return;
}
if (DebugStage_Begin == type) {
const char* tmpSampleTableName = get_sample_tblname(analyzemode, tmpSampleTblNameList);
*sampleTableOid = RelnameGetRelid(tmpSampleTableName);
if (OidIsValid(*sampleTableOid)) {
*tmpRelation = heap_open(*sampleTableOid, RowExclusiveLock);
}
} else if (DebugStage_Execute == type) {
if (OidIsValid(*sampleTableOid)) {
HeapTuple convert_tuple = tuple;
if (tupmap != NULL) {
convert_tuple = do_convert_tuple(tuple, tupmap);
}
(void)simple_heap_insert(*tmpRelation, convert_tuple);
CatalogUpdateIndexes(*tmpRelation, convert_tuple);
if (tupmap != NULL) {
tableam_tops_free_tuple(convert_tuple);
}
}
} else {
if (OidIsValid(*sampleTableOid)) {
heap_close(*tmpRelation, RowExclusiveLock);
*sampleTableOid = InvalidOid;
}
}
}
* Description: Check tupdesc of analyze's relation match with tupdesc of temp table or not.
* If encounter with ddl for [ALTER TABLE] after create temp table under analyzing,
* it identify doing ddl confilict with doing analyze,
* so we don't insert sample tuple into temp table any more.
* Parameters:
* @in onerel: the relation for analyze or vacuum
* @in temptbl_desc: the tuple desc for temp table
*
* Returns: TupleConversionMap*
*/
static TupleConversionMap* check_tmptbl_tupledesc(Relation onerel, TupleDesc temptbl_desc)
{
TupleConversionMap* tupmap = NULL;
MemoryContext oldcontext = CurrentMemoryContext;
PG_TRY();
{
tupmap = convert_tuples_by_name(CreateTupleDescCopy(onerel->rd_att),
temptbl_desc,
gettext_noop("attributes of relation for analyze does not match with temp table."));
}
PG_CATCH();
{
ErrorData* edata = NULL;
(void)MemoryContextSwitchTo(oldcontext);
edata = CopyErrorData();
FlushErrorState();
if (edata->sqlerrcode == ERRCODE_DATATYPE_MISMATCH) {
ereport(LOG,
(errmsg("The tupleDesc analyzed on %s is different from tupleDesc which temp table created.",
g_instance.attr.attr_common.PGXCNodeName),
errdetail("%s", edata->detail),
errhint("Do analyze again for the relation: %s.", NameStr(onerel->rd_rel->relname))));
FreeErrorData(edata);
} else {
ReThrowError(edata);
}
}
PG_END_TRY();
return tupmap;
}
* Return estimated tupe size (Byte) of the relation.
*/
int GetOneTupleSize(VacuumStmt* stmt, Relation rel)
{
int attr_cnt = 0;
int nindexes = 0;
AnlIndexData* indexdata = NULL;
bool hasindex = false;
Relation* Irel = NULL;
VacAttrStats** vacattrstats =
get_vacattrstats_by_vacstmt(rel, stmt, &attr_cnt, &nindexes, &indexdata, &hasindex, true, &Irel, false);
int total_width = get_total_width(vacattrstats, attr_cnt, nindexes, indexdata, rel);
int oneTupleSize = get_one_tuplesize(rel, total_width);
return oneTupleSize;
}
* Return the tuple count to be processed
*/
static double compute_com_size(VacuumStmt* vacstmt, Relation rel, int tableidx)
{
double com_size;
int64 workmem_allow_rows = 0;
int attr_cnt = 0;
int nindexes = 0;
int total_width = 0;
AnlIndexData* indexdata = NULL;
bool hasindex = false;
VacAttrStats** vacattrstats = NULL;
Relation* Irel = NULL;
* If it is absolute estimate, we use theorem based on the paper
* "Random sampling for histogram construction: how much is enough?"
*/
if (default_statistics_target > 0) {
int targetrows = 0;
com_size = (double)4 * default_statistics_target *
log(2 * vacstmt->pstGlobalStatEx[tableidx].totalRowCnts / 0.01) /
(ANALYZE_RELATIVE_ERROR * ANALYZE_RELATIVE_ERROR);
vacattrstats =
get_vacattrstats_by_vacstmt(rel, vacstmt, &attr_cnt, &nindexes, &indexdata, &hasindex, true, &Irel);
* Determine how many rows we need to sample according to user-defined target
* which is stored in pg_attribute.
* if we have never modified the default target, targetrows shold be 30000
*/
for (int i = 0; i < attr_cnt; i++) {
if (targetrows < vacattrstats[i]->minrows)
targetrows = vacattrstats[i]->minrows;
}
for (int j = 0; j < nindexes; j++) {
AnlIndexData* thisdata = &indexdata[j];
for (int i = 0; i < thisdata->attr_cnt; i++) {
if (targetrows < thisdata->vacattrstats[i]->minrows)
targetrows = thisdata->vacattrstats[i]->minrows;
}
}
* 1. if user-defined target rows is too low, we choose a better target rows accoding
* the paper "Random sampling for histogram construction: how much is enough?" by
* Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya
*/
com_size = Max(com_size, targetrows);
com_size = Max(com_size, DEFAULT_SAMPLE_ROWCNT);
com_size = Min(com_size, vacstmt->pstGlobalStatEx[tableidx].totalRowCnts);
total_width = get_total_width(vacattrstats, attr_cnt, nindexes, indexdata, rel);
workmem_allow_rows = get_workmem_allow_rows(rel, total_width);
* 1. sample rows size should not be more than work_mem
* 2. sample rows size should not be less than DEFAULT_SAMPLE_ROWCNT
*/
if (com_size > DEFAULT_SAMPLE_ROWCNT && com_size > workmem_allow_rows) {
com_size = Min(com_size, workmem_allow_rows);
com_size = Max(com_size, DEFAULT_SAMPLE_ROWCNT);
}
} else if (default_statistics_target < 0) {
com_size = ceil(vacstmt->pstGlobalStatEx[tableidx].totalRowCnts * (default_statistics_target * (-1) / 100.0));
com_size = com_size > DEFAULT_SAMPLE_ROWCNT ? com_size : vacstmt->pstGlobalStatEx[tableidx].totalRowCnts;
} else {
com_size = Min(vacstmt->pstGlobalStatEx[tableidx].totalRowCnts, DEFAULT_SAMPLE_ROWCNT);
}
com_size = Max(0, com_size);
return com_size;
}