/*
 * xlog.cpp
 *    openGauss transaction log manager
 *
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/gausskernel/storage/access/transam/xlog.cpp
 *
 * -------------------------------------------------------------------------
 */
#include "c.h"
#include "postgres.h"
#include "knl/knl_variable.h"
#include <ctype.h>
#include <signal.h>
#include <pthread.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#ifdef __USE_NUMA
#include <numa.h>
#endif
#include "access/cbmparsexlog.h"
#include "access/clog.h"
#include "access/csnlog.h"
#include "access/cstore_insert.h"
#include "access/double_write.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/multi_redo_api.h"
#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/ustore/undo/knl_uundoapi.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xloginsert.h"
#include "access/xlogreader.h"
#include "access/xlogutils.h"
#include "access/xlogdefs.h"
#include "access/hash.h"
#include "access/xlogproc.h"
#include "access/parallel_recovery/dispatcher.h"
#include "access/extreme_rto/page_redo.h"
#include "access/ondemand_extreme_rto/page_redo.h"
#include "commands/tablespace.h"
#include "commands/matview.h"
#include "catalog/catalog.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#ifdef ENABLE_BBOX
#include "gs_bbox.h"
#endif
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#ifdef PGXC
#include "pgxc/barrier.h"
#endif
#include "pgstat.h"
#include "port.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/postmaster.h"
#include "postmaster/pagewriter.h"
#include "postmaster/pagerepair.h"
#include "replication/logical.h"
#include "replication/bcm.h"
#include "replication/basebackup.h"
#include "replication/datareceiver.h"
#include "replication/datasender.h"
#include "replication/dataqueue.h"
#include "replication/dcf_data.h"
#include "replication/origin.h"
#include "replication/reorderbuffer.h"
#include "replication/replicainternal.h"
#include "replication/shared_storage_walreceiver.h"
#include "replication/ss_disaster_cluster.h"
#include "replication/slot.h"
#include "replication/snapbuild.h"
#include "replication/syncrep.h"
#include "replication/walreceiver.h"
#include "replication/walsender_private.h"
#include "replication/walsender.h"
#include "replication/walprotocol.h"
#include "replication/archive_walreceiver.h"
#include "replication/dcf_replication.h"
#include "storage/buf/bufmgr.h"
#include "storage/smgr/fd.h"
#include "storage/copydir.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr/segment.h"
#include "storage/smgr/smgr.h"
#include "storage/spin.h"
#include "storage/smgr/relfilenode.h"
#include "storage/lock/lwlock.h"
#include "storage/dorado_operation/dorado_fd.h"
#include "storage/xlog_share_storage/xlog_share_storage.h"
#include "storage/gs_uwal/gs_uwal.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
#include "utils/ps_status.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/timestamp.h"
#include "utils/inet.h"
#include "utils/atomic.h"
#include "pg_trace.h"
#include "gssignal/gs_signal.h"
#include "gstrace/gstrace_infra.h"
#include "gstrace/access_gstrace.h"
#include "postmaster/pagerepair.h"
#include <sched.h>
#include <utmpx.h>
#include <time.h>
#include "access/slru.h"
#ifdef ENABLE_MOT
#include "storage/mot/mot_fdw.h"
#endif
#ifndef ENABLE_MULTIPLE_NODES
#include "dcf_interface.h"
#endif
#ifdef ENABLE_HTAP
#include "access/htap/imcucache_mgr.h"
#endif
#include "replication/libpqwalreceiver.h"
#include "postmaster/barrier_preparse.h"
#include "access/redo_statistic.h"
#include "access/multi_redo_api.h"
#include "access/parallel_recovery/dispatcher.h"
#include "vectorsonic/vsonichash.h"
#include "ddes/dms/ss_reform_common.h"
#include "ddes/dms/ss_transaction.h"
#include "ddes/dms/ss_dms_recovery.h"
#include "ddes/dms/ss_dms_bufmgr.h"
#include "storage/file/fio_device.h"
/*
 * STATIC marks file-local helpers. In normal builds it expands to "static";
 * when ENABLE_UT is defined it expands to nothing, so those symbols are no
 * longer restricted to this file.
 */
#ifdef ENABLE_UT
#define STATIC
#else
#define STATIC static
#endif
/* Recovery command file names. NOTE(review): the rename from ".conf" to ".done" is presumed from the names; confirm against the recovery code. */
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
/* Signal/marker file names. NOTE(review): how each file is consumed is not visible in this part of the file. */
#define FAILOVER_SIGNAL_FILE "failover"
#define SWITCHOVER_SIGNAL_FILE "switchover"
#define SWITCHOVER_TIMEOUT_SIGNAL_FILE "switchover_timeout"
#define PRIMARY_SIGNAL_FILE "primary"
#define STANDBY_SIGNAL_FILE "standby"
#define CASCADE_STANDBY_SIGNAL_FILE "cascade_standby"
#define XLOG_SWITCH_HISTORY_FILE "switch.history"
/* Buffer length limits (bytes). */
#define MAX_PATH_LEN 1024
#define MAX_BACKUP_LEN 1024
/* Larger of A and B (A when they are equal). Beware: each argument may be evaluated twice. */
#define MAX(A, B) ((B) > (A) ? (B) : (A))
/* Shorthand for the instance-level enableIncrementalCheckpoint setting. */
#define ENABLE_INCRE_CKPT g_instance.attr.attr_storage.enableIncrementalCheckpoint
/* True when ReplConnArray[2] is configured and IS_DN_DUMMY_STANDYS_MODE() holds. */
#define RecoveryFromDummyStandby() (t_thrd.postmaster_cxt.ReplConnArray[2] != NULL && IS_DN_DUMMY_STANDYS_MODE())
#define MaxMacAddrList 10
#define BILLION 1000000000L
/* Numeric constants used by this file; the unit of each is presumed from its name. */
const int ONE_SECOND_TO_MICROSECOND = 1000000L;
const uint64 PAGE_SIZE_BYTES = 4096;
const uint64 XLOG_FLUSH_SIZE_INIT = 1024 * 1024;
const int SIZE_OF_UINT64 = 8;
const int SIZE_OF_UINT32 = 4;
const int SIZE_OF_TWO_UINT64 = 16;
const int XLOG_LSN_SWAP = 32;
const int XLOGFILENAMELEN = 24;
/* Textual names of the demote modes; DemoteModeNum is the number of entries in the array. */
const char *DemoteModeDescs[] = { "unknown", "smart", "fast", "immediate" };
const int DemoteModeNum = sizeof(DemoteModeDescs) / sizeof(char *);
static const int g_retryTimes = 3;
/* 64 WAL segments worth of bytes. */
static const uint64 REDO_SPEED_LOG_LEN = (XLogSegSize * 64);
static const int PG_TBLSPCS = 10;
/* Max-page-flush-LSN (MPFL) file: one BLCKSZ page per partition, opened O_RDWR | O_SYNC | O_DIRECT, owner read/write. */
static const int MPFL_PAGE_NUM = NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS;
static const int MPFL_FILE_SIZE = (BLCKSZ * MPFL_PAGE_NUM);
static const int MPFL_FILE_FLAG = (O_RDWR | O_SYNC | O_DIRECT | PG_BINARY);
static const mode_t MPFL_FILE_PERM = (S_IRUSR | S_IWUSR);
/* 1 MiB. */
static const Size MaxSendSizeBytes = 1048576;
/* Per-thread flag, false by default. NOTE(review): presumably set while replaying xlog written by an older version; confirm at the writers. */
THR_LOCAL bool redo_oldversion_xlog = false;
static const int ONE_SECOND = 1000000;
const int UWAL_SLEEP_TIME = 10000;
/*
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
 * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
 *
 * The value is read from the session's CheckPointSegments setting each time
 * the macro is expanded.
 */
#define XLOGfileslop (2 * u_sess->attr.attr_storage.CheckPointSegments + 1)
* GUC support
*/
struct config_enum_entry sync_method_options[] = {
{ "fsync", SYNC_METHOD_FSYNC, false },
#ifdef HAVE_FSYNC_WRITETHROUGH
{ "fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false },
#endif
#ifdef HAVE_FDATASYNC
{ "fdatasync", SYNC_METHOD_FDATASYNC, false },
#endif
#ifdef OPEN_SYNC_FLAG
{ "open_sync", SYNC_METHOD_OPEN, false },
#endif
#ifdef OPEN_DATASYNC_FLAG
{ "open_datasync", SYNC_METHOD_OPEN_DSYNC, false },
#endif
{ NULL, 0, false }
};
/*
 * File-level globals describing the latest valid record (position, CRC and
 * length), all starting out as "invalid"/zero.
 * NOTE(review): latestRecordCrc is a pg_crc32 but is initialised with
 * InvalidXLogRecPtr; presumably both are 0 - confirm.
 */
XLogRecPtr latestValidRecord = InvalidXLogRecPtr;
pg_crc32 latestRecordCrc = InvalidXLogRecPtr;
uint32 latestRecordLen = 0;
/* NOTE(review): presumably the segment number up to which the primary has removed xlog; the writers are not visible here. */
XLogSegNo XlogRemoveSegPrimary = InvalidXLogSegPtr;
/* Transaction id values recorded after recovery (no explicit initialiser). */
TransactionId NextXidAfterReovery;
TransactionId OldestXidAfterRecovery;
* We maintain an image of pg_control in shared memory.
*/
THR_LOCAL ControlFileData *ControlFile = NULL;
static void remove_xlogtemp_files(void);
static bool validate_parse_delay_ddl_file(DelayDDLRange *delayRange);
static bool write_delay_ddl_file(const DelayDDLRange &delayRange, bool onErrDelete);
extern void CalculateLocalLatestSnapshot(bool forceCalc);
extern int ock_uwal_append(IN UwalAppendParam *param, OUT uint64_t *offset, OUT void *result);
/*
 * Calculate the amount of space left on the page after 'endptr'.  A pointer
 * that sits exactly on a page boundary yields 0.  Beware multiple evaluation!
 */
#define INSERT_FREESPACE(endptr) (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/*
 * Index of the WAL buffer that follows 'idx', wrapping to 0 after the last
 * buffer (XLogCacheBlck is the highest valid index).  Beware multiple
 * evaluation!
 */
#define NextBufIdx(idx) (((idx) == t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
/*
 * Added for XLOG scaling
 *
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 *
 * XLogRecEndPtrToBufIdx is the same, but a pointer to the first byte of a
 * page is taken to mean the previous page.
 * NOTE(review): XLogRecEndPtrToBufIdx is not defined next to this comment;
 * confirm whether it still exists elsewhere.
 *
 * The buffer count is XLogCacheBlck + 1, so pages map onto buffers
 * round-robin.
 */
#define XLogRecPtrToBufIdx(recptr) (((recptr) / XLOG_BLCKSZ) % (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1))
/*
 * Return the previous xlblock[i] mapped to the same page: 'recptr' moved
 * back by the total size of the WAL buffer cache
 * (XLOG_BLCKSZ * (XLogCacheBlck + 1)), or 0 when 'recptr' is smaller than
 * that size.  Beware multiple evaluation!
 */
#define X_LOG_PREV_BLOCK_IN_BUF_PAGE(recptr)                                                     \
    (((recptr) < (XLogRecPtr)(XLOG_BLCKSZ * (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1))) \
         ? 0                                                                                     \
         : (XLogRecPtr)((recptr)-XLOG_BLCKSZ * (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1)))
/*
 * These are the number of bytes in a WAL page and segment usable for WAL data.
 *
 * Every page gives up a short page header; a segment additionally gives up
 * the difference between one long header and one short header.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
#define UsableBytesInSegment \
    ((XLogSegSize / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
/*
 * Add xlog reader private structure for page read.
 */
typedef struct XLogPageReadPrivate {
    int emode;          /* error level; presumably an elog level used when reporting read failures */
    bool fetching_ckpt; /* presumably true while the record being fetched is a checkpoint record */
    bool randAccess;    /* presumably true when the read is not sequential to the previous one */
} XLogPageReadPrivate;
* xlog switch information for forcing failover.
*/
typedef struct XLogSwitchInfo {
uint32 termId;
uint64 switchLsn;
uint64 catchLsn;
uint64 oldestRunningXid;
uint64 latestCompletedXid;
uint32 invaildPageCnt;
uint32 imcompleteActionCnt;
} XLogSwitchInfo;
/* Global flag, initially false. NOTE(review): presumably set once pending transactions have been recovered; the writer is not visible here. */
volatile bool IsPendingXactsRecoveryDone = false;
/*
 * Forward declarations.  Functions declared "static" (or STATIC) are local
 * to this file; the remaining prototypes have external linkage.
 */
/* WAL flush helpers. */
static void XLogFlushCore(XLogRecPtr writeRqstPtr);
static void XLogSelfFlush(void);
static void XLogSelfFlushWithoutStatus(int numHitsOnStartPage, XLogRecPtr currPos, int currLRC);
/* WAL archive status helpers. */
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(XLogSegNo segno);
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
static bool XLogArchiveIsReady(const char *xlog);
static void XLogArchiveCleanup(const char *xlog);
/* Archive recovery control. */
static void readRecoveryCommandFile(void);
static XLogSegNo GetOldestXLOGSegNo(const char *workingPath);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endSegNo);
static bool recoveryStopsHere(XLogReaderState *record, bool *includeThis);
static void recoveryPausesHere(void);
#ifndef ENABLE_MULTIPLE_NODES
static bool RecoveryApplyDelay(const XLogReaderState *record);
#endif
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(bool DBStateShutdown);
static void XLogReportParameters(void);
static void LocalSetXLogInsertAllowed(void);
/* Checkpoint and segment retention helpers. */
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags, bool doFullCheckpoint);
static XlogKeeper* KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo, XLogRecPtr curInsert, bool getkeeper);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
/* Templated on isGroupInsert; 'proc' defaults to NULL. */
template <bool isGroupInsert>
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic, PGPROC *proc = NULL);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible);
#ifndef ENABLE_MULTIPLE_NODES
static bool XLogWritePaxos(XLogRecPtr WritePaxosRqst);
#endif
/* UWAL-related routines; the non-static ones are visible outside this file. */
static void XLogWriteUwal(XLogRecPtr UwalWriteRqst);
int UwalXLogPageRead(XLogRecPtr targetPagePtr, char *readBuf, XLogRecPtr targetRecPtr);
static bool UwalAcquire();
static void SetUwalReadInfo(XLogRecPtr targetPagePtr, XLogRecPtr writeUwalOffSet, uint32 targetPageOff);
void XlogArchUwal(XLogRecPtr archRqstPtr);
int GsUwalCreate(uint64_t startOffset);
int GsUwalWrite(int nBytes, char *buf, uint64_t writeOffset);
int GsUwalRead(UwalId *id, XLogRecPtr targetPagePtr, char *readBuf, uint64_t readlen);
int GsUwalQuery(UwalId *id, UwalBaseInfo *info);
static void XLogFlushUwal(XLogRecPtr writeRqstPtr);
/* WAL segment file management. */
static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance,
bool use_lock, const char *xlog_dir);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk);
static void XLogFileClose(void);
static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize);
static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnerror);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr ednptr);
static void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI);
static void UpdateLastRemovedPtr(const char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
/* Record reading and timeline helpers. */
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt);
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
/* Control file helpers; WriteControlFile uses STATIC so a unit-test build can reach it. */
STATIC void WriteControlFile(void);
static void RecoverControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForPrimaryTrigger(void);
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
/* Backup support: callbacks take (int code, Datum arg). */
static void pg_start_backup_callback(int code, Datum arg);
static void DoAbortBackupCallback(int code, Datum arg);
static void AbortExcluseBackupCallback(int code, Datum arg);
static bool read_tablespace_map(List **tablespaces);
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby,
bool *backupFromRoach);
static void check_roach_backup_condition(bool backup_started_in_recovery, const char *backupidstr);
static int get_sync_bit(int method);
static void ResetSlotLSNEndRecovery(StringInfo slotname);
static void ShutdownReadFileFacility(void);
static void SetDummyStandbyEndRecPtr(XLogReaderState *xlogreader);
/* WAL insertion: space reservation and record copy. */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos,
XLogRecPtr EndPos, int32 *const currlrc_ptr);
static void ReserveXLogInsertLocation(uint32 size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr,
int32 *const currlrc_ptr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static void StartSuspendWalInsert(int32 *const lastlrc_ptr);
static void StopSuspendWalInsert(int32 lastlrc);
void ResetRecoveryDelayLatch();
void WaitRecoveryDelayLatch(long waitTime);
/* Templated on isGroupInsert; 'proc' defaults to NULL. */
template <bool isGroupInsert>
static char *GetXLogBuffer(XLogRecPtr ptr, PGPROC *proc = NULL);
/* Conversions between "usable byte positions" and XLogRecPtr values. */
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
/* 'need_flush' defaults to false. */
static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn, bool need_flush = false);
/* Shared-storage xlog readers. */
static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path);
static int SSDoradoReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path);
void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file);
TermFileData GetTermFileDataAndClear(void);
/* Max-page-flush-LSN (mpfl) file helpers. */
XLogRecPtr mpfl_read_max_flush_lsn();
void mpfl_new_file();
void mpfl_ulink_file();
bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset);
/*
 * Group-insert helpers are compiled only when __aarch64__ is defined.  The
 * conditional section opened here is closed further down in the file.
 */
#ifdef __aarch64__
static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn, bool need_flush);
static void XLogInsertRecordNolock(XLogRecData *rdata, PGPROC *proc, XLogRecPtr StartPos, XLogRecPtr EndPos,
XLogRecPtr PrevPos, int32 *const currlrc_ptr);
static void ReserveXLogInsertByteLocation(uint32 size, uint32 lastRecordSize, uint64 *StartBytePos, uint64 *EndBytePos,
uint64 *PrevBytePos, int32 *const currlrc_ptr);
static void CopyXLogRecordToWALForGroup(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos,
PGPROC *proc, int32* const currlrc_ptr);
/*
 * Walk the group list that starts at proc number 'wakeidx' and release every
 * member: unlink it from the list, then clear its xlogGroupMember flag.
 * With ENABLE_MULTIPLE_NODES every member other than the calling thread is
 * additionally posted on its semaphore.
 */
static void WakeUpProc(uint32 wakeidx)
{
    uint32 cursor = wakeidx;

    while (cursor != INVALID_PGPROCNO) {
        PGPROC *member = g_instance.proc_base_all_procs[cursor];

        /* Pick up the successor before the link is reset. */
        cursor = pg_atomic_read_u32(&member->xlogGroupNext);
        pg_atomic_write_u32(&member->xlogGroupNext, INVALID_PGPROCNO);

        /* Everything written so far must be visible before the flag drops. */
        pg_memory_barrier();
        member->xlogGroupMember = false;

#ifdef ENABLE_MULTIPLE_NODES
        if (member == t_thrd.proc) {
            /* The calling thread is not posted on its own semaphore. */
            continue;
        }
        PGSemaphoreUnlock(&member->sem);
#else
        pg_write_barrier();
#endif
    }
}
/*
 * Reserve WAL space for the group leader's own record and copy the record
 * into the reserved area.
 *
 * leader:           the leader's PGPROC; its xlogGroup* fields describe the record.
 * end_byte_pos_ptr: if not NULL, receives the end byte position of the reservation.
 * currlrc_ptr:      if not NULL, receives the LRC obtained with the reservation.
 *
 * When the record has to be skipped because of the full-page-write check,
 * the function returns early: xlogGroupReturntRecPtr is set to
 * InvalidXLogRecPtr and neither output pointer is written.
 */
static void XLogInsertRecordGroupLeader(PGPROC *leader, uint64 *end_byte_pos_ptr, int32 *const currlrc_ptr)
{
    uint32 record_size;
    uint64 prev_byte_pos = 0;
    uint64 start_byte_pos = 0;
    uint64 end_byte_pos = 0;
    int32 current_lrc = 0;
    uint64 dirty_page_queue_lsn = 0;

    /* Refresh the leader's view of the shared redo pointer. */
    *leader->xlogGroupRedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr;

    /*
     * In some cases, some xlog records about full page write are not needed to write to buffer,
     * so reserving space for these xlog records is not needed.
     */
    *leader->xlogGroupDoPageWrites = (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.fullPageWrites ||
                                      t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites);

    bool flag = (leader->xlogGroupfpw_lsn != InvalidXLogRecPtr) &&
                (leader->xlogGroupfpw_lsn <= *leader->xlogGroupRedoRecPtr) && *leader->xlogGroupDoPageWrites;
    if (unlikely(flag)) {
        leader->xlogGroupReturntRecPtr = InvalidXLogRecPtr;
        leader->xlogGroupIsFPW = true;
    }

    record_size = MAXALIGN(((XLogRecord *)(leader->xlogGrouprdata->data))->xl_tot_len);
    Assert(record_size != 0);

    if (unlikely(leader->xlogGroupIsFPW)) {
        /* Skipped record: clear the marker and leave the outputs untouched. */
        leader->xlogGroupIsFPW = false;
        return;
    } else {
        if (likely(record_size != 0)) {
            ReserveXLogInsertByteLocation(record_size, record_size, &start_byte_pos, &end_byte_pos, &prev_byte_pos,
                                          &current_lrc);
            dirty_page_queue_lsn = start_byte_pos;
        }

        XLogInsertRecordNolock(leader->xlogGrouprdata, leader, XLogBytePosToRecPtr(start_byte_pos),
                               XLogBytePosToEndRecPtr(
                                   start_byte_pos +
                                   MAXALIGN(((XLogRecord *)(leader->xlogGrouprdata->data))->xl_tot_len)),
                               XLogBytePosToRecPtr(prev_byte_pos), &current_lrc);
    }

    if (dirty_page_queue_lsn != 0) {
        update_dirty_page_queue_rec_lsn(XLogBytePosToRecPtr(dirty_page_queue_lsn));
    }

    if (end_byte_pos_ptr != NULL) {
        *end_byte_pos_ptr = end_byte_pos;
    }

    if (currlrc_ptr != NULL) {
        *currlrc_ptr = current_lrc;
    }
}
/*
 * Insert the records of all followers in a group on their behalf.
 *
 * The list is walked twice, from 'head' until the leader's own proc number
 * is reached.  The first pass sums the aligned sizes of the records that
 * must be written, so that one contiguous range can be reserved.  The second
 * pass copies each record into its slice of that range, in list order.
 * Followers whose record is skipped by the full-page-write check are marked
 * in the first pass and un-marked (without inserting) in the second.
 *
 * leader:           the group leader's PGPROC (terminates the walk).
 * head:             proc number of the first list entry.
 * end_byte_pos_ptr: if not NULL, receives the end byte position of the reservation
 *                   (0 when nothing was reserved).
 * currlrc_ptr:      if not NULL, receives the LRC obtained with the reservation.
 */
static void XLogInsertRecordGroupFollowers(PGPROC *leader, const uint32 head, uint64 *end_byte_pos_ptr,
                                           int32 *const currlrc_ptr)
{
    uint32 nextidx;
    uint32 total_size = 0;
    uint32 record_size = 0;
    PGPROC *follower = NULL;
    uint64 start_byte_pos = 0;
    uint64 end_byte_pos = 0;
    uint64 prev_byte_pos = 0;
    int32 current_lrc = 0;
    uint64 dirty_page_queue_lsn = 0;

    /* Pass 1: decide which records are written and total their sizes. */
    nextidx = head;
    while (nextidx != (uint32)(leader->pgprocno)) {
        follower = g_instance.proc_base_all_procs[nextidx];
        *follower->xlogGroupRedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr;

        /*
         * In some cases, some xlog records about full page write are not needed to
         * write to buffer, so reserving space for these xlog records is not needed.
         */
        *follower->xlogGroupDoPageWrites = (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.fullPageWrites ||
                                            t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites);

        bool flag = (follower->xlogGroupfpw_lsn != InvalidXLogRecPtr) &&
                    (follower->xlogGroupfpw_lsn <= *follower->xlogGroupRedoRecPtr) && *follower->xlogGroupDoPageWrites;
        if (unlikely(flag)) {
            follower->xlogGroupReturntRecPtr = InvalidXLogRecPtr;
            follower->xlogGroupIsFPW = true;
            nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
            continue;
        }

        record_size = MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len);
        Assert(record_size != 0);
        total_size += record_size;
        nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
    }

    /*
     * Reserve the whole range at once.  record_size still holds the size of
     * the last record counted above; it is passed as the "last record size".
     */
    if (likely(total_size != 0)) {
        ReserveXLogInsertByteLocation(total_size, record_size, &start_byte_pos, &end_byte_pos, &prev_byte_pos,
                                      &current_lrc);
        dirty_page_queue_lsn = start_byte_pos;
    }

    /* Pass 2: copy each record into its slice of the reserved range. */
    nextidx = head;
    while (nextidx != (uint32)(leader->pgprocno)) {
        follower = g_instance.proc_base_all_procs[nextidx];

        if (unlikely(follower->xlogGroupIsFPW)) {
            nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
            follower->xlogGroupIsFPW = false;
            continue;
        }

        XLogInsertRecordNolock(follower->xlogGrouprdata, follower, XLogBytePosToRecPtr(start_byte_pos),
                               XLogBytePosToEndRecPtr(
                                   start_byte_pos +
                                   MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len)),
                               XLogBytePosToRecPtr(prev_byte_pos), &current_lrc);

        /* The record just written becomes the "previous" record of the next one. */
        prev_byte_pos = start_byte_pos;
        start_byte_pos += MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len);
        nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
    }

    if (dirty_page_queue_lsn != 0) {
        update_dirty_page_queue_rec_lsn(XLogBytePosToRecPtr(dirty_page_queue_lsn));
    }

    if (end_byte_pos_ptr != NULL) {
        *end_byte_pos_ptr = end_byte_pos;
    }

    if (currlrc_ptr != NULL) {
        *currlrc_ptr = current_lrc;
    }
}
static void XLogReportCopyLocation(const int32 current_lrc, const uint64 end_byte_pos, bool need_flush)
{
bool stop = true;
uint32 current_entry =
current_lrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT() - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
if (g_instance.wal_cxt.isWalWriterUp != 0) {
do {
stop = (status_entry_ptr->LRC == current_lrc && (status_entry_ptr->endLSN == 0));
} while (!stop);
} else {
while (status_entry_ptr->LRC != current_lrc) {
XLogSelfFlush();
}
}
XLogRecPtr lsn = XLogBytePosToRecPtr(end_byte_pos);
if (need_flush) {
lsn |= WAL_COMMIT_BIT;
}
(void)pg_atomic_exchange_u64(&status_entry_ptr->endLSN, lsn);
}
* @Description: Insert an XLOG record represented by an already-constructed chain of data
* chunks. some threads in one xlog insert lock can be inserted in a group.
* @in rdata: the xlog data.
* @in fpw_lsn: the LSN of full page write.
* @return: the LSN of the insert end position.
*/
static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn, bool need_flush)
{
PGPROC *proc = t_thrd.proc;
uint32 head = 0;
uint32 nextidx = 0;
int32 current_lrc_leader = 0;
int32 current_lrc = 0;
uint64 end_byte_pos_leader = 0;
uint64 end_byte_pos = 0;
if (unlikely(!XLogInsertAllowed())) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("cannot make new WAL entries during recovery")));
}
if (SS_DORADO_STANDBY_CLUSTER) {
ereport(FATAL, (errmsg("SS dorado standby cluster cannot insert XLOG entries")));
}
START_CRIT_SECTION();
proc->xlogGroupMember = true;
proc->xlogGrouprdata = rdata;
proc->xlogGroupfpw_lsn = fpw_lsn;
proc->xlogGroupProcLastRecPtr = &t_thrd.xlog_cxt.ProcLastRecPtr;
proc->xlogGroupXactLastRecEnd = &t_thrd.xlog_cxt.XactLastRecEnd;
proc->xlogGroupCurrentTransactionState = GetCurrentTransactionState();
proc->xlogGroupRedoRecPtr = &t_thrd.xlog_cxt.RedoRecPtr;
proc->xlogGroupLogwrtResult = t_thrd.xlog_cxt.LogwrtResult;
proc->xlogGroupTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
proc->xlogGroupDoPageWrites = &t_thrd.xlog_cxt.doPageWrites;
int groupnum = (proc->pgprocno / g_instance.shmem_cxt.numaNodeNum) % (g_instance.wal_cxt.num_locks_in_group);
nextidx = pg_atomic_read_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst);
while (true) {
pg_atomic_write_u32(&proc->xlogGroupNext, nextidx);
pg_write_barrier();
if (pg_atomic_compare_exchange_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst,
&nextidx, (uint32)proc->pgprocno)) {
break;
}
}
* If the list was not empty, the leader will insert all xlog in the same xlog insert slot.
* It is impossible to have followers without a leader because the first process that
* has added itself to the list will always have nextidx as INVALID_PGPROCNO.
*/
if (nextidx != INVALID_PGPROCNO) {
int extra_waits = 0;
for (;;) {
#ifdef ENABLE_MULTIPLE_NODES
PGSemaphoreLock(&proc->sem, false);
if (!proc->xlogGroupMember) {
break;
}
#else
if (extra_waits % FOLLOWER_TRIGER_SLEEP_LOOP_COUNT == 0 && extra_waits > 0) {
pg_usleep(FOLLOWER_SLEEP_USECS);
}
pg_read_barrier();
if (!proc->xlogGroupMember) {
break;
} else {
(void)sched_yield();
}
#endif
extra_waits++;
}
#ifdef ENABLE_MULTIPLE_NODES
while (extra_waits-- > 0) {
PGSemaphoreUnlock(&proc->sem);
}
#endif
END_CRIT_SECTION();
* We needs a memory barrier between reading "xlogGroupMember" and "xlogGroupReturntRecPtr'
* in case of memory reordering in relaxed memory model like ARM.
*/
pg_memory_barrier();
Assert(pg_atomic_read_u32(&proc->xlogGroupNext) == INVALID_PGPROCNO);
return proc->xlogGroupReturntRecPtr;
}
PGPROC *leader = t_thrd.proc;
XLogInsertRecordGroupLeader(leader, &end_byte_pos_leader, ¤t_lrc_leader);
if (end_byte_pos_leader != 0) {
XLogReportCopyLocation(current_lrc_leader, end_byte_pos_leader, need_flush);
}
* Clear the list of processes waiting for group xlog insert, saving a pointer to the head of the list.
* Trying to pop elements one at a time could lead to an ABA problem.
*/
head = pg_atomic_exchange_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst,
INVALID_PGPROCNO);
bool has_follower = (head != (uint32)(leader->pgprocno));
if (has_follower) {
XLogInsertRecordGroupFollowers(leader, head, &end_byte_pos, ¤t_lrc);
}
* Wake all waiting threads up.
*/
WakeUpProc(head);
* Examine the entry's LRC and status all together to make sure the writer already
* finishes updating this entry and this entry is now owned by this thread
*
* If the status entry is owned by this thread, update it with the copied xlog record's
* end LSN and set the WAL copy status to WAL_COPIED to signal copy completion as well
*/
if (has_follower && end_byte_pos != 0) {
XLogReportCopyLocation(current_lrc, end_byte_pos, true);
}
pg_write_barrier();
if (g_instance.wal_cxt.isWalWriterSleeping) {
pthread_mutex_lock(&g_instance.wal_cxt.criticalEntryMutex);
pthread_cond_signal(&g_instance.wal_cxt.criticalEntryCV);
pthread_mutex_unlock(&g_instance.wal_cxt.criticalEntryMutex);
}
END_CRIT_SECTION();
return proc->xlogGroupReturntRecPtr;
}
* @Description: Insert an XLOG record represented by an already-constructed chain of data
* chunks. Becaus of the group insert mode, the xlog insert lock is not needed.
* @in rdata: the xlog data.
* @in proc: the waiting proc.
* @in StartPos: the start LSN postion.
* @in EndPos: the end LSN postion.
* @in PrevPos: the prev LSN postion.
*/
static void XLogInsertRecordNolock(XLogRecData *rdata, PGPROC *proc, XLogRecPtr StartPos, XLogRecPtr EndPos,
XLogRecPtr PrevPos, int32 *const currlrc_ptr)
{
XLogRecord *rechdr = (XLogRecord *)rdata->data;
rechdr->xl_prev = PrevPos;
pg_crc32c rdata_crc;
Assert(XLByteLT(PrevPos, StartPos));
Assert(rdata->len >= SizeOfXLogRecord);
START_CRIT_SECTION();
rdata_crc = ((XLogRecord *)rechdr)->xl_crc;
COMP_CRC32C(rdata_crc, (XLogRecord *)rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc);
((XLogRecord *)rechdr)->xl_crc = rdata_crc;
* All the record data, including the header, is now ready to be
* inserted. Copy the record in the space reserved.
*/
CopyXLogRecordToWALForGroup(rechdr->xl_tot_len, rdata, StartPos, EndPos, proc, currlrc_ptr);
CopyTransactionIdLoggedIfAny((TransactionState)proc->xlogGroupCurrentTransactionState);
END_CRIT_SECTION();
if (unlikely(StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)) {
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
if (t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write < EndPos) {
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndPos;
}
*((XLogwrtResult *)proc->xlogGroupLogwrtResult) = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ", (uint32)(EndPos >> 32), (uint32)EndPos);
xlog_outrec(&buf, rechdr);
if (rdata->data != NULL) {
appendStringInfo(&buf, " - ");
RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
}
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
*proc->xlogGroupProcLastRecPtr = StartPos;
*proc->xlogGroupXactLastRecEnd = EndPos;
proc->xlogGroupReturntRecPtr = EndPos;
return;
}
* @Description: Reserves the right amount of space for a given size from the WAL.
* already-reserved area in the WAL. The StartBytePos, EndBytePos and PrevBytePos
* are stored as "usable byte positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
* @in size: the size for right amount of space.
* @in lastRecordSize: the last record size in the group.
* @out StartBytePos: the start position of the WAL.
* @out EndBytePos: the end position of the WAL.
* @out PrevBytePos: the previous position of the WAL.
*/
static void ReserveXLogInsertByteLocation(uint32 size, uint32 lastRecordSize, uint64 *StartBytePos, uint64 *EndBytePos,
uint64 *PrevBytePos, int32* const currlrc_ptr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
size = MAXALIGN(size);
Assert(size > SizeOfXLogRecord);
* The duration the spinlock needs to be held is minimized by minimizing
* the calculations that have to be done while holding the lock. The
* current tip of reserved WAL is kept in CurrBytePos, as a byte position
* that only counts "usable" bytes in WAL, that is, it excludes all WAL
* page headers. The mapping between "usable" byte positions and physical
* positions (XLogRecPtrs) can be done outside the locked region, and
* because the usable byte position doesn't include any headers, reserving
* X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
union Union128 compare;
union Union128 exchange;
union Union128 current;
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
Assert(sizeof(Insert->CurrBytePos) == SIZE_OF_UINT64);
Assert(sizeof(Insert->PrevByteSize) == SIZE_OF_UINT32);
Assert(sizeof(Insert->CurrLRC) == SIZE_OF_UINT32);
loop:
* |CurrBytePos |bytePos | CurrLRC
* ------------------------+------------ +-----------
* |64 bits |32 bits |32 bits
*/
* Suspend WAL insert threads when currlrc equals WAL_COPY_SUSPEND
*/
if (unlikely(compare.struct128.LRC == WAL_COPY_SUSPEND)) {
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
goto loop;
}
exchange.struct128.currentBytePos = compare.struct128.currentBytePos + size;
exchange.struct128.byteSize = lastRecordSize;
exchange.struct128.LRC = (compare.struct128.LRC + 1) & 0x7FFFFFFF;
current.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos, compare.value,
exchange.value);
if (!UINT128_IS_EQUAL(compare.value, current.value)) {
UINT128_COPY(compare.value, current.value);
goto loop;
}
*currlrc_ptr = compare.struct128.LRC;
*StartBytePos = compare.struct128.currentBytePos;
*EndBytePos = exchange.struct128.currentBytePos;
*PrevBytePos = compare.struct128.currentBytePos - compare.struct128.byteSize;
#else
loop1:
SpinLockAcquire(&Insert->insertpos_lck);
if (unlikely(Insert->CurrLRC == WAL_COPY_SUSPEND)) {
SpinLockRelease(&Insert->insertpos_lck);
goto loop1;
}
*currlrc_ptr = Insert->CurrLRC;
*StartBytePos = Insert->CurrBytePos;
*EndBytePos = Insert->CurrBytePos + size;
*PrevBytePos = Insert->CurrBytePos - Insert->PrevByteSize;
Insert->CurrLRC = (Insert->CurrLRC + 1) & 0x7FFFFFFF;
Insert->PrevByteSize = lastRecordSize;
Insert->CurrBytePos = *EndBytePos;
SpinLockRelease(&Insert->insertpos_lck);
#endif
}
* @Description: In xlog group insert mode, copy a WAL record to an
* already-reserved area in the WAL.
* @in write_len: the length of xlog data.
* @in rdata: the xlog data.
* @in StartPos: the start LSN.
* @in EndPos: the end LSN.
* @in proc: the waiting proc.
*/
static void CopyXLogRecordToWALForGroup(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos,
PGPROC *proc, int32 *const currlrc_ptr)
{
char *currpos = NULL;
uint32 freespace;
int written;
XLogRecPtr CurrPos;
XLogPageHeader pagehdr;
errno_t errorno = EOK;
* The following 2 variables are used to handle large records
*/
uint32 startBufPageIdx = XLogRecPtrToBufIdx(StartPos);
int numHitsOnStartPage = 1;
CurrPos = StartPos;
currpos = GetXLogBuffer<true>(CurrPos, proc);
freespace = INSERT_FREESPACE(CurrPos);
* there should be enough space for at least the first field (xl_tot_len)
* on this page.
*/
Assert(freespace >= sizeof(uint32));
written = 0;
while (rdata != NULL) {
char *rdata_data = rdata->data;
uint32 rdata_len = rdata->len;
while (rdata_len > freespace) {
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, freespace);
securec_check(errorno, "", "");
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
* We hit the page we start from within the same copy. We must initiate
* a flush without updating WAL copy status entry or we will be stuck
*/
if (unlikely(XLogRecPtrToBufIdx(CurrPos) == startBufPageIdx)) {
numHitsOnStartPage++;
XLogSelfFlushWithoutStatus(numHitsOnStartPage, CurrPos, *currlrc_ptr);
}
* Get pointer to beginning of next page, and set the xlp_rem_len
* in the page header. Set XLP_FIRST_IS_CONTRECORD.
*
* It's safe to set the contrecord flag and xlp_rem_len without a
* lock on the page. All the other flags were already set when the
* page was initialized, in AdvanceXLInsertBuffer, and we're the
* only backend that needs to set the contrecord flag.
*/
currpos = GetXLogBuffer<true>(CurrPos, proc);
pagehdr = (XLogPageHeader)currpos;
pagehdr->xlp_rem_len = write_len - written;
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
if (CurrPos % XLogSegSize == 0) {
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
} else {
CurrPos += SizeOfXLogShortPHD;
currpos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, rdata_len);
securec_check(errorno, "", "");
currpos += rdata_len;
CurrPos += rdata_len;
freespace -= rdata_len;
written += rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
CurrPos = MAXALIGN(CurrPos);
if (SECUREC_UNLIKELY(CurrPos != EndPos)) {
ereport(PANIC, (errmsg("space reserved for WAL record does not match what was written")));
}
pgstat_report_waitevent_count(WAIT_EVENT_WAL_BUFFER_ACCESS);
}
#endif
/*
 * Raise the min_recovery_point kept in the redo statistics to 'lsn'.
 * The stored value is left alone unless it is strictly less than 'lsn'.
 */
inline void SetMinRecoverPointForStats(XLogRecPtr lsn)
{
    if (!XLByteLT(g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point, lsn)) {
        return;
    }
    g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point = lsn;
}
* @Description: Insert an XLOG record represented by an already-constructed chain of data
* chunks. The action is the same as the function XLogInsertRecord.
* @in rdata: the xlog data.
* @in fpw_lsn: the LSN of full page write.
* @in isupgrade: whether this xlog insert is for upgrade.
* @return: the LSN of the insert end position.
*/
static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn, bool need_flush)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
XLogRecord *rechdr = (XLogRecord *)rdata->data;
pg_crc32c rdata_crc;
bool inserted = false;
bool stop = true;
XLogRecPtr StartPos = InvalidXLogRecPtr;
XLogRecPtr EndPos = InvalidXLogRecPtr;
int32 currlrc = 0;
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && rechdr->xl_info == XLOG_SWITCH);
Assert(rdata->len >= SizeOfXLogRecord);
if (!XLogInsertAllowed()) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("cannot make new WAL entries during recovery")));
}
if (SS_DORADO_STANDBY_CLUSTER) {
ereport(FATAL, (errmsg("SS dorado standby cluster cannot insert XLOG entries")));
}
*
* We have now done all the preparatory work we can without holding a
* lock or modifying shared state. From here on, inserting the new WAL
* record to the shared WAL buffer cache is a two-step process:
*
* 1. Reserve the right amount of space from the WAL. The current head of
* reserved space is kept in Insert->CurrBytePos, and is protected by
* insertpos_lck.
*
* 2. Copy the record to the reserved WAL space. This involves finding the
* correct WAL buffer containing the reserved space, and copying the
* record in place. This can be done concurrently in multiple processes.
*
* To keep track of which insertions are still in-progress, each concurrent
* inserter acquires an insertion lock. In addition to just indicating that
* an insertion is in progress, the lock tells others how far the inserter
* has progressed. There is a small fixed number of insertion locks,
* determined by num_xloginsert_locks. When an inserter crosses a page
* boundary, it updates the value stored in the lock to the how far it has
* inserted, to allow the previous buffer to be flushed.
*
* Holding onto an insertion lock also protects RedoRecPtr and
* fullPageWrites from changing until the insertion is finished.
*
* Step 2 can usually be done completely in parallel. If the required WAL
* page is not initialized yet, you have to grab WALBufMappingLock to
* initialize it, but the WAL writer tries to do that ahead of insertions
* to avoid that from happening in the critical path.
*
* ----------
*/
START_CRIT_SECTION();
if (isLogSwitch) {
StartSuspendWalInsert(&currlrc);
}
* Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
* If so, may have to go back and have the caller recompute everything.
* This can only happen just after a checkpoint, so it's better to be slow
* in this case and fast otherwise.
*
* If we aren't doing full-page writes then RedoRecPtr doesn't actually
* affect the contents of the XLOG record, so we'll update our local copy
* but not force a recomputation. (If doPageWrites was just turned off,
* we could recompute the record without full pages, but we choose not to
* bother.)
*/
if (t_thrd.xlog_cxt.RedoRecPtr != Insert->RedoRecPtr) {
Assert(t_thrd.xlog_cxt.RedoRecPtr < Insert->RedoRecPtr);
t_thrd.xlog_cxt.RedoRecPtr = Insert->RedoRecPtr;
}
t_thrd.xlog_cxt.doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= t_thrd.xlog_cxt.RedoRecPtr && t_thrd.xlog_cxt.doPageWrites) {
END_CRIT_SECTION();
return InvalidXLogRecPtr;
}
* Reserve space for the record in the WAL. This also sets the xl_prev
* pointer.
*/
if (isLogSwitch) {
XLogRecPtr tmp_xl_prev = InvalidXLogRecPtr;
inserted = ReserveXLogSwitch(&StartPos, &EndPos, &tmp_xl_prev);
rechdr->xl_prev = tmp_xl_prev;
} else {
ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, &rechdr->xl_prev, &currlrc);
inserted = true;
}
if (inserted) {
uint32 current_entry =
currlrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT() - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
rdata_crc = rechdr->xl_crc;
COMP_CRC32C(rdata_crc, (XLogRecord *)rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc);
rechdr->xl_crc = rdata_crc;
* All the record data, including the header, is now ready to be
* inserted. Copy the record in the space reserved.
*/
CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, StartPos, EndPos, &currlrc);
* Examine the entry's LRC and status all together to make sure the writer already
* finishes updating this entry and this entry is now owned by this thread
*
* If the status entry is owned by this thread, update it with the copied xlog record's
* end LSN and set the WAL copy status to WAL_COPIED to signal copy completion as well
*
* XLog switch already handles its flush in CopyXLogRecordToWAL so here we only
* update the WAL copy status for the non switch records
*/
if (!isLogSwitch) {
if (g_instance.wal_cxt.isWalWriterUp) {
do {
stop = (status_entry_ptr->LRC == currlrc && (status_entry_ptr->endLSN == 0));
} while (!stop);
} else {
while (status_entry_ptr->LRC != currlrc) {
XLogSelfFlush();
}
}
* We now own this status entry
* Update status_entry_ptr->endLSN with an atomic call to prevent half updated value being
* read by the writer. The writer should use atomic read
* The 2nd CAS call should succeed in updating the value since this thread is the only
* one updating this field at this moment
*/
(void)pg_atomic_write_u64((volatile uint64 *)(&status_entry_ptr->endLSN), need_flush ? EndPos | WAL_COMMIT_BIT : EndPos);
}
if (g_instance.wal_cxt.isWalWriterSleeping) {
pthread_mutex_lock(&g_instance.wal_cxt.criticalEntryMutex);
pthread_cond_signal(&g_instance.wal_cxt.criticalEntryCV);
pthread_mutex_unlock(&g_instance.wal_cxt.criticalEntryMutex);
}
} else {
* This was an xlog-switch record, but the current insert location was
* already exactly at the beginning of a segment, so there was no need
* to do anything.
*/
}
if (isLogSwitch) {
* Swap the recorded LRC back into the shared memory location after adding 1
* for the next WAL insert thread to pick up
*/
StopSuspendWalInsert(currlrc);
}
MarkCurrentTransactionIdLoggedIfAny();
END_CRIT_SECTION();
* Use the StartPos update the dirty page queue reclsn, the dirty page push to
* dirty page queue use this reclsn, then record XLog, the dirty page reclsn is
* smaller than the redo lsn.
*/
update_dirty_page_queue_rec_lsn(StartPos);
* Update shared LogwrtRqst.Write, if we crossed page boundary.
*/
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) {
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
if (t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write < EndPos) {
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndPos;
}
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
* If this was an XLOG_SWITCH record, flush the record and the empty
* padding space that fills the rest of the segment, and perform
* end-of-segment actions (eg, notifying archiver).
*/
if (isLogSwitch) {
TRACE_POSTGRESQL_XLOG_SWITCH();
XLogWaitFlush(EndPos);
* Even though we reserved the rest of the segment for us, which is
* reflected in EndPos, we return a pointer to just the end of the
* xlog-switch record.
*/
if (inserted) {
EndPos = StartPos + MAXALIGN(SizeOfXLogRecord);
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ && EndPos % XLOG_BLCKSZ != 0) {
if (EndPos % XLogSegSize == EndPos % XLOG_BLCKSZ) {
EndPos += SizeOfXLogLongPHD;
} else {
EndPos += SizeOfXLogShortPHD;
}
}
}
}
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ", (uint32)(EndPos >> 32), (uint32)EndPos);
xlog_outrec(&buf, rechdr);
if (rdata->data != NULL) {
appendStringInfo(&buf, " - ");
RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
}
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
t_thrd.xlog_cxt.ProcLastRecPtr = StartPos;
t_thrd.xlog_cxt.XactLastRecEnd = EndPos;
return EndPos;
}
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
* and data, use the higher-level routines in xloginsert.cpp.
*
* If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
* WAL record applies to, that were not included in the record as full page
* images. If fpw_lsn >= RedoRecPtr, the function does not perform the
* insertion and returns InvalidXLogRecPtr. The caller can then recalculate
* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
* record is always inserted.
*
* The first XLogRecData in the chain must be for the record header, and its
* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
* xl_crc fields in the header, the rest of the header must already be filled
* by the caller.
*
* Returns XLOG pointer to end of record (beginning of next record).
* This can be used as LSN for data pages affected by the logged action.
* (LSN is the XLOG point up to which the XLOG must be flushed to disk
* before the data page can be written out. This implements the basic
* WAL rule "write the log before the data".)
*
* @in rdata: the xlog data.
* @in fpw_lsn: the LSN of full page write.
* @in isupgrade: whether this xlog insert is for upgrade.
* @return: the LSN of the insert end position.
*/
XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn, bool need_flush)
{
#ifdef __aarch64__
* In ARM architecture, insert an XLOG record represented by an already-constructed chain of data
* chunks. If the record is LogSwitch or upgrade data, insert the record in single mode, and in other
* situation, insert the record in group mode.
*/
XLogRecord *rechdr = (XLogRecord *)rdata->data;
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && rechdr->xl_info == XLOG_SWITCH);
if (isLogSwitch || !u_sess->attr.attr_storage.enable_xlog_insert_record_group) {
return XLogInsertRecordSingle(rdata, fpw_lsn, need_flush);
} else {
return XLogInsertRecordGroup(rdata, fpw_lsn, need_flush);
}
#else
return XLogInsertRecordSingle(rdata, fpw_lsn);
#endif
}
* Reserves the right amount of space for a record of given size from the WAL.
* *StartPos is set to the beginning of the reserved section, *EndPos to
* its end+1. *PrevPtr is set to the beginning of the previous record; it is
* used to set the xl_prev of this record.
*
* This is the performance critical part of XLogInsert that must be serialized
* across backends. The rest can happen mostly in parallel. Try to keep this
* section as short as possible, insertpos_lck can be heavily contended on a
* busy system.
*
* NB: The space calculation here must match the code in CopyXLogRecordToWAL,
* where we actually copy the record to the reserved space.
*/
static void ReserveXLogInsertLocation(uint32 size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr,
int32* const currlrc_ptr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
size = MAXALIGN(size);
Assert(size > SizeOfXLogRecord);
* The duration the spinlock needs to be held is minimized by minimizing
* the calculations that have to be done while holding the lock. The
* current tip of reserved WAL is kept in CurrBytePos, as a byte position
* that only counts "usable" bytes in WAL, that is, it excludes all WAL
* page headers. The mapping between "usable" byte positions and physical
* positions (XLogRecPtrs) can be done outside the locked region, and
* because the usable byte position doesn't include any headers, reserving
* X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
union Union128 compare;
union Union128 exchange;
union Union128 current;
compare.value = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
Assert(sizeof(Insert->CurrBytePos) == SIZE_OF_UINT64);
Assert(sizeof(Insert->PrevByteSize) == SIZE_OF_UINT32);
Assert(sizeof(Insert->CurrLRC) == SIZE_OF_UINT32);
loop1:
* |CurrBytePos |bytePos | CurrLRC
* ------------------------+------------ +-----------
* |64 bits |32 bits |32 bits
*/
* Suspend WAL insert threads when currlrc equals WAL_COPY_SUSPEND
*/
if (unlikely(compare.struct128.LRC == WAL_COPY_SUSPEND)) {
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
pg_usleep(1);
goto loop1;
}
exchange.struct128.currentBytePos = compare.struct128.currentBytePos + size;
exchange.struct128.byteSize = size;
exchange.struct128.LRC = (compare.struct128.LRC + 1) & 0x7FFFFFFF;
current.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos, compare.value,
exchange.value);
if (!UINT128_IS_EQUAL(compare.value, current.value)) {
UINT128_COPY(compare.value, current.value);
goto loop1;
}
*currlrc_ptr = compare.struct128.LRC;
*StartPos = XLogBytePosToRecPtr(compare.struct128.currentBytePos);
*EndPos = XLogBytePosToEndRecPtr(exchange.struct128.currentBytePos);
*PrevPtr = XLogBytePosToRecPtr(compare.struct128.currentBytePos - compare.struct128.byteSize);
#else
loop1:
SpinLockAcquire(&Insert->insertpos_lck);
if (unlikely(Insert->CurrLRC == WAL_COPY_SUSPEND)) {
SpinLockRelease(&Insert->insertpos_lck);
pg_usleep(1);
goto loop1;
}
*currlrc_ptr = Insert->CurrLRC;
*PrevPtr = XLogBytePosToRecPtr(Insert->CurrBytePos - Insert->PrevByteSize);
*StartPos = XLogBytePosToRecPtr(Insert->CurrBytePos);
Insert->CurrBytePos = Insert->CurrBytePos + size;
*EndPos = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
Insert->PrevByteSize = size;
Insert->CurrLRC = (Insert->CurrLRC + 1) & 0x7FFFFFFF;
SpinLockRelease(&Insert->insertpos_lck);
#endif
}
* Like ReserveXLogInsertLocation(), but for an xlog-switch record.
*
* A log-switch record is handled slightly differently. The rest of the
* segment will be reserved for this insertion, as indicated by the returned
* *EndPos value. However, if we are already at the beginning of the current
* segment, *StartPos and *EndPos are set to the current location without
* reserving any space, and the function returns false.
*/
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 startbytepos;
uint64 endbytepos;
uint32 prevbytesize;
int32 currlrc;
uint32 size = MAXALIGN(SizeOfXLogRecord);
XLogRecPtr ptr;
uint32 segleft;
* These calculations are a bit heavy-weight to be done while holding a
* spinlock, but since we're holding all the WAL insertion locks, there
* are no other inserters competing for it. GetXLogInsertRecPtr() does
* compete for it, but that's not called very frequently.
*/
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
uint128_u exchange;
uint128_u current;
uint128_u compare = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
loop:
startbytepos = compare.u64[0];
ptr = XLogBytePosToEndRecPtr(startbytepos);
if (ptr % XLogSegSize == 0) {
*EndPos = *StartPos = ptr;
return false;
}
endbytepos = startbytepos + size;
prevbytesize = compare.u32[2];
currlrc = (int32)compare.u32[3];
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
segleft = XLogSegSize - ((*EndPos) % XLogSegSize);
if (segleft != XLogSegSize) {
*EndPos += segleft;
endbytepos = XLogRecPtrToBytePos(*EndPos);
}
exchange.u64[0] = endbytepos;
size = (uint32)(endbytepos - startbytepos);
exchange.u64[1] = (uint64)size;
exchange.u32[3] = currlrc;
current = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos, compare, exchange);
if (!UINT128_IS_EQUAL(compare, current)) {
UINT128_COPY(compare, current);
goto loop;
}
#else
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
ptr = XLogBytePosToEndRecPtr(startbytepos);
if (ptr % XLogSegSize == 0) {
SpinLockRelease(&Insert->insertpos_lck);
*EndPos = *StartPos = ptr;
return false;
}
endbytepos = startbytepos + size;
prevbytesize = Insert->PrevByteSize;
currlrc = Insert->CurrLRC;
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
segleft = XLogSegSize - ((*EndPos) % XLogSegSize);
if (segleft != XLogSegSize) {
*EndPos += segleft;
endbytepos = XLogRecPtrToBytePos(*EndPos);
}
Insert->CurrBytePos = endbytepos;
Insert->PrevByteSize = (uint32)(endbytepos - startbytepos);
Insert->CurrLRC = currlrc;
SpinLockRelease(&Insert->insertpos_lck);
#endif
*PrevPtr = XLogBytePosToRecPtr(startbytepos - prevbytesize);
Assert((*EndPos) % XLogSegSize == 0);
Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(*PrevPtr) == (startbytepos - prevbytesize));
return true;
}
/*
 * Suspend WAL insertion: replace the shared CurrLRC with WAL_COPY_SUSPEND and
 * hand the LRC that was stored there back through *lastlrc_ptr. Afterwards
 * wait until the LRC just before it has been scanned (lastLRCScanned), unless
 * nobody has inserted anything yet.
 */
static void StartSuspendWalInsert(int32 *const lastlrc_ptr)
{
    volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
    int32 currlrc;
    int32 prevSlot;
    volatile WALInsertStatusEntry *prevEntry;

#if (defined(__x86_64__) || defined(__aarch64__)) && !defined(__USE_SPINLOCK)
    uint64 bytePosSeen;
    uint32 prevSizeSeen;
    uint128_u seen;
    uint128_u wanted;
    uint128_u found;

    seen = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
    Assert(sizeof(Insert->CurrBytePos) == 8);
    Assert(sizeof(Insert->PrevByteSize) == 4);
    Assert(sizeof(Insert->CurrLRC) == 4);

    for (;;) {
        /*
         * |CurrBytePos             |PrevByteSize | CurrLRC
         * ------------------------+------------ +-----------
         * |64 bits                 |32 bits      |32 bits
         */
        bytePosSeen = seen.u64[0];
        prevSizeSeen = seen.u32[2];
        currlrc = (int32)seen.u32[3];

        /* Somebody else has the inserters suspended already: re-read and try again. */
        if (unlikely(currlrc == WAL_COPY_SUSPEND)) {
            seen = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
            continue;
        }

        /* Report the LRC that was stored before we replace it. */
        *lastlrc_ptr = currlrc;
        currlrc = WAL_COPY_SUSPEND;

        /* Only the LRC changes; byte position and previous size are written back as read. */
        wanted.u64[0] = bytePosSeen;
        wanted.u32[2] = prevSizeSeen;
        wanted.u32[3] = currlrc;

        found = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos, seen, wanted);
        if (UINT128_IS_EQUAL(seen, found)) {
            break;
        }
        /* Lost the race; start over from the value that is there now. */
        UINT128_COPY(seen, found);
    }
#else
    for (;;) {
        SpinLockAcquire(&Insert->insertpos_lck);
        currlrc = Insert->CurrLRC;
        if (unlikely(currlrc == WAL_COPY_SUSPEND)) {
            /* Already suspended by somebody else: drop the lock and look again. */
            SpinLockRelease(&Insert->insertpos_lck);
            continue;
        }
        break;
    }

    /* Still holding the lock: report the stored LRC, then put the suspend mark in. */
    *lastlrc_ptr = currlrc;
    Insert->CurrLRC = WAL_COPY_SUSPEND;
    SpinLockRelease(&Insert->insertpos_lck);
#endif

    /*
     * *lastlrc_ptr is the LRC the next WAL insert thread would have picked up,
     * i.e. the value that sat in shared memory right before WAL_COPY_SUSPEND
     * went in. The last inserter that did get an LRC therefore holds
     * "*lastlrc_ptr - 1", and that is the one to wait for.
     */
    int prevLrc = (*lastlrc_ptr - 1) & 0x7FFFFFFF;
    prevSlot = prevLrc & (GET_WAL_INSERT_STATUS_ENTRY_CNT() - 1);
    prevEntry = &g_instance.wal_cxt.walInsertStatusTable[prevSlot];

    bool keepWaiting;
    do {
        /* Without a WAL writer we have to drive the flush ourselves. */
        if (!g_instance.wal_cxt.isWalWriterUp) {
            XLogSelfFlush();
        }

        /*
         * Keep waiting until "*lastlrc_ptr - 1" has been scanned. The exception
         * is the case where nobody inserted before us after StartupXLOG, which
         * shows as (*lastlrc_ptr == 0 && prevEntry->endLSN == 0). endLSN is part
         * of the test because the 31-bit LRC can wrap around to 0 as well.
         */
        keepWaiting = ((*lastlrc_ptr != 0 || prevEntry->endLSN != 0) &&
                       (g_instance.wal_cxt.lastLRCScanned != prevLrc));
    } while (keepWaiting);
}
* This function must be called by whoever successfully suspended all WAL insert threads or
* it will be a disaster
*/
static void StopSuspendWalInsert(int32 lastlrc)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
* Since I already suspend all other WAL copy threads,
* it is safe to do a direct 32 bit value assignment
*/
Insert->CurrLRC = lastlrc;
return;
}
* Subroutine of XLogInsert. Copies a WAL record to an already-reserved
* area in the WAL.
*/
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos,
XLogRecPtr EndPos, int32 *const currlrc_ptr)
{
char *currpos = NULL;
uint32 freespace;
int written;
XLogRecPtr CurrPos;
XLogPageHeader pagehdr;
errno_t errorno = EOK;
bool stop = true;
* The following 2 variables are used to handle large records
*/
uint32 startBufPageIdx = XLogRecPtrToBufIdx(StartPos);
int numHitsOnStartPage = 1;
* Get a pointer to the right place in the right WAL buffer to start
* inserting to.
*/
CurrPos = StartPos;
currpos = GetXLogBuffer<false>(CurrPos);
freespace = INSERT_FREESPACE(CurrPos);
* there should be enough space for at least the first field (xl_tot_len)
* on this page.
*/
Assert(freespace >= sizeof(uint32));
written = 0;
while (rdata != NULL) {
char *rdata_data = rdata->data;
uint32 rdata_len = rdata->len;
while (rdata_len > freespace) {
* Write what fits on this page, and continue on the next page.
*/
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, freespace);
securec_check(errorno, "", "");
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
* We hit the page we start from within the same copy. We must initiate
* a flush without updating WAL copy status entry or we will be stuck
*/
if (unlikely(XLogRecPtrToBufIdx(CurrPos) == startBufPageIdx)) {
numHitsOnStartPage++;
XLogSelfFlushWithoutStatus(numHitsOnStartPage, CurrPos, *currlrc_ptr);
}
* Get pointer to beginning of next page, and set the xlp_rem_len
* in the page header. Set XLP_FIRST_IS_CONTRECORD.
*
* It's safe to set the contrecord flag and xlp_rem_len without a
* lock on the page. All the other flags were already set when the
* page was initialized, in AdvanceXLInsertBuffer, and we're the
* only backend that needs to set the contrecord flag.
*/
currpos = GetXLogBuffer<false>(CurrPos);
pagehdr = (XLogPageHeader)currpos;
pagehdr->xlp_rem_len = write_len - written;
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
if (CurrPos % XLogSegSize == 0) {
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
} else {
CurrPos += SizeOfXLogShortPHD;
currpos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, rdata_len);
securec_check(errorno, "", "");
currpos += rdata_len;
CurrPos += rdata_len;
freespace -= rdata_len;
written += rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
CurrPos = MAXALIGN(CurrPos);
* If this was an xlog-switch, it's not enough to write the switch record,
* we also have to consume all the remaining space in the WAL segment.
* We have already reserved it for us, but we still need to make sure it's
* allocated and zeroed in the WAL buffers so that when the caller (or
* someone else) does XLogWrite(), it can really write out all the zeros.
*/
if (isLogSwitch && CurrPos % XLogSegSize != 0) {
int currlrc = *currlrc_ptr;
Assert(write_len == SizeOfXLogRecord);
* We do this one page at a time, to make sure we don't deadlock
* against ourselves if wal_buffers < XLOG_SEG_SIZE.
*/
Assert(EndPos % XLogSegSize == 0);
CurrPos += freespace;
* When inserting XLOG switch, we communicate with the WAL writer
* page by page
*/
while (CurrPos <= EndPos) {
uint32 current_entry =
currlrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT() - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
* Initialize the WAL buffer page if needed
* Please note that endPtr passed into XLogWaitBufferInit is the LSN currently tagged
* on the buffer page we want
*/
int idx = (int)XLogRecPtrToBufIdx(CurrPos - 1);
XLogRecPtr endPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
XLogRecPtr expectedEndPtr = CurrPos;
if (expectedEndPtr != endPtr) {
pgstat_report_waitevent(WAIT_EVENT_WAL_BUFFER_FULL);
XLogRecPtr waitEndPtr = X_LOG_PREV_BLOCK_IN_BUF_PAGE(expectedEndPtr);
XLogWaitBufferInit(waitEndPtr);
pgstat_report_waitevent(WAIT_EVENT_END);
}
* Update the WAL copy status entry to trigger flush
*
* First we wait to own a WAL copy status entry assigned
* to us if the WAL writer is alive
*
* If the wal writer is not up for the xlog flush job, we need to invoke XLogBackgroundFlush
* ourselves to push the status array recycling.
*/
if (g_instance.wal_cxt.isWalWriterUp) {
do {
stop = (status_entry_ptr->LRC == currlrc && (status_entry_ptr->endLSN == 0));
} while (!stop);
} else if (status_entry_ptr->LRC != currlrc) {
* Here we don't need to loop on XLogSelfFlush to ensure our stuff is
* flushed because:
* 1) I am an XLog switch so I already called StartSuspendWalInsert to suspend
* all other XLog insert accesses and the flush function already scanned till
* the LRC right before the suspended LRC
* 2) XLog switch is inserted and flushed page by page sequentially
*/
XLogSelfFlush();
}
* We own the entry so we don't expect need to loop the CAS call
* The CAS call here is to protext reads in the WAL writer thread
*/
(void)pg_atomic_exchange_u64(&status_entry_ptr->endLSN, CurrPos);
* Trigger a flush if WAL writer thread is not up yet
*/
if (!g_instance.wal_cxt.isWalWriterUp) {
* Here we don't need to loop on XLogSelfFlush to ensure our stuff is
* flushed because:
* 1) I am an XLog switch so I already called StartSuspendWalInsert to suspend
* all other XLog insert accesses and the flush function already scanned till
* the LRC right before the suspended LRC
* 2) XLog switch is inserted and flushed page by page sequentially
*/
XLogSelfFlush();
}
currlrc = (currlrc + 1) & 0x7FFFFFFF;
CurrPos += XLOG_BLCKSZ;
}
*currlrc_ptr = currlrc;
}
pgstat_report_waitevent_count(WAIT_EVENT_WAL_BUFFER_ACCESS);
}
* Get a pointer to the right location in the WAL buffer containing the
* given XLogRecPtr.
*
* If the page is not initialized yet, it is initialized. That might require
* evicting an old dirty buffer from the buffer cache, which means I/O.
*
* The caller must ensure that the page containing the requested location
* isn't evicted yet, and won't be evicted. The way to ensure that is to
* hold onto an XLogInsertLock with the xlogInsertingAt position set to
* something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
* to evict an old page from the buffer. (This means that once you call
* GetXLogBuffer() with a given 'ptr', you must not access anything before
* that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
* later, because older buffers might be recycled already)
*/
template <bool isGroupInsert>
static char *GetXLogBuffer(XLogRecPtr ptr, PGPROC *proc)
{
int idx;
XLogRecPtr endptr;
XLogRecPtr pastEndPtr = InvalidXLogRecPtr;
XLogRecPtr expectedEndPtr;
* Fast path for the common case that we need to access again the same
* page as last time.
*/
if (ptr / XLOG_BLCKSZ == t_thrd.xlog_cxt.cachedPage) {
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
return t_thrd.xlog_cxt.cachedPos + ptr % XLOG_BLCKSZ;
}
* The XLog buffer cache is organized so that a page is always loaded
* to a particular buffer. That way we can easily calculate the buffer
* a given page must be loaded into, from the XLogRecPtr alone.
*/
idx = XLogRecPtrToBufIdx(ptr);
* See what page is loaded in the buffer at the moment. It could be the
* page we're looking for, or something older. It can't be anything newer
* - that would imply the page we're looking for has already been written
* out to disk and evicted, and the caller is responsible for making sure
* that doesn't happen.
*
* However, we don't hold a lock while we read the value. If someone has
* just initialized the page, it's possible that we get a "torn read" of
* the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
* that case we will see a bogus value. That's ok, we'll grab the mapping
* lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
* the page we're looking for. But it means that when we do this unlocked
* read, we might see a value that appears to be ahead of the page we're
* looking for. Don't PANIC on that, until we've verified the value while
* holding the lock.
*/
expectedEndPtr = ptr;
expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
endptr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr) {
pgstat_report_waitevent(WAIT_EVENT_WAL_BUFFER_FULL);
* We need to wait for the background flush to flush till the endptr
* The background flush will also initialize the flushed pages
* with new pages
*/
XLogRecPtr waitEndPtr = X_LOG_PREV_BLOCK_IN_BUF_PAGE(expectedEndPtr);
XLogWaitBufferInit(waitEndPtr);
pgstat_report_waitevent(WAIT_EVENT_END);
pastEndPtr = endptr;
endptr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr) {
ereport(PANIC, (errmsg("could not find WAL buffer for %X/%X", (uint32)(ptr >> 32), (uint32)ptr),
errdetail("endptr is %X/%X, pastEndptr is %X/%X.", (uint32)(endptr >> 32), (uint32)endptr,
(uint32)(pastEndPtr >> 32), (uint32)pastEndPtr)));
}
} else {
* Make sure the initialization of the page is visible to us, and
* won't arrive later to overwrite the WAL data we write on the page.
*/
pg_memory_barrier();
}
* Found the buffer holding this page. Return a pointer to the right
* offset within the page.
*/
t_thrd.xlog_cxt.cachedPage = ptr / XLOG_BLCKSZ;
t_thrd.xlog_cxt.cachedPos = t_thrd.shemem_ptr_cxt.XLogCtl->pages + idx * (Size)XLOG_BLCKSZ;
Assert((((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_magic == XLOG_PAGE_MAGIC));
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ) ||
(((((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr >> 32) |
(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr << 32))) == ptr - (ptr % XLOG_BLCKSZ));
return t_thrd.xlog_cxt.cachedPos + ptr % XLOG_BLCKSZ;
}
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
* is the position starting from the beginning of WAL, excluding all WAL
* page headers.
*/
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) {
seg_offset = bytesleft + SizeOfXLogLongPHD;
} else {
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
return result;
}
* Like XLogBytePosToRecPtr, but if the position is at a page boundary,
* returns a pointer to the beginning of the page (ie. before page header),
* not to where the first xlog record on that page would go to. This is used
* when converting a pointer to the end of a record.
*/
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) {
if (bytesleft == 0) {
seg_offset = 0;
} else {
seg_offset = bytesleft + SizeOfXLogLongPHD;
}
} else {
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
if (bytesleft == 0) {
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
} else {
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
return result;
}
* Convert an XLogRecPtr to a "usable byte position".
*/
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr)
{
uint64 fullsegs;
uint32 fullpages;
uint32 offset;
uint64 result;
XLByteToSeg(ptr, fullsegs);
fullpages = (ptr % XLogSegSize) / XLOG_BLCKSZ;
offset = ptr % XLOG_BLCKSZ;
if (fullpages == 0) {
result = fullsegs * UsableBytesInSegment;
if (offset > 0) {
Assert(offset >= SizeOfXLogLongPHD);
result += offset - SizeOfXLogLongPHD;
}
} else {
result = fullsegs * UsableBytesInSegment + (XLOG_BLCKSZ - SizeOfXLogLongPHD)
+ (fullpages - 1) * UsableBytesInPage;
if (offset > 0) {
Assert(offset >= SizeOfXLogShortPHD);
result += offset - SizeOfXLogShortPHD;
}
}
return result;
}
* XLogArchiveNotify
*
* Create an archive notification file
*
* The name of the notification file is the message that will be picked up
* by the archiver, e.g. we write 0000000100000001000000C6.ready
* and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
* then when complete, rename it to 0000000100000001000000C6.done
*/
static void XLogArchiveNotify(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
FILE *fd = NULL;
errno_t errorno = EOK;
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
fd = AllocateFile(archiveStatusPath, "w");
if (fd == NULL) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not create archive status file \"%s\": %s", archiveStatusPath, TRANSLATE_ERRNO)));
return;
}
if (FreeFile(fd)) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not write archive status file \"%s\": %s", archiveStatusPath, TRANSLATE_ERRNO)));
fd = NULL;
return;
}
if (IsUnderPostmaster) {
SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}
}
* Convenience routine to notify using segment number representation of filename
*/
static void XLogArchiveNotifySeg(XLogSegNo segno)
{
char xlog[MAXFNAMELEN];
errno_t errorno = EOK;
errorno = snprintf_s(xlog, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveNotify(xlog);
}
* XLogArchiveForceDone
*
* Emit notification forcibly that an XLOG segment file has been successfully
* archived, by creating <XLOG>.done regardless of whether <XLOG>.ready
* exists or not.
*/
void XLogArchiveForceDone(const char *xlog)
{
char archiveReady[MAXPGPATH];
char archiveDone[MAXPGPATH];
struct stat stat_buf;
FILE *fd = NULL;
errno_t errorno = EOK;
errorno = snprintf_s(archiveDone, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveDone, &stat_buf) == 0) {
return;
}
errorno = snprintf_s(archiveReady, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveReady, &stat_buf) == 0) {
(void)durable_rename(archiveReady, archiveDone, WARNING);
return;
}
fd = AllocateFile(archiveDone, "w");
if (fd == NULL) {
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not create archive status file \"%s\": %s", archiveDone, TRANSLATE_ERRNO)));
return;
}
if (FreeFile(fd)) {
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write archive status file \"%s\": %s", archiveDone, TRANSLATE_ERRNO)));
fd = NULL;
return;
}
}
* XLogArchiveCheckDone
*
* This is called when we are ready to delete or recycle an old XLOG segment
* file or backup history file. If it is okay to delete it then return true.
* If it is not time to delete it, make sure a .ready file exists, and return
* false.
*
* If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
* then return false; else create <XLOG>.ready and return false.
*
* The reason we do things this way is so that if the original attempt to
* create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
*/
static bool XLogArchiveCheckDone(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
if (!XLogArchivingActive() || RecoveryInProgress()) {
return true;
}
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
XLogArchiveNotify(xlog);
return false;
}
* XLogArchiveIsBusy
*
* Check to see if an XLOG segment file is still unarchived.
* This is almost but not quite the inverse of XLogArchiveCheckDone: in
* the first place we aren't chartered to recreate the .ready file, and
* in the second place we should consider that if the file is already gone
* then it's not busy. (This check is needed to handle the race condition
* that a checkpoint already deleted the no-longer-needed file.)
*/
static bool XLogArchiveIsBusy(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
* Check to see if the WAL file has been removed by checkpoint, which
* implies it has already been archived, and explains why we can't see a
* status file for it.
*/
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlog);
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) != 0 && errno == ENOENT) {
return false;
}
return true;
}
* XLogArchiveIsReady
*
* Check to see if an XLOG segment file has an archive notification (.ready)
* file.
*/
bool XLogArchiveIsReady(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t rc = EOK;
rc = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(rc, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
return false;
}
* XLogArchiveCleanup
*
* Cleanup archive notification file(s) for a particular xlog segment
*/
static void XLogArchiveCleanup(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
errno_t errorno = EOK;
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
unlink(archiveStatusPath);
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
unlink(archiveStatusPath);
}
* Initialize XLOG buffers, writing out old buffers if they still contain
* unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
* true, initialize as many pages as we can without having to write out
* unwritten data. Any new pages are initialized to zeros, with pages headers
* initialized properly.
*/
template <bool isGroupInsert>
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic, PGPROC *proc)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
int nextidx;
XLogRecPtr OldPageRqstPtr;
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
int npages = 0;
errno_t errorno = EOK;
XLogwrtResult *LogwrtResultPtr = NULL;
TimeLineID xlogTimeLineID = 0;
#ifdef __aarch64__
if (isGroupInsert) {
LogwrtResultPtr = (XLogwrtResult *)proc->xlogGroupLogwrtResult;
xlogTimeLineID = proc->xlogGroupTimeLineID;
} else {
LogwrtResultPtr = t_thrd.xlog_cxt.LogwrtResult;
xlogTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
}
#else
LogwrtResultPtr = t_thrd.xlog_cxt.LogwrtResult;
xlogTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
#endif
LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
nextidx = (int)XLogRecPtrToBufIdx(t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo);
* Get ending-offset of the buffer page we need to replace (this may
* be zero if the buffer hasn't been used yet). Fall through if it's
* already written out.
*/
OldPageRqstPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx];
* Now that we have the lock, check if someone initialized the page
* already.
*/
while (upto >= OldPageRqstPtr) {
NewPageBeginPtr = t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo;
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == (uint32)nextidx);
NewPage = (XLogPageHeader)(t_thrd.shemem_ptr_cxt.XLogCtl->pages + nextidx * (Size)XLOG_BLCKSZ);
* Be sure to re-zero the buffer so that bytes beyond what we've
* written will look like zeroes and not valid XLOG records...
*/
errorno = memset_s((char *)NewPage, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(errorno, "", "");
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
NewPage->xlp_tli = xlogTimeLineID;
NewPage->xlp_pageaddr = NewPageBeginPtr;
* If online backup is not in progress, mark the header to indicate
* that* WAL records beginning in this page have removable backup
* blocks. This allows the WAL archiver to know whether it is safe to
* compress archived WAL data by transforming full-block records into
* the non-full-block format. It is sufficient to record this at the
* page level because we force a page switch (in fact a segment switch)
* when starting a backup, so the flag will be off before any records
* can be written during the backup. At the end of a backup, the last
* page will be marked as all unsafe when perhaps only part is unsafe,
* but at worst the archiver would miss the opportunity to compress a
* few records.
*/
if (!Insert->forcePageWrites) {
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
}
if ((NewPage->xlp_pageaddr % XLogSegSize) == 0) {
XLogLongPageHeader NewLongPage = (XLogLongPageHeader)NewPage;
NewLongPage->xlp_sysid = t_thrd.shemem_ptr_cxt.ControlFile->system_identifier;
NewLongPage->xlp_seg_size = XLogSegSize;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
* Make sure the initialization of the page becomes visible to others
* before the xlblocks update. GetXLogBuffer() reads xlblocks without
* holding a lock.
*/
pg_write_barrier();
*((volatile XLogRecPtr *)&t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = NewPageEndPtr;
nextidx = (int)XLogRecPtrToBufIdx(t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo);
OldPageRqstPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx];
npages++;
}
LWLockRelease(WALBufMappingLock);
#ifdef WAL_DEBUG
if (npages > 0) {
ereport(DEBUG1, (errmsg("initialized %d pages, upto %X/%X", npages, (uint32)(NewPageEndPtr >> 32),
(uint32)NewPageEndPtr)));
}
#endif
}
template static void AdvanceXLInsertBuffer<true>(XLogRecPtr, bool, PGPROC *);
template static void AdvanceXLInsertBuffer<false>(XLogRecPtr, bool, PGPROC *);
* Check whether we've consumed enough xlog space that a checkpoint is needed.
*
* new_segno indicates a log file that has just been filled up (or read
* during recovery). We measure the distance from RedoRecPtr to new_segno
* and see if that exceeds CheckPointSegments.
*
* Note: it is caller's responsibility that RedoRecPtr is up-to-date.
*/
static bool XLogCheckpointNeeded(XLogSegNo new_segno)
{
XLogSegNo old_segno;
XLByteToSeg(t_thrd.xlog_cxt.RedoRecPtr, old_segno);
if (new_segno >= old_segno + ((uint32)(u_sess->attr.attr_storage.CheckPointSegments) - 1)) {
return true;
}
return false;
}
#ifndef ENABLE_MULTIPLE_NODES
* Write the log to paxos storage at least as far as PaxosWriteRqst indicates.
* Must be called with WALPaxosWriteLock held.
*/
static bool XLogWritePaxos(XLogRecPtr PaxosWriteRqst)
{
int curridx;
uint32 offset;
Size WriteTotal;
Size CriticalNBytes;
Size nBytes;
Size XLogCacheSize;
XLogRecPtr WriteLSN;
* Update local LogwrtResult (caller probably did this already, but...)
*/
*t_thrd.xlog_cxt.LogwrtPaxos = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtPaxos;
XLogCacheSize = (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1) * XLOG_BLCKSZ;
while (XLByteLT(t_thrd.xlog_cxt.LogwrtPaxos->Write, PaxosWriteRqst)) {
curridx = XLogRecPtrToBufIdx(t_thrd.xlog_cxt.LogwrtPaxos->Write);
offset = t_thrd.xlog_cxt.LogwrtPaxos->Write % XLOG_BLCKSZ;
WriteTotal = PaxosWriteRqst - t_thrd.xlog_cxt.LogwrtPaxos->Write;
CriticalNBytes = (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck - curridx + 1) * XLOG_BLCKSZ - offset;
* nBytes is the maximum size that we can send to paxos this time, we set it to be
* the minimum of WriteTotal/MaxSendSizeBytes/CriticalNBytes. But when MaxSendSizeBytes
* is the minimum, it may cut a record, so we round back to page boundary by reference
* to stream replcation's flow limit.
*/
nBytes = Min(WriteTotal, Min(MaxSendSizeBytes, CriticalNBytes));
if (nBytes != WriteTotal && nBytes != CriticalNBytes) {
XLogRecPtr tmpLsn = t_thrd.xlog_cxt.LogwrtPaxos->Write + MaxSendSizeBytes;
uint32 tmpoffset = tmpLsn % XLOG_BLCKSZ;
Assert(MaxSendSizeBytes > tmpoffset);
nBytes = MaxSendSizeBytes - tmpoffset;
ereport(
DEBUG1,
(errmsg("Flow limit! nBytes is %ld, WriteTotal is %ld, SendLimitBytes is %ld, CriticalNBytes is %ld",
nBytes, WriteTotal, MaxSendSizeBytes, CriticalNBytes)));
}
Assert(XLByteLE(nBytes, XLogCacheSize));
char *from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + curridx * (Size)XLOG_BLCKSZ + offset;
WriteLSN = t_thrd.xlog_cxt.LogwrtPaxos->Write + nBytes;
unsigned long long paxosIdx = 0;
while (dcf_write(1, from, nBytes, WriteLSN, &paxosIdx) != 0) {
ereport(WARNING, (errmsg("Failed writing to paxos: lsn = %ld, len = %ld, paxosIdx = %llu.", WriteLSN,
nBytes, paxosIdx)));
pg_usleep(5000);
}
ereport(DEBUG1,
(errmsg("Writed to paxos: lsn = %ld, len = %ld, paxosIdx = %llu.", WriteLSN, nBytes, paxosIdx)));
t_thrd.xlog_cxt.LogwrtPaxos->Write = WriteLSN;
}
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtPaxos.Write = t_thrd.xlog_cxt.LogwrtPaxos->Write;
SpinLockRelease(&xlogctl->info_lck);
volatile DcfContextInfo *dcfCtx = t_thrd.dcf_cxt.dcfCtxInfo;
SpinLockAcquire(&dcfCtx->recordDcfIdxMutex);
* XLog upto recordLsn has been flushed to Disk, now it's safe
* to set applied index corresponding to recordLsn in DCF, which is
* called in Consensus Log call back.
*/
if (XLByteLE(dcfCtx->recordLsn, xlogctl->LogwrtResult.Flush))
dcfCtx->isRecordIdxBlocked = false;
SpinLockRelease(&dcfCtx->recordDcfIdxMutex);
}
return true;
}
#endif
void XLogUpdateShareStatus()
{
{
XLogCtlData *xlog_ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlog_ctl->info_lck);
xlog_ctl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
if (XLByteLT(xlog_ctl->LogwrtRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write)) {
xlog_ctl->LogwrtRqst.Write = t_thrd.xlog_cxt.LogwrtResult->Write;
}
if (XLByteLT(xlog_ctl->LogwrtRqst.Flush, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
xlog_ctl->LogwrtRqst.Flush = t_thrd.xlog_cxt.LogwrtResult->Flush;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = t_thrd.xlog_cxt.LogwrtResult->Flush;
}
SpinLockRelease(&xlog_ctl->info_lck);
}
g_instance.wal_cxt.lastLRCFlushed = g_instance.wal_cxt.lastLRCWrited;
pg_atomic_write_u64((volatile uint64 *)&g_instance.wal_cxt.flushResult, t_thrd.xlog_cxt.LogwrtResult->Flush);
pg_write_barrier();
WalSndWakeupProcessRequests();
SendShareStorageXLogCopyBackendWakeupRequest();
}
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
* If flexible == TRUE, we don't have to write as far as WriteRqst, but
* may stop at any convenient boundary (such as a cache or logfile boundary).
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
*
* If xlog_switch == TRUE, we are intending an xlog segment switch, so
* perform end-of-segment actions after writing the last page, even if
* it's not physically the end of its segment. (NB: this will work properly
* only if caller specifies WriteRqst == page-end and flexible == false,
* and there is some data to write.)
*
* Must be called with WALWriteLock held.
*/
static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible)
{
bool ispartialpage = false;
bool last_iteration = false;
bool finishing_seg = false;
bool use_existent = false;
bool segs_enough = true;
int curridx = 0;
int npages = 0;
int startidx = 0;
uint32 startoffset = 0;
Size actualBytes;
XLogRecPtr lastWrited = InvalidXLogRecPtr;
Assert(t_thrd.int_cxt.CritSectionCount > 0);
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
lastWrited = t_thrd.xlog_cxt.LogwrtResult->Write;
* Since successive pages in the xlog cache are consecutively allocated,
* we can usually gather multiple pages together and issue just one
* write() call. npages is the number of pages we have determined can be
* written together; startidx is the cache block index of the first one,
* and startoffset is the file offset at which it should go. The latter
* two variables are only valid when npages > 0, but we must initialize
* all of them to keep the compiler quiet.
*/
npages = 0;
startidx = 0;
startoffset = 0;
* Within the loop, curridx is the cache block index of the page to
* consider writing. We advance Write->curridx only after successfully
* writing pages. (Right now, this refinement is useless since we are
* going to PANIC if any error occurs anyway; but someday it may come in
* useful.)
*/
curridx = XLogRecPtrToBufIdx(t_thrd.xlog_cxt.LogwrtResult->Write);
while (XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, WriteRqst.Write)) {
* Make sure we're not ahead of the insert process. This could happen
* if we're passed a bogus WriteRqst.Write that is past the end of the
* last page that's been initialized by AdvanceXLInsertBuffer.
*/
XLogRecPtr EndPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[curridx];
if (!XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, EndPtr))
ereport(PANIC,
(errmsg("xlog write request %X/%X is past end of log %X/%X",
(uint32)(t_thrd.xlog_cxt.LogwrtResult->Write >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Write, (uint32)(EndPtr >> 32), (uint32)EndPtr)));
t_thrd.xlog_cxt.LogwrtResult->Write = EndPtr;
ispartialpage = XLByteLT(WriteRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write);
if (!XLByteInPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo)) {
* Switch to new logfile segment. We cannot have any pending
* pages here (since we dump what we have at segment end).
*/
Assert(npages == 0);
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
use_existent = true;
t_thrd.xlog_cxt.openLogFile = XLogFileInit(t_thrd.xlog_cxt.openLogSegNo, &use_existent, true);
t_thrd.xlog_cxt.openLogOff = 0;
segs_enough = true;
const int fullThreshold = 100;
int threshold = u_sess->attr.attr_storage.walFilePreinitThreshold;
int initNum = g_instance.attr.attr_storage.wal_file_init_num;
if (threshold < fullThreshold && initNum > 0 &&
g_instance.wal_cxt.globalEndPosSegNo != InvalidXLogSegPtr &&
g_instance.wal_cxt.globalEndPosSegNo >= t_thrd.xlog_cxt.openLogSegNo) {
segs_enough = (g_instance.wal_cxt.globalEndPosSegNo - t_thrd.xlog_cxt.openLogSegNo) >
(1.0 * initNum * (fullThreshold - threshold) / fullThreshold);
}
* Unlock WalAuxiliary thread to init new xlog segment if we are running out
* of xlog segments, or used segments is more than wal_file_preinit_threshold.
*/
if (!segs_enough) {
g_instance.wal_cxt.globalEndPosSegNo = Max(g_instance.wal_cxt.globalEndPosSegNo, t_thrd.xlog_cxt.openLogSegNo);
WakeupWalSemaphore(&g_instance.wal_cxt.walInitSegLock->l.sem);
} else if (!use_existent) {
g_instance.wal_cxt.globalEndPosSegNo = t_thrd.xlog_cxt.openLogSegNo;
WakeupWalSemaphore(&g_instance.wal_cxt.walInitSegLock->l.sem);
}
}
if (t_thrd.xlog_cxt.openLogFile <= 0) {
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
}
if (npages == 0) {
startidx = curridx;
startoffset = (t_thrd.xlog_cxt.LogwrtResult->Write - XLOG_BLCKSZ) % XLogSegSize;
}
npages++;
* Dump the set if this will be the last loop iteration, or if we are
* at the last page of the cache area (since the next page won't be
* contiguous in memory), or if we are at the end of the logfile
* segment.
*/
last_iteration = !XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, WriteRqst.Write);
finishing_seg = !ispartialpage && (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
if (last_iteration || curridx == t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck || finishing_seg) {
instr_time startTime;
instr_time endTime;
PgStat_Counter elapsedTime;
char *from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + startidx * (Size)XLOG_BLCKSZ;
Size nbytes = npages * (Size)XLOG_BLCKSZ;
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted &&
ispartialpage) {
nbytes -= (EndPtr - WriteRqst.Write);
}
#endif
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_WRITE);
INSTR_TIME_SET_CURRENT(startTime);
actualBytes = pwrite(t_thrd.xlog_cxt.openLogFile, from, nbytes, startoffset);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
* elapsedTime can be replaced by beentry->waitInfo.event_info.duration.
*/
elapsedTime = (PgStat_Counter)INSTR_TIME_GET_MICROSEC(endTime);
pgstat_report_waitevent(WAIT_EVENT_END);
if (actualBytes != (Size)nbytes) {
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not write to log file %s at offset %u, length %lu: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.openLogSegNo),
t_thrd.xlog_cxt.openLogOff, (unsigned long)nbytes, TRANSLATE_ERRNO)));
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->writeTimes;
g_instance.wal_cxt.xlogFlushStats->totalActualXlogSyncBytes += actualBytes;
g_instance.wal_cxt.xlogFlushStats->totalWriteTime += elapsedTime;
}
reportRedoWrite((PgStat_Counter)npages, elapsedTime);
t_thrd.xlog_cxt.openLogOff += nbytes;
npages = 0;
* If we just wrote the whole last page of a logfile segment,
* fsync the segment immediately. This avoids having to go back
* and re-open prior segments when an fsync request comes along
* later. Doing it here ensures that one and only one backend will
* perform this fsync.
*
* We also do this if this is the last page written for an xlog
* switch.
*
* This is also the right place to notify the Archiver that the
* segment is ready to copy to archival storage, and to update the
* timer for archive_timeout, and to signal for a checkpoint if
* too many logfile segments have been used since the last
* checkpoint.
*/
if (finishing_seg) {
instr_time startTime;
instr_time endTime;
uint64 elapsedTime;
INSTR_TIME_SET_CURRENT(startTime);
issue_xlog_fsync(t_thrd.xlog_cxt.openLogFile, t_thrd.xlog_cxt.openLogSegNo);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
elapsedTime = INSTR_TIME_GET_MICROSEC(endTime);
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->syncTimes;
g_instance.wal_cxt.xlogFlushStats->totalSyncTime += elapsedTime;
}
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write;
UpdateSSDoradoCtlInfoAndSync();
AddShareStorageXLopCopyBackendWakeupRequest();
if (XLogArchivingActive()) {
XLogArchiveNotifySeg(t_thrd.xlog_cxt.openLogSegNo);
}
t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL);
* Request a checkpoint if we've consumed too much xlog since
* the last one. For speed, we first check using the local
* copy of RedoRecPtr, which might be out of date; if it looks
* like a checkpoint is needed, forcibly update RedoRecPtr and
* recheck.
*/
if (IsUnderPostmaster && XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
XLogUpdateShareStatus();
}
}
if (ispartialpage) {
t_thrd.xlog_cxt.LogwrtResult->Write = WriteRqst.Write;
break;
}
curridx = NextBufIdx(curridx);
if (flexible && npages == 0) {
break;
}
}
Assert(npages == 0);
* Update shared-memory status
*
* We make sure that the shared 'request' values do not fall behind the
* 'result' values. This is not absolutely essential, but it saves some
* code in a couple of places.
*/
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
if (XLByteLT(xlogctl->LogwrtRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write)) {
xlogctl->LogwrtRqst.Write = t_thrd.xlog_cxt.LogwrtResult->Write;
}
if (XLByteLT(xlogctl->LogwrtRqst.Flush, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
xlogctl->LogwrtRqst.Flush = t_thrd.xlog_cxt.LogwrtResult->Flush;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = t_thrd.xlog_cxt.LogwrtResult->Flush;
}
SpinLockRelease(&xlogctl->info_lck);
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
g_instance.wal_cxt.xlogFlushStats->totalXlogSyncBytes += (t_thrd.xlog_cxt.LogwrtResult->Write - lastWrited);
g_instance.wal_cxt.xlogFlushStats->currOpenXlogSegNo = t_thrd.xlog_cxt.openLogSegNo;
}
}
* Write the log to uwal.
*/
static void XLogWriteUwal(XLogRecPtr UwalWriteRqst)
{
bool ispartialpage = false;
bool last_iteration = false;
bool finishing_seg = false;
bool use_existent = false;
bool uwal_write_limit = false;
int curridx = 0;
int npages = 0;
int startidx = 0;
uint32 startoffset = 0;
XLogRecPtr lastWrited = InvalidXLogRecPtr;
uint32 offset;
bool firstUwal = false;
uint64_t targetOffset = 0;
int uwalMaxPages = UWAL_OBJ_ALIGN_LEN / XLOG_BLCKSZ;
if (g_instance.attr.attr_storage.uwal_async_append_switch) {
uwalMaxPages = uwalMaxPages * 4;
}
Assert(t_thrd.int_cxt.CritSectionCount > 0);
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
lastWrited = t_thrd.xlog_cxt.LogwrtResult->Write;
npages = 0;
startidx = 0;
startoffset = 0;
offset = lastWrited % XLOG_BLCKSZ;
if (t_thrd.xlog_cxt.uwalInfo.info.dataSize == 0) {
if (0 != GsUwalQueryByUser(t_thrd.xlog_cxt.ThisTimeLineID, false)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("uwal query by user failed")));
}
}
if (t_thrd.xlog_cxt.uwalInfo.info.startTimeLine != t_thrd.xlog_cxt.ThisTimeLineID ||
t_thrd.xlog_cxt.uwalInfo.info.dataSize == 0) {
if (0 != GsUwalCreate(t_thrd.xlog_cxt.LogwrtResult->Write - offset)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not create uwal file")));
}
firstUwal = true;
}
curridx = XLogRecPtrToBufIdx(t_thrd.xlog_cxt.LogwrtResult->Write);
while (XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, UwalWriteRqst)) {
XLogRecPtr EndPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[curridx];
if (!XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, EndPtr))
ereport(PANIC,
(errmsg("xlog write request %X/%X is past end of log %X/%X",
(uint32)(t_thrd.xlog_cxt.LogwrtResult->Write >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Write, (uint32)(EndPtr >> 32), (uint32)EndPtr)));
t_thrd.xlog_cxt.LogwrtResult->Write = EndPtr;
ispartialpage = XLByteLT(UwalWriteRqst, t_thrd.xlog_cxt.LogwrtResult->Write);
if (!XLByteInPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo)) {
* Switch to new logfile segment. We cannot have any pending
* pages here (since we dump what we have at segment end).
*/
Assert(npages == 0);
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
use_existent = true;
t_thrd.xlog_cxt.openLogFile = XLogFileInit(t_thrd.xlog_cxt.openLogSegNo, &use_existent, true);
t_thrd.xlog_cxt.openLogOff = 0;
* Unlock WalAuxiliary thread to init new xlog segment if we are running out
* of xlog segments.
*/
if (!use_existent) {
g_instance.wal_cxt.globalEndPosSegNo = t_thrd.xlog_cxt.openLogSegNo;
WakeupWalSemaphore(&g_instance.wal_cxt.walInitSegLock->l.sem);
}
}
if (t_thrd.xlog_cxt.openLogFile <= 0) {
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
}
if (npages == 0) {
startidx = curridx;
startoffset = (t_thrd.xlog_cxt.LogwrtResult->Write - XLOG_BLCKSZ) % XLogSegSize;
}
npages++;
* Dump the set if this will be the last loop iteration, or if we are
* at the last page of the cache area (since the next page won't be
* contiguous in memory), or if we are at the end of the logfile
* segment.
*/
last_iteration = !XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, UwalWriteRqst);
finishing_seg = !ispartialpage && (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
uwal_write_limit = (npages >= uwalMaxPages);
int ret = 0;
if (last_iteration || curridx == t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck || finishing_seg ||
uwal_write_limit) {
instr_time startTime;
instr_time endTime;
PgStat_Counter elapsedTime;
Size nbytes;
char *from = NULL;
Size WriteTotal;
if (last_iteration && ispartialpage) {
WriteTotal = (npages - 1) * XLOG_BLCKSZ + UwalWriteRqst % XLOG_BLCKSZ;
} else {
WriteTotal = npages * XLOG_BLCKSZ;
}
if (firstUwal) {
from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + startidx * (Size)XLOG_BLCKSZ;
nbytes = WriteTotal;
targetOffset = lastWrited - offset;
} else {
from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + startidx * (Size)XLOG_BLCKSZ + offset;
nbytes = WriteTotal - offset;
targetOffset = lastWrited;
}
pgstat_report_waitevent(WAIT_EVENT_WAL_WRITE);
INSTR_TIME_SET_CURRENT(startTime);
ret = GsUwalWrite(nbytes, from, targetOffset);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
* elapsedTime can be replaced by beentry->waitInfo.event_info.duration.
*/
elapsedTime = (PgStat_Counter)INSTR_TIME_GET_MICROSEC(endTime);
pgstat_report_waitevent(WAIT_EVENT_END);
if (0 != ret) {
ereport(PANIC, (errcode_for_file_access(), errmsg("uwal write failed, retCode = %d.", ret)));
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->writeTimes;
g_instance.wal_cxt.xlogFlushStats->totalActualXlogSyncBytes += nbytes;
g_instance.wal_cxt.xlogFlushStats->totalWriteTime += elapsedTime;
++g_instance.wal_cxt.xlogFlushStats->syncTimes;
g_instance.wal_cxt.xlogFlushStats->totalSyncTime += elapsedTime;
}
reportRedoWrite((PgStat_Counter)npages, elapsedTime);
lastWrited += WriteTotal - offset;
if (ret == 0) {
GsUwalUpdateSenderSyncLsn(lastWrited);
GsUwalRcvStateUpdate(lastWrited);
}
npages = 0;
offset = lastWrited % XLOG_BLCKSZ;
if (finishing_seg) {
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write;
t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL);
* Request a checkpoint if we've consumed too much xlog since
* the last one. For speed, we first check using the local
* copy of RedoRecPtr, which might be out of date; if it looks
* like a checkpoint is needed, forcibly update RedoRecPtr and
* recheck.
*/
if (IsUnderPostmaster && XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
}
if (ispartialpage) {
t_thrd.xlog_cxt.LogwrtResult->Write = UwalWriteRqst;
break;
}
curridx = NextBufIdx(curridx);
}
Assert(npages == 0);
if (XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Flush, UwalWriteRqst) &&
XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Flush, t_thrd.xlog_cxt.LogwrtResult->Write)) {
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write;
}
* Update shared-memory status
*
* We make sure that the shared 'request' values do not fall behind the
* 'result' values. This is not absolutely essential, but it saves some
* code in a couple of places.
*/
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
if (XLByteLT(xlogctl->LogwrtRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write)) {
xlogctl->LogwrtRqst.Write = t_thrd.xlog_cxt.LogwrtResult->Write;
}
if (XLByteLT(xlogctl->LogwrtRqst.Flush, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
xlogctl->LogwrtRqst.Flush = t_thrd.xlog_cxt.LogwrtResult->Flush;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = t_thrd.xlog_cxt.LogwrtResult->Flush;
}
SpinLockRelease(&xlogctl->info_lck);
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
g_instance.wal_cxt.xlogFlushStats->totalXlogSyncBytes += (t_thrd.xlog_cxt.LogwrtResult->Write - lastWrited);
g_instance.wal_cxt.xlogFlushStats->currOpenXlogSegNo = t_thrd.xlog_cxt.openLogSegNo;
}
}
* Record the LSN for an asynchronous transaction commit/abort
* and nudge the WALWriter if there is work for it to do.
* (This should not be called for synchronous commits.)
*/
void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
XLogRecPtr WriteRqstPtr = asyncXactLSN;
bool sleeping = false;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtResult = xlogctl->LogwrtResult;
sleeping = xlogctl->WalWriterSleeping;
if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN)) {
xlogctl->asyncXactLSN = asyncXactLSN;
}
SpinLockRelease(&xlogctl->info_lck);
* If the WALWriter is sleeping, we should kick it to make it come out of
* low-power mode. Otherwise, determine whether there's a full page of
* WAL available to write.
*/
if (!sleeping) {
WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
if (XLByteLE(WriteRqstPtr, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
return;
}
}
* Nudge the WALWriter: it has a full page of WAL to write, or we want it
* to come out of low-power mode so that this async commit will reach disk
* within the expected amount of time.
*/
if (g_instance.proc_base->walwriterLatch) {
SetLatch(g_instance.proc_base->walwriterLatch);
}
}
* Record the LSN up to which we can remove WAL because it's not required by
* any replication slot.
*/
void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replicationSlotMinLSN = lsn;
SpinLockRelease(&xlogctl->info_lck);
if (!ENABLE_DMS && XLByteEQ(lsn, InvalidXLogRecPtr) && !IsInitdb && !RecoveryInProgress()
&& !AM_WAL_SENDER && !AM_WAL_DB_SENDER &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(WARNING, (errmsg("replicationSlotMinLSN is InvalidXLogRecPtr!!!")));
}
}
* Return the oldest LSN we must retain to satisfy the needs of some
* replication slot.
*/
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr retval;
SpinLockAcquire(&xlogctl->info_lck);
retval = xlogctl->replicationSlotMinLSN;
SpinLockRelease(&xlogctl->info_lck);
return retval;
}
* To facilitate calling static function 'XLogGetReplicationSlotMinimumLSN()' in other files
*/
XLogRecPtr XLogGetReplicationSlotMinimumLSNByOther(void)
{
XLogRecPtr minlsn;
minlsn = XLogGetReplicationSlotMinimumLSN();
return minlsn;
}
* Record the LSN up to which we can choose it as a startpoint to dummy standby.
*/
void XLogSetReplicationSlotMaximumLSN(XLogRecPtr lsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replicationSlotMaxLSN = lsn;
SpinLockRelease(&xlogctl->info_lck);
if (!ENABLE_DMS && XLByteEQ(lsn, InvalidXLogRecPtr) && !IsInitdb && !RecoveryInProgress()
&& !AM_WAL_SENDER && !AM_WAL_DB_SENDER &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(WARNING, (errmsg("replicationSlotMaxLSN is InvalidXLogRecPtr!!!")));
}
}
* Return the latest LSN we must retain to satisfy the needs of some
* replication slot.
*/
XLogRecPtr XLogGetReplicationSlotMaximumLSN(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr retval;
SpinLockAcquire(&xlogctl->info_lck);
retval = xlogctl->replicationSlotMaxLSN;
SpinLockRelease(&xlogctl->info_lck);
return retval;
}
* Reset replication slot before promote to primary.
* We should free the slotname in caller.
*/
static void ResetSlotLSNEndRecovery(StringInfo slotname)
{
int i = 0;
bool resetslot = false;
errno_t rc = 0;
ReplicationSlot *slot = NULL;
Assert(t_thrd.slot_cxt.ReplicationSlotCtl != NULL);
Assert(slotname != NULL);
if (slotname->len <= 0) {
return;
}
LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
for (i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) {
ReplicationSlot *s = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[i];
if (s->in_use) {
SpinLockAcquire(&s->mutex);
if (s->data.isDummyStandby) {
s->data.restart_lsn = 0;
} else if (strncmp(NameStr(s->data.name), slotname->data, strlen(slotname->data)) == 0) {
s->data.restart_lsn = latestValidRecord;
resetslot = true;
}
s->just_dirtied = true;
s->dirty = true;
SpinLockRelease(&s->mutex);
}
}
* If we can not find slot for PGXCNodeName, we will create it.
* Explame for the scene: standby promotes(failover or switchover) to
* primary firstly, pg_replslot is empty.
*/
if (!resetslot) {
for (i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) {
ReplicationSlot *s = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[i];
SpinLockAcquire(&s->mutex);
if (!s->in_use) {
rc = strncpy_s(NameStr(s->data.name), NAMEDATALEN, slotname->data, NAMEDATALEN - 1);
securec_check(rc, "\0", "\0");
NameStr(s->data.name)[NAMEDATALEN - 1] = '\0';
s->data.restart_lsn = latestValidRecord;
SET_SLOT_PERSISTENCY(s->data, RS_PERSISTENT);
s->data.database = InvalidOid;
s->in_use = true;
s->just_dirtied = true;
s->dirty = true;
SpinLockRelease(&s->mutex);
slot = s;
resetslot = true;
break;
}
SpinLockRelease(&s->mutex);
}
}
LWLockRelease(ReplicationSlotControlLock);
if (!resetslot) {
ereport(WARNING,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
errmsg("could not reset replication slot at the end of recovery"),
errhint("Delete replication slot or increase g_instance.attr.attr_storage.max_replication_slots.")));
}
if (slot != NULL) {
CreateSlotOnDisk(slot);
}
CheckPointReplicationSlots();
ReplicationSlotsComputeRequiredLSN(NULL);
}
/*
 * Detect a change of enable_update_max_page_flush_lsn since the last call.
 * On a change, log the given lsn together with the local and control-file
 * minRecoveryPoint at WARNING level, then remember the new setting.
 */
inline void XlogUpMinRecovPointSwitchChk(XLogRecPtr lsn)
{
    /* Setting unchanged: nothing to report. */
    if (g_instance.comm_cxt.predo_cxt.pre_enable_switch ==
        g_instance.attr.attr_storage.enable_update_max_page_flush_lsn) {
        return;
    }

    ereport(WARNING, (errmsg("XlogUpMinRecovPointSwitchChk lsn %X/%X, local minRecoveryPoint %X/%X, "
                             "ControlFile minRecoveryPoint %X/%X, updateMinRecoveryPointSwitch:%u change to :%u",
                             (uint32)(lsn >> 32), (uint32)lsn,
                             (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
                             (uint32)t_thrd.xlog_cxt.minRecoveryPoint,
                             (uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint >> 32),
                             (uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint),
                             g_instance.comm_cxt.predo_cxt.pre_enable_switch,
                             g_instance.attr.attr_storage.enable_update_max_page_flush_lsn)));

    /* Remember the setting we have just reported. */
    g_instance.comm_cxt.predo_cxt.pre_enable_switch = g_instance.attr.attr_storage.enable_update_max_page_flush_lsn;
}
* Advance minRecoveryPoint in control file.
*
* If we crash during recovery, we must reach this point again before the
* database is consistent.
*
* If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
* is only updated if it's not already greater than or equal to 'lsn'.
*/
void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
if (!t_thrd.xlog_cxt.updateMinRecoveryPoint || (!force && XLByteLE(lsn, t_thrd.xlog_cxt.minRecoveryPoint))) {
return;
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
* An invalid minRecoveryPoint means that we need to recover all the WAL,
* i.e., we're doing crash recovery. We never modify the control file's
* value in that case, so we can short-circuit future checks here too.
*/
if (t_thrd.xlog_cxt.minRecoveryPoint == 0) {
t_thrd.xlog_cxt.updateMinRecoveryPoint = false;
} else if (force || XLByteLT(t_thrd.xlog_cxt.minRecoveryPoint, lsn)) {
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, lsn)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lsn;
UpdateControlFile();
t_thrd.xlog_cxt.minRecoveryPoint = lsn;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
ereport(DEBUG1,
(errmsg("updated min recovery point to %X/%X", (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
(uint32)t_thrd.xlog_cxt.minRecoveryPoint)));
}
}
LWLockRelease(ControlFileLock);
}
/*
 * Block until the global flush result has reached 'recptr'.
 *
 * While the WAL writer is not up the caller flushes by itself through
 * XLogSelfFlush(); otherwise it waits on the WAL flush wait lock/semaphore
 * and re-reads the flush result after every round.
 */
void XLogWaitFlush(XLogRecPtr recptr)
{
    /*
     * During REDO, we are reading not writing WAL.
     *
     * XLogInsertAllowed() is tested instead of RecoveryInProgress(): it allows
     * WAL writing at the end of recovery by means of a thread local variable,
     * and falls back to RecoveryInProgress() when that variable is not set.
     * The checkpointer acts this way as well - when it writes the
     * end-of-recovery checkpoint, this function should indeed flush.
     *
     * If inserting is not allowed for this thread, only minRecoveryPoint is
     * updated (when hot standby is supported) and we return.
     */
    if (!XLogInsertAllowed()) {
        if (g_supportHotStandby) {
            UpdateMinRecoveryPoint(recptr, false);
        }
        return;
    }

    /* A shared-storage disaster standby cluster that is not promoting does not wait. */
    if (SS_DISASTER_STANDBY_CLUSTER && !g_instance.dms_cxt.SSRecoveryInfo.disaster_cluster_promoting) {
        ereport(DEBUG1, (errmsg("[SS_DISASTER_CLUSTER] SS standby cluster needless wait xlog flush")));
        return;
    }

    /* Compare-and-swap of 0 with 0 is used to obtain the current flush result. */
    volatile XLogRecPtr flushedUpTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.flushResult, 0, 0);

    for (; XLByteLT(flushedUpTo, recptr);
         flushedUpTo = pg_atomic_barrier_read_u64(&g_instance.wal_cxt.flushResult)) {
        if (!g_instance.wal_cxt.isWalWriterUp) {
            XLogSelfFlush();
        } else if (LWLockAcquireOrWait(g_instance.wal_cxt.walFlushWaitLock->l.lock, LW_EXCLUSIVE)) {
            PGSemaphoreLock(&g_instance.wal_cxt.walFlushWaitLock->l.sem, true);
            LWLockRelease(g_instance.wal_cxt.walFlushWaitLock->l.lock);
        }
    }
}
/*
 * Block until the global 'sentResult' position has reached 'recptr'.
 *
 * While the WAL writer is not up the caller flushes by itself through
 * XLogSelfFlush(); otherwise it waits on the WAL buffer-init wait
 * lock/semaphore and re-reads sentResult after every round.
 */
void XLogWaitBufferInit(XLogRecPtr recptr)
{
    /* Compare-and-swap of 0 with 0 is used to obtain the current value. */
    volatile XLogRecPtr initializedUpTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.sentResult, 0, 0);

    for (; XLByteLT(initializedUpTo, recptr);
         initializedUpTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.sentResult, 0, 0)) {
        if (!g_instance.wal_cxt.isWalWriterUp) {
            XLogSelfFlush();
        } else if (LWLockAcquireOrWait(g_instance.wal_cxt.walBufferInitWaitLock->l.lock, LW_EXCLUSIVE)) {
            PGSemaphoreLock(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem, true);
            LWLockRelease(g_instance.wal_cxt.walBufferInitWaitLock->l.lock);
        }
    }
}
/*
 * Run XLogWrite() inside a critical section, requesting both write and flush
 * up to 'writeRqstPtr', with the 'flexible' argument set to false.
 */
static void XLogFlushCore(XLogRecPtr writeRqstPtr)
{
    XLogwrtRqst request;

    START_CRIT_SECTION();
    request.Write = writeRqstPtr;
    request.Flush = writeRqstPtr;
    XLogWrite(request, false);
    END_CRIT_SECTION();
}
void XLogDoFlush()
{
if (!XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Flush, t_thrd.xlog_cxt.LogwrtResult->Write)) {
return;
}
* Could get here without iterating above loop, in which case we might
* have no open file or the wrong one. However, we do not need to
* fsync more than one file.
*/
if (u_sess->attr.attr_storage.sync_method != SYNC_METHOD_OPEN &&
u_sess->attr.attr_storage.sync_method != SYNC_METHOD_OPEN_DSYNC) {
if (t_thrd.xlog_cxt.openLogFile >= 0 &&
!XLByteInPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo)) {
XLogFileClose();
}
if (t_thrd.xlog_cxt.openLogFile < 0) {
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
}
instr_time startTime;
instr_time endTime;
uint64 elapsedTime;
INSTR_TIME_SET_CURRENT(startTime);
issue_xlog_fsync(t_thrd.xlog_cxt.openLogFile, t_thrd.xlog_cxt.openLogSegNo);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
elapsedTime = INSTR_TIME_GET_MICROSEC(endTime);
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->syncTimes;
g_instance.wal_cxt.xlogFlushStats->totalSyncTime += elapsedTime;
}
}
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write;
UpdateSSDoradoCtlInfoAndSync();
AddShareStorageXLopCopyBackendWakeupRequest();
XLogUpdateShareStatus();
}
* Flush xlog, but without specifying exactly where to flush to.
*
* We normally flush only completed blocks; but if there is nothing to do on
* that basis, we check for unflushed async commits in the current incomplete
* block, and flush through the latest one of those. Thus, if async commits
* are not being used, we will flush complete blocks only. We can guarantee
* that async commits reach disk after at most three cycles; normally only
* one or two. (When flushing complete blocks, we allow XLogWrite to write
* "flexibly", meaning it can stop at the end of the buffer ring; this makes a
* difference only with very high load or long wal_writer_delay, but imposes
* one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
* Returns TRUE if we flushed anything.
*/
bool XLogBackgroundFlush(bool fsync)
{
if (fsync) {
XLogDoFlush();
WakeupWalSemaphore(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
return false;
}
XLogRecPtr WriteRqstPtr = InvalidXLogRecPtr;
XLogRecPtr InitializeRqstPtr = InvalidXLogRecPtr;
int start_entry_idx, curr_entry_idx, next_entry_idx, entry_idx;
volatile WALInsertStatusEntry *start_entry_ptr = NULL;
volatile WALInsertStatusEntry *curr_entry_ptr = NULL;
volatile WALInsertStatusEntry *next_entry_ptr = NULL;
#ifndef ENABLE_MULTIPLE_NODES
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr consensus_lsn = InvalidXLogRecPtr;
int paxos_entry_idx = -1;
#endif
volatile WALInsertStatusEntry *status_entry_ptr = NULL;
bool need_flush = false;
if (fsync) {
XLogDoFlush();
WakeupWalSemaphore(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
return false;
}
* Entry #: g_instance.wal_cxt.walInsertStatusTable
* +---------------+--------+-------+
* 0: | | | |
* +---------------+--------+-------+
* ......
* +---------------+--------+-------+
* *lastWalStatusEntry+1: | | |COPIED | <--- while loop start here till hit a hole or
* +---------------+--------+-------+ and older LRC
* ...... COPIED
* +---------------+--------+-------+
* prev_entry: | endLSN(N-1) |LRC(N-1)|COPIED | <== curr_entry_ptr <--- flush till here
* +---------------+--------+-------+
* curr_entry_idx: | endLSN(N) |LRC(N) |NOT_CP | <== next_entry_ptr
* +---------------+--------+-------+
* next_entry_idx: | endLSN(N+1) |LRC(N+1)|COPIED |
* +---------------+--------+-------+
* ......
* +---------------+--------+-------+
* WAL_INSERT_STATUS_ENTRIES:| | | |
* - 1 +---------------+--------+-------+
*
*/
start_entry_idx = GET_NEXT_STATUS_ENTRY(g_instance.wal_cxt.lastWalStatusEntryFlushed);
start_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(start_entry_idx)];
pg_read_barrier();
* Bail out if the very first entry is not copied.
*/
if (start_entry_ptr->endLSN == 0) {
return false;
}
if ((start_entry_ptr->endLSN & ~MAX_XLOG_REC_PTR) > 0) {
need_flush = true;
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLE((XLogRecPtr)start_entry_ptr->endLSN, consensus_lsn))
paxos_entry_idx = start_entry_idx;
}
#endif
next_entry_idx = start_entry_idx;
next_entry_ptr = start_entry_ptr;
uint64 stTime = GetCurrentTimestamp();
uint64 startLSN = (start_entry_ptr->endLSN & MAX_XLOG_REC_PTR);
uint64 totalXlogIterBytes = g_instance.wal_cxt.totalXlogIterBytes;
uint64 totalXlogIterTimes = g_instance.wal_cxt.totalXlogIterTimes;
uint64 averageXlogFlushBytes = (totalXlogIterTimes == 0) ? 0 : totalXlogIterBytes / totalXlogIterTimes;
uint64 curAverageXlogFlushBytes = (averageXlogFlushBytes == 0) ? XLOG_FLUSH_SIZE_INIT :
(averageXlogFlushBytes / PAGE_SIZE_BYTES + 1) * PAGE_SIZE_BYTES;
do {
curr_entry_ptr = next_entry_ptr;
curr_entry_idx = next_entry_idx;
next_entry_idx = GET_NEXT_STATUS_ENTRY(curr_entry_idx);
next_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(next_entry_idx)];
Assert(curr_entry_ptr->endLSN != 0);
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLE((XLogRecPtr)curr_entry_ptr->endLSN, consensus_lsn))
paxos_entry_idx = curr_entry_idx;
}
#endif
if (((curr_entry_ptr->LRC + 1) & 0x7FFFFFFF) != next_entry_ptr->LRC) {
if (!need_flush && (curr_entry_ptr->endLSN & ~MAX_XLOG_REC_PTR) > 0) {
need_flush = true;
}
break;
}
* Flush if accumulate enough bytes or till the LSN in the entry before
* an entry associated with the first uncopied record found in the current loop.
*/
if (next_entry_ptr->endLSN == 0) {
if (u_sess->attr.attr_common.lc_xlog_flush_opt) {
break;
}
if (((curr_entry_ptr->endLSN - startLSN) > curAverageXlogFlushBytes) ||
(GetCurrentTimestamp() - stTime >= (uint64)g_instance.attr.attr_storage.wal_flush_timeout)) {
break;
}
pg_usleep(g_instance.attr.attr_storage.wal_flush_delay);
next_entry_idx = curr_entry_idx;
next_entry_ptr = curr_entry_ptr;
}
if (!need_flush && (next_entry_ptr->endLSN & ~MAX_XLOG_REC_PTR) > 0) {
need_flush = true;
}
} while (true);
g_instance.wal_cxt.lastLRCScanned = curr_entry_ptr->LRC;
* During REDO, we are reading not writing WAL.
*
* Test XLogInsertAllowed(), not RecoveryInProgress. XLogInsertAllowed()
* allows WAL writing at the end of recovery by setting a thread local variable
* if this thread local variable is not set, it returns RecoveryInProgress()
*
* The checkpointer acts this way as well - when it tries to write
* end-of-recovery checkpoint, this function should indeed flush.
*/
if (!XLogInsertAllowed()) {
return false;
}
if (SS_DORADO_STANDBY_CLUSTER) {
return false;
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
WriteRqstPtr = (XLogRecPtr)pg_atomic_barrier_read_u64(&curr_entry_ptr->endLSN);
LWLockAcquire(WALWritePaxosLock, LW_EXCLUSIVE);
XLogWritePaxos(WriteRqstPtr);
LWLockRelease(WALWritePaxosLock);
if (paxos_entry_idx == -1) {
return false;
} else {
curr_entry_idx = paxos_entry_idx;
curr_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(paxos_entry_idx)];
}
}
#endif
WriteRqstPtr = ((XLogRecPtr)pg_atomic_barrier_read_u64(&curr_entry_ptr->endLSN) & MAX_XLOG_REC_PTR);
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
ereport(LOG, (errmsg("xlog bg flush request %X/%X; write %X/%X; flush %X/%X", (uint32)(WriteRqstPtr >> 32),
(uint32)WriteRqstPtr, (uint32)(t_thrd.xlog_cxt.LogwrtResult->Write >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Write,
(uint32)(t_thrd.xlog_cxt.LogwrtResult->Flush >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Flush)));
}
#endif
if (g_instance.attr.attr_storage.enable_uwal)
XLogFlushUwal(WriteRqstPtr);
else
XLogFlushCore(WriteRqstPtr);
#ifndef ENABLE_MULTIPLE_NODES
#ifdef USE_ASSERT_CHECKING
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
Assert(XLByteLE(WriteRqstPtr, consensus_lsn));
}
#endif
#endif
g_instance.wal_cxt.lastWalStatusEntryFlushed = curr_entry_idx;
g_instance.wal_cxt.lastLRCWrited = curr_entry_ptr->LRC;
if (ENABLE_DMS || !g_instance.attr.attr_common.xlog_write_flush_split || need_flush) {
XLogDoFlush();
WakeupWalSemaphore(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
}
g_instance.wal_cxt.totalXlogIterTimes++;
g_instance.wal_cxt.totalXlogIterBytes += (curr_entry_ptr->endLSN & MAX_XLOG_REC_PTR - startLSN);
* Update LRC and status in the status entry that is already flushed (from start to current entry).
* All WAL insert threads need to examine both of the LRC and the status
* fiels before copying its endLSN and set status to WAL_COPIED in this
* entry. This is to ensure it truly owns this entry, i.e., the writer thread
* finishes updating the entry.
*/
entry_idx = start_entry_idx - 1;
do {
* Increment entry_idx by 1 and wrap the result around WAL_INSERT_STATUS_ENTRIES.
*/
entry_idx = GET_NEXT_STATUS_ENTRY(entry_idx);
status_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(entry_idx)];
status_entry_ptr->LRC = (status_entry_ptr->LRC + GET_WAL_INSERT_STATUS_ENTRY_CNT()) & 0x7FFFFFFF;
status_entry_ptr->endLSN = 0;
} while (entry_idx != curr_entry_idx);
pg_write_barrier();
InitializeRqstPtr = WriteRqstPtr - WriteRqstPtr % XLOG_BLCKSZ;
if (InitializeRqstPtr != InvalidXLogRecPtr && XLByteLT(g_instance.wal_cxt.sentResult, InitializeRqstPtr)) {
AdvanceXLInsertBuffer<false>(InitializeRqstPtr, false);
(void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.sentResult, InitializeRqstPtr);
WakeupWalSemaphore(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
}
return true;
}
* This function calls XLogBackgroundFlush ONCE when no one else is
* doing it
* The caller of this function should loop on this function to make
* sure the LSN the caller waiting on is indeed flushed
*/
static void XLogSelfFlush(void)
{
* The current callers of this function all check whether
* the WAL writer is present, but as a generic function,
* it should also does the check.
*/
if (g_instance.wal_cxt.isWalWriterUp) {
return;
}
* We only try self-flush when nobody else is already doing it
*/
if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) {
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
XLogBackgroundFlush();
XLogBackgroundFlush(true);
LWLockRelease(WALWriteLock);
}
return;
}
* This function is called by CopyXLogRecordToWAL or CopyXLogRecordToWALForGroup
* when large records ( > wal buffer size ) are copied to WAL.
*
* During copying such a large record, we will wrap around the wal buffer and
* access the page the large record starts from at least twice. When an agent
* hits the start page again, the agent needs to initiate a full wal buffer
* flush or it will get stuck - it has no more buffer page to use but
* the wal writer cannot flush for this agent because the wal writer should only
* flush when a status entry is updated but this agent cannot update its status
* entry before it finishes copying its record.
*
* This function should not change the wal copy status entries.
*
* numHitsOnStartPage - the 2nd hit on the start page needs to wait for all
* potential concurrent flushes to finish while the
* subsequent hits on the start page don't
* CurrPos - The requested LSN
* currLRC - need this to calculate the previous LRC for use to wait for
*/
static void XLogSelfFlushWithoutStatus(int numHitsOnStartPage, XLogRecPtr currPos, int currLRC)
{
* I only needs to wait for the wal writer or other agents'
* flushes (when wal writer is absent) to finish when I hit
* the start page the 2nd time within one WAL copy because
* after that the wal flush is blocked by me
*/
if (numHitsOnStartPage == 2) {
int prevLRC = (currLRC - 1) & 0x7FFFFFFF;
while (g_instance.wal_cxt.lastLRCFlushed != prevLRC) {
if (!g_instance.wal_cxt.isWalWriterUp) {
XLogSelfFlush();
}
pg_usleep(1);
}
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr consensus_lsn = InvalidXLogRecPtr;
LWLockAcquire(WALWritePaxosLock, LW_EXCLUSIVE);
XLogWritePaxos(currPos);
LWLockRelease(WALWritePaxosLock);
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
while (XLByteLT(consensus_lsn, currPos)) {
pg_usleep(1);
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
}
}
#endif
* To this point, we are the one holding up the wal writer or
* anyone that does the flush but we will still get the write
* lock to be safe since acquiring a lock without contention
* doesn't hurt performance here
*/
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
if (g_instance.attr.attr_storage.enable_uwal)
XLogFlushUwal(currPos);
else {
XLogFlushCore(currPos);
XLogDoFlush();
}
LWLockRelease(WALWriteLock);
XLogRecPtr initializeRqstPtr = currPos - currPos % XLOG_BLCKSZ;
AdvanceXLInsertBuffer<false>(initializeRqstPtr, false);
(void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.sentResult, initializeRqstPtr);
WakeupWalSemaphore(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
}
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
* is already in process of flushing that far, however.)
*/
bool XLogNeedsFlush(XLogRecPtr record)
{
    if (!RecoveryInProgress()) {
        /* Quick exit if the thread-local copy already says it is flushed. */
        if (XLByteLE(record, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
            return false;
        }

        /* Refresh the thread-local copy from shared memory and re-check. */
        XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

        SpinLockAcquire(&ctl->info_lck);
        *t_thrd.xlog_cxt.LogwrtResult = ctl->LogwrtResult;
        SpinLockRelease(&ctl->info_lck);

        return !XLByteLE(record, t_thrd.xlog_cxt.LogwrtResult->Flush);
    }

    /*
     * During recovery, we don't flush WAL but update minRecoveryPoint
     * instead. So "needs flush" is taken to mean whether minRecoveryPoint
     * would need to be updated.
     */
    if (XLByteLE(record, t_thrd.xlog_cxt.minRecoveryPoint) || !t_thrd.xlog_cxt.updateMinRecoveryPoint) {
        return false;
    }

    /*
     * Update the local copy of minRecoveryPoint. But if the lock is busy,
     * just return a conservative guess.
     */
    if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) {
        return true;
    }
    t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
    SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
    LWLockRelease(ControlFileLock);

    /*
     * An invalid minRecoveryPoint means that we need to recover all the
     * WAL, i.e., we're doing crash recovery. We never modify the control
     * file's value in that case, so we can short-circuit future checks
     * here too.
     */
    if (t_thrd.xlog_cxt.minRecoveryPoint == 0) {
        t_thrd.xlog_cxt.updateMinRecoveryPoint = false;
    }

    const bool nothingToDo =
        XLByteLE(record, t_thrd.xlog_cxt.minRecoveryPoint) || !t_thrd.xlog_cxt.updateMinRecoveryPoint;
    return !nothingToDo;
}
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
 * logsegno: identifies the segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
 * pre-existing file will be deleted). On return, TRUE if a pre-existing
 * file was used.
 *
 * use_lock: if TRUE, acquire ControlFileLock while moving the file into
 * place. This should be TRUE except during bootstrap log creation. The
 * caller must *not* hold the lock at call.
 *
 * xlog_dir: directory in which the segment and its temporary file are
 * created.
 *
 * Returns FD of opened file.
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure). They will promote to PANIC if we are
 * in a critical section.
 */
static int XLogFileInitInternal(XLogSegNo logsegno, bool *use_existent, bool use_lock, const char *xlog_dir)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
XLogSegNo installed_segno;
int max_advance;
int fd;
int nbytes;
errno_t rc = EOK;
gstrace_entry(GS_TRC_ID_XLogFileInit);
rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((logsegno) / XLogSegmentsPerXLogId), (uint32)((logsegno) % XLogSegmentsPerXLogId));
securec_check_ss(rc, "", "");
if (*use_existent) {
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
if (!FILE_POSSIBLY_DELETED(errno)) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %s", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, logsegno),
TRANSLATE_ERRNO)));
}
} else {
gstrace_exit(GS_TRC_ID_XLogFileInit);
return fd;
}
}
if (SS_DORADO_STANDBY_CLUSTER || SS_STANDBY_MODE) {
ereport(FATAL, (errmsg("[SS] Standby node cannot init xlog files")));
}
* Initialize an empty (all zeroes) segment. NOTE: it is possible that
* another process is doing the same thing. If so, we will end up
* pre-creating an extra log segment. That seems OK, and better than
* holding the lock throughout this lengthy process.
*/
ereport(DEBUG2, (errmsg("creating and filling new WAL file")));
rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", xlog_dir, gs_thread_self());
securec_check_ss(rc, "\0", "\0");
unlink(tmppath);
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not create file \"%s\": %s", tmppath, TRANSLATE_ERRNO)));
}
* Zero-fill the file. We have to do this the hard way to ensure that all
* the file space has really been allocated --- on platforms that allow
* "holes" in files, just seeking to the end doesn't allocate intermediate
* space. This way, we know that we have all the space and (after the
* fsync below) that all the indirect blocks are down on disk. Therefore,
* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
* log file.
*
* Note: ensure the buffer is reasonably well-aligned; this may save a few
* cycles transferring data to the kernel.
*/
if (is_dss_fd(fd)) {
char *ss_zero_buffer = NULL;
char *ss_zero_buffer_raw = NULL;
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_WRITE);
errno = 0;
if (ftruncate(fd, XLogSegSize) != 0) {
int save_errno = errno;
close(fd);
unlink(tmppath);
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %s",
tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (g_instance.wal_cxt.ssZeroBuffer != NULL) {
ss_zero_buffer = (char *)BUFFERALIGN(g_instance.wal_cxt.ssZeroBuffer);
} else {
ss_zero_buffer_raw = (char*)palloc0(SS_XLOG_WRITE_ZERO_STEP + ALIGNOF_BUFFER);
ss_zero_buffer = (char *)BUFFERALIGN(ss_zero_buffer_raw);
}
for (nbytes = 0; (uint32)nbytes < XLogSegSize; nbytes += SS_XLOG_WRITE_ZERO_STEP) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_WRITE);
if ((int)write(fd, ss_zero_buffer, SS_XLOG_WRITE_ZERO_STEP) != (int)SS_XLOG_WRITE_ZERO_STEP) {
int save_errno = errno;
close(fd);
unlink(tmppath);
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %s",
tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
if (ss_zero_buffer_raw != NULL) {
ss_zero_buffer = NULL;
pfree(ss_zero_buffer_raw);
}
} else {
char zbuffer_raw[XLOG_BLCKSZ + ALIGNOF_BUFFER];
char *zbuffer = (char *)BUFFERALIGN(zbuffer_raw);
rc = memset_s(zbuffer, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(rc, "\0", "\0");
for (nbytes = 0; (uint32)nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_WRITE);
if ((int)write(fd, zbuffer, XLOG_BLCKSZ) != (int)XLOG_BLCKSZ) {
int save_errno = errno;
close(fd);
unlink(tmppath);
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %s",
tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
}
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0) {
int save_errno = errno;
close(fd);
errno = save_errno;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %s",
tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %s",
tmppath, TRANSLATE_ERRNO)));
}
* Now move the segment into place with its final name.
*
* If caller didn't want to use a pre-existing file, get rid of any
* pre-existing file. Otherwise, cope with possibility that someone else
* has created the file while we were filling ours: if so, use ours to
* pre-create a future log segment.
*/
installed_segno = logsegno;
max_advance = XLOGfileslop;
if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance,
use_lock, xlog_dir)) {
* No need for any more future segments, or InstallXLogFileSegment()
* failed to rename the file into place. If the rename failed, opening
* the file below will fail.
*/
unlink(tmppath);
}
*use_existent = false;
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %s", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, logsegno),
TRANSLATE_ERRNO)));
}
ereport(DEBUG2, (errmsg("done creating and filling new WAL file:%s", path)));
gstrace_exit(GS_TRC_ID_XLogFileInit);
return fd;
}
/*
 * Create or open WAL segment logsegno in SS_XLOGDIR.
 *
 * Thin wrapper around XLogFileInitInternal(); use_existent and use_lock are
 * passed through unchanged. Returns the FD of the opened file.
 */
int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
    const char *walDir = SS_XLOGDIR;

    return XLogFileInitInternal(logsegno, use_existent, use_lock, walDir);
}
void XLogFileCutPage(char *buffer, uint32 bufLen, uint32 cpyLen)
{
if (bufLen != 0) {
uint32 offsetLen = cpyLen % bufLen;
uint32 truncateLen = bufLen - offsetLen;
errno_t rc = EOK;
ereport(LOG, (errcode_for_file_access(),
errmsg("XLogFileCutPage begin, bufLen:%u, cpyLen:%u, offsetLen:%u, truncateLen:%u", bufLen,
cpyLen, offsetLen, truncateLen)));
rc = memset_s((buffer + offsetLen), truncateLen, 0, truncateLen);
securec_check(rc, "\0", "\0");
}
}
/*
 * Pre-create WAL segment files ahead of requestLsn.
 *
 * The last handled LSN and the next segment to create are kept in
 * function-local statics. If requestLsn is at or after the last handled LSN
 * but less than XLogSegSize beyond it, nothing is done and false is
 * returned. If requestLsn moved backwards, the state is reset (with a
 * WARNING). Otherwise every segment after the one computed by
 * XLByteToPrevSeg(requestLsn), up to advance_xlog_file_num segments beyond
 * it, is created or opened via XLogFileInit() and closed again; returns true.
 */
bool PreInitXlogFileForStandby(XLogRecPtr requestLsn)
{
    static XLogRecPtr lastWriteLsn = 0;
    static XLogSegNo nextSegNo = 0;

    /* Still within one segment of the last handled request: nothing to do. */
    if (requestLsn >= lastWriteLsn && requestLsn < lastWriteLsn + XLogSegSize) {
        return false;
    }

    if (requestLsn < lastWriteLsn) {
        ereport(WARNING, (errmsg("PreInitXlogFileForStandby need reset. requestLsn: %X/%X; lastWriteLsn: %X/%X",
                                 (uint32)(requestLsn >> 32), (uint32)requestLsn, (uint32)(lastWriteLsn >> 32),
                                 (uint32)lastWriteLsn)));
        nextSegNo = 0;
    }
    lastWriteLsn = requestLsn;

    XLogSegNo reqSegNo;
    XLByteToPrevSeg(requestLsn, reqSegNo);
    if (nextSegNo <= reqSegNo) {
        nextSegNo = reqSegNo + 1;
    }

    const XLogSegNo targetSegNo = reqSegNo + g_instance.attr.attr_storage.advance_xlog_file_num;
    while (nextSegNo <= targetSegNo) {
        bool reusedExisting = true;
        int segFd = XLogFileInit(nextSegNo, &reusedExisting, true);

        /* XLogFileInit clears the flag when it had to create the file. */
        if (!reusedExisting) {
            ereport(DEBUG1,
                    (errmsg("PreInitXlogFileForStandby new nextSegNo: %X/%X ThisTimeLineID: %u, RecoveryTargetTLI: %u",
                            (uint32)(nextSegNo >> 32), (uint32)nextSegNo, t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID,
                            t_thrd.shemem_ptr_cxt.XLogCtl->RecoveryTargetTLI)));
        }
        if (segFd >= 0) {
            close(segFd);
        }
        nextSegNo++;
    }

    return true;
}
void PreInitXlogFileForPrimary(int wal_file_init_num)
{
XLogSegNo startSegNo, nextSegNo, target;
int lf;
bool use_existent;
g_xlog_stat_shared->walAuxWakeNum++;
if (g_instance.wal_cxt.globalEndPosSegNo == InvalidXLogSegPtr || (ENABLE_DMS && !SS_NORMAL_PRIMARY)) {
return;
}
startSegNo = g_instance.wal_cxt.globalEndPosSegNo + 1;
target = startSegNo + wal_file_init_num - 1;
for (nextSegNo = startSegNo; nextSegNo <= target; nextSegNo++) {
use_existent = true;
lf = XLogFileInit(nextSegNo, &use_existent, true);
if (lf >= 0) {
close(lf);
}
}
}
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * path: name of the existing segment file to be copied (could be from
 * a different timeline).
 *
 * tmppath: name of the temporary file the copy is written to; any existing
 * file of that name is removed first.
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations. But we should be just as careful as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void XLogFileCopy(char *path, char *tmppath)
{
char buffer[XLOG_BLCKSZ];
int srcfd;
int fd;
uint32 nbytes;
errno_t rc = EOK;
Assert(path);
Assert(tmppath);
* Open the source file
*/
srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
if (srcfd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %s", path, TRANSLATE_ERRNO)));
}
* Copy into a temp file name.
*/
unlink(tmppath);
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
close(srcfd);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %s", tmppath, TRANSLATE_ERRNO)));
}
* Do the data copying.
*/
for (nbytes = 0; nbytes < (uint32)XLogSegSize; nbytes += sizeof(buffer)) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_READ);
if ((int)read(srcfd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
close(srcfd);
close(fd);
if (errno != 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %s", path, TRANSLATE_ERRNO)));
} else {
ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("not enough data in file \"%s\"", path)));
}
}
pgstat_report_waitevent(WAIT_EVENT_END);
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_WRITE);
if ((int)write(fd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
int save_errno = errno;
close(srcfd);
close(fd);
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %s", tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
rc = memset_s(buffer, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(rc, "\0", "\0");
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0) {
close(srcfd);
close(fd);
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(), errmsg("could not fsync file \"%s\": %s", tmppath, TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
close(srcfd);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %s", tmppath, TRANSLATE_ERRNO)));
}
close(srcfd);
}
/*
 * Overwrite an open WAL segment file from targetPageOff to the end of the
 * segment (XLogSegSize).
 *
 * path: file name, used only in error messages.
 * fd: open, writable descriptor of the segment. It is closed here on every
 *     error path; on success it is left open for the caller.
 * targetPageOff: byte offset in the file at which writing starts.
 * buffer: bufLen bytes written as the first chunk; it is zeroed after that
 *         first write, so all following chunks are zero-filled.
 * bufLen: chunk size in bytes.
 *
 * The file is fsync'ed once all chunks have been written.
 */
static void XLogFileTruncateInner(char *path, int fd, uint32 targetPageOff, char *buffer, int bufLen)
{
    uint32 nbytes;
    errno_t rc = EOK;
    uint32 startBytes = targetPageOff;

    if (lseek(fd, (off_t)targetPageOff, SEEK_SET) < 0) {
        /* Close like the other error paths do, without losing lseek's errno. */
        int save_errno = errno;

        close(fd);
        errno = save_errno;
        ereport(ERROR, (errcode_for_file_access(),
            errmsg("lseek2:could not seek in log file %s to offset %u: %s", path, targetPageOff, TRANSLATE_ERRNO)));
    }

    for (nbytes = startBytes; nbytes < (uint32)XLogSegSize; nbytes += bufLen) {
        errno = 0;
        pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_WRITE);
        if ((int)write(fd, buffer, bufLen) != bufLen) {
            int save_errno = errno;

            close(fd);
            /* if write didn't set errno, assume problem is no disk space */
            errno = save_errno ? save_errno : ENOSPC;
            ereport(ERROR, (errcode_for_file_access(),
                errmsg("could not write to file \"%s\": %s", path, TRANSLATE_ERRNO)));
        }
        pgstat_report_waitevent(WAIT_EVENT_END);

        /* Only the first chunk carries data; every later chunk is zeroes. */
        if (nbytes == startBytes) {
            rc = memset_s(buffer, (size_t)bufLen, 0, (size_t)bufLen);
            securec_check(rc, "\0", "\0");
        }
    }

    pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_SYNC);
    if (pg_fsync(fd) != 0) {
        int save_errno = errno;

        close(fd);
        errno = save_errno;
        ereport(data_sync_elevel(ERROR), (errcode_for_file_access(),
            errmsg("could not fsync file \"%s\": %s", path, TRANSLATE_ERRNO)));
    }
    pgstat_report_waitevent(WAIT_EVENT_END);
}
/*
 * Zero out the WAL segment file at path from RecPtr to the end of the
 * segment.
 *
 * The page containing RecPtr is read, its tail starting at RecPtr's offset
 * within the page is cleared by XLogFileCutPage(), and
 * XLogFileTruncateInner() then writes that page back followed by zero pages
 * up to XLogSegSize and fsyncs the file.
 */
static void XLogFileTruncate(char *path, XLogRecPtr RecPtr)
{
    char buffer[XLOG_BLCKSZ];
    int fd;
    XLogRecPtr targetPagePtr;
    uint32 targetPageOff;

    /* Start of the page containing RecPtr, and its offset in the segment. */
    targetPagePtr = RecPtr - RecPtr % XLOG_BLCKSZ;
    targetPageOff = (targetPagePtr % XLogSegSize);

    fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
    if (fd < 0) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %s", path, TRANSLATE_ERRNO)));
    }

    if (lseek(fd, (off_t)targetPageOff, SEEK_SET) < 0) {
        /* Do not leak the descriptor, and keep lseek's errno for the report. */
        int save_errno = errno;

        close(fd);
        errno = save_errno;
        ereport(ERROR, (errcode_for_file_access(),
            errmsg("could not seek in log file %s to offset %u: %s", path, targetPageOff, TRANSLATE_ERRNO)));
    }

    /* Clear errno so a short read is not reported with a stale error code. */
    errno = 0;
    if ((int)read(fd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
        int save_errno = errno;

        close(fd);
        errno = save_errno;
        if (save_errno != 0) {
            ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %s", path, TRANSLATE_ERRNO)));
        } else {
            ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("not enough data in file \"%s\"", path)));
        }
    }

    XLogFileCutPage(buffer, XLOG_BLCKSZ, RecPtr % XLogSegSize);

    /* Closes fd itself if it raises an error; on success we close it below. */
    XLogFileTruncateInner(path, fd, targetPageOff, buffer, (int)(sizeof(buffer)));

    close(fd);
}
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * *segno: identify segment to install as (or first possible target).
 * When find_free is TRUE, this is modified on return to indicate the
 * actual installation location or last segment searched.
 *
 * tmppath: initial name of file to install. It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty segno
 * number at or after the passed numbers. If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * *max_advance: maximum number of segno slots to advance past the starting
 * point. Fail if no free slot is found in this range. On return, reduced
 * by the number of slots skipped over. (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
 *
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 * place. This should be TRUE except during bootstrap log creation. The
 * caller must *not* hold the lock at call.
 *
 * xlog_dir: directory in which the segment file is installed.
 *
 * Returns TRUE if the file was installed successfully. FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
 */
static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance,
                                   bool use_lock, const char *xlog_dir)
{
    char path[MAXPGPATH];
    struct stat stat_buf;
    errno_t errorno = EOK;
    bool installed = true;

    errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
                         (uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId));
    securec_check_ss(errorno, "", "");

    /*
     * We want to be sure that only one process does this at a time.
     */
    if (use_lock) {
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    }

    if (find_free) {
        /* Find a free slot to put it in. */
        while (stat(path, &stat_buf) == 0) {
            if (*max_advance <= 0) {
                /* Failed to find a free slot within the specified range. */
                installed = false;
                break;
            }
            (*segno)++;
            (*max_advance)--;
            errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir,
                                 t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId),
                                 (uint32)((*segno) % XLogSegmentsPerXLogId));
            securec_check_ss(errorno, "", "");
        }
    } else {
        /* Force installation: get rid of any pre-existing segment file. */
        unlink(path);
    }

    /* Rename into place; a failure is logged at LOG level and reported as false. */
    if (installed && durable_rename(tmppath, path, LOG) != 0) {
        installed = false;
    }

    /* Remember the highest segment number installed so far. */
    if (installed && *segno > g_instance.wal_cxt.globalEndPosSegNo) {
        g_instance.wal_cxt.globalEndPosSegNo = *segno;
    }

    if (use_lock) {
        LWLockRelease(ControlFileLock);
    }

    return installed;
}
/*
 * Open a pre-existing logfile segment for writing.
 */
static int XLogFileOpenInternal(XLogSegNo segno, const char *xlog_dir)
{
    char path[MAXPGPATH];
    errno_t errorno = EOK;

    /* File name: <xlog_dir>/<timeline><segno high part><segno low part>, all in hex. */
    errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
                         (uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
    securec_check_ss(errorno, "", "");

    const int segFd = BasicOpenFile(path,
                                    O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
                                    S_IRUSR | S_IWUSR);
    if (segFd >= 0) {
        return segFd;
    }

    /* Failing to open the segment is reported at PANIC level. */
    ereport(PANIC, (errcode_for_file_access(), errmsg("could not open xlog file \"%s\" (log segment %s): %s", path,
                                                      XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno),
                                                      TRANSLATE_ERRNO)));
    return segFd;
}
/*
 * Open the pre-existing WAL segment segno in SS_XLOGDIR for writing.
 * Thin wrapper around XLogFileOpenInternal(); returns the opened FD.
 */
int XLogFileOpen(XLogSegNo segno)
{
    const char *walDir = SS_XLOGDIR;

    return XLogFileOpenInternal(segno, walDir);
}
/*
 * Open a logfile segment for reading (during recovery).
 *
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from the archive.
 * Otherwise, it is assumed to be already available in pg_xlog.
 */
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notfoundOk)
{
    char xlogfname[MAXFNAMELEN];
    char activitymsg[MAXFNAMELEN + 16];
    char path[MAXPGPATH];
    errno_t errorno = EOK;

    /* NOTE: emode is accepted for interface compatibility but not used here. */
    errorno = snprintf_s(xlogfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
                         (uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
    securec_check_ss(errorno, "", "");

    if (source == XLOG_FROM_ARCHIVE) {
        /* Report recovery progress in PS display */
        errorno = snprintf_s(activitymsg, sizeof(activitymsg), sizeof(activitymsg) - 1, "waiting for %s",
                             xlogfname);
        securec_check_ss(errorno, "", "");
        set_ps_display(activitymsg, false);

        t_thrd.xlog_cxt.restoredFromArchive = RestoreArchivedFile(path, xlogfname, "RECOVERYXLOG", XLogSegSize);
        if (!t_thrd.xlog_cxt.restoredFromArchive) {
            return -1;
        }

        /*
         * The segment was fetched from archival storage: replace the
         * existing xlog segment (if any) with the archival version, and
         * read it from its permanent location.
         */
        KeepFileRestoredFromArchive((const char *)path, (const char *)xlogfname);
        errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlogfname);
        securec_check_ss(errorno, "", "");
    } else if (source == XLOG_FROM_PG_XLOG || source == XLOG_FROM_STREAM) {
        errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, tli,
                             (uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
        securec_check_ss(errorno, "", "");
        t_thrd.xlog_cxt.restoredFromArchive = false;
    } else {
        ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("invalid XLogFileRead source %d", source)));
    }

    const int segFd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
    if (segFd < 0) {
        /* Only a possibly-deleted file is tolerated, and only if the caller allows it. */
        if (!FILE_POSSIBLY_DELETED(errno) || !notfoundOk) {
            ereport(PANIC, (errcode_for_file_access(), errmsg("XLogFileRead: could not open file \"%s\" "
                "(log segment %s): %s", path, XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno),
                TRANSLATE_ERRNO)));
        }
        return -1;
    }

    /* Success! */
    t_thrd.xlog_cxt.curFileTLI = tli;

    /* Report recovery progress in PS display */
    errorno = snprintf_s(activitymsg, sizeof(activitymsg), sizeof(activitymsg) - 1, "recovering %s", xlogfname);
    securec_check_ss(errorno, "", "");
    set_ps_display(activitymsg, false);

    /* Track where this segment came from. */
    t_thrd.xlog_cxt.readSource = source;
    t_thrd.xlog_cxt.XLogReceiptSource = source;
    /* The receipt time is left untouched for streamed segments. */
    if (source != XLOG_FROM_STREAM) {
        t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
    }

    return segFd;
}
/*
 * Open a logfile segment for reading (during recovery).
 * This version searches for the segment with any TLI listed in expectedTLIs.
 */
int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources)
{
    char path[MAXPGPATH];
    ListCell *tliCell = NULL;
    int segFd = -1;
    errno_t errorno = EOK;

    /*
     * Loop looking for a suitable timeline ID: we might need to read any of
     * the timelines listed in expectedTLIs.
     *
     * We expect curFileTLI on entry to be the TLI of the preceding file in
     * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
     * to go backwards; this prevents us from picking up the wrong file when a
     * parent timeline extends to higher segment numbers than the child we
     * want to read.
     */
    foreach (tliCell, t_thrd.xlog_cxt.expectedTLIs) {
        const TimeLineID candidateTli = (TimeLineID)lfirst_int(tliCell);

        if (candidateTli < t_thrd.xlog_cxt.curFileTLI) {
            break;
        }

        /* The archive is tried first, then the local WAL directory. */
        if ((sources & XLOG_FROM_ARCHIVE) != 0) {
            segFd = XLogFileRead(segno, emode, candidateTli, XLOG_FROM_ARCHIVE, true);
            if (segFd != -1) {
                ereport(DEBUG1, (errmsg("got WAL segment from archive")));
                return segFd;
            }
        }

        if ((sources & XLOG_FROM_PG_XLOG) != 0) {
            segFd = XLogFileRead(segno, emode, candidateTli, XLOG_FROM_PG_XLOG, true);
            if (segFd != -1) {
                return segFd;
            }
        }
    }

    /* Couldn't find it. For simplicity, complain about the recovery target timeline's file. */
    errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR,
                         t_thrd.xlog_cxt.recoveryTargetTLI, (uint32)((segno) / XLogSegmentsPerXLogId),
                         (uint32)((segno) % XLogSegmentsPerXLogId));
    securec_check_ss(errorno, "", "");

    /* The report is suppressed when SS_ONDEMAND_REALTIME_BUILD_NORMAL holds. */
    if (!SS_ONDEMAND_REALTIME_BUILD_NORMAL) {
        ereport(emode, (errcode_for_file_access(), errmsg("XLogFileReadAnyTLI: could not open file \"%s\" "
            "(log segment: %s, fd: %d, sources: %u): %s", path,
            XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno), segFd, sources, TRANSLATE_ERRNO)));
    }

    return -1;
}
/*
 * Close the current logfile segment for writing.
 */
static void XLogFileClose(void)
{
Assert(t_thrd.xlog_cxt.openLogFile >= 0);
* WAL segment files will not be re-read in normal operation, so we advise
* the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
if (!XLogIsNeeded()) {
(void)posix_fadvise(t_thrd.xlog_cxt.openLogFile, 0, 0, POSIX_FADV_DONTNEED);
}
#endif
if (close(t_thrd.xlog_cxt.openLogFile)) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not close log file %s: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.openLogSegNo),
TRANSLATE_ERRNO)));
}
t_thrd.xlog_cxt.openLogFile = -1;
}
/*
 * A file was restored from the archive under a temporary filename (path),
 * and now we want to keep it. Rename it under the permanent filename in
 * pg_xlog (xlogfname), replacing any existing file with the same name.
 */
static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
{
char xlogfpath[MAXPGPATH];
bool reload = false;
struct stat statbuf;
errno_t errorno = EOK;
errorno = snprintf_s(xlogfpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlogfname);
securec_check_ss(errorno, "", "");
if (stat(xlogfpath, &statbuf) == 0) {
char oldpath[MAXPGPATH] = {0};
errorno = memset_s(oldpath, MAXPGPATH, 0, MAXPGPATH);
securec_check(errorno, "", "");
#ifdef WIN32
* On Windows, if another process (e.g a walsender process) holds
* the file open in FILE_SHARE_DELETE mode, unlink will succeed,
* but the file will still show up in directory listing until the
* last handle is closed, and we cannot rename the new file in its
* place until that. To avoid that problem, rename the old file to
* a temporary name first. Use a counter to create a unique
* filename, because the same file might be restored from the
* archive multiple times, and a walsender could still be holding
* onto an old deleted version of it.
*/
errorno = snprintf_s(oldpath, MAXPGPATH, MAXPGPATH - 1, "%s.deleted%u", xlogfpath,
t_thrd.xlog_cxt.deletedcounter++);
securec_check_ss(errorno, "", "");
if (rename(xlogfpath, oldpath) != 0) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %s", xlogfpath, oldpath, TRANSLATE_ERRNO)));
}
#else
errorno = strncpy_s(oldpath, MAXPGPATH, xlogfpath, MAXPGPATH - 1);
securec_check(errorno, "", "");
#endif
if (unlink(oldpath) != 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %s", xlogfpath, TRANSLATE_ERRNO)));
}
reload = true;
}
durable_rename(path, xlogfpath, ERROR);
XLogArchiveForceDone(xlogfname);
* If the existing file was replaced, since walsenders might have it
* open, request them to reload a currently-open segment. This is only
* required for WAL segments, walsenders don't hold other files open,
* but there's no harm in doing this too often, and we don't know what
* kind of a file we're dealing with here.
*/
if (reload) {
WalSndRqstFileReload();
}
if (AllowCascadeReplication()) {
WalSndWakeup();
}
}
/*
 * Attempt to retrieve the specified file from off-line archival storage.
 * If successful, fill "path" with its complete path (note that this will be
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return FALSE.
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery. If the file size is not
 * known, set expectedSize = 0.
 */
static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize)
{
char xlogpath[MAXPGPATH];
char xlogRestoreCmd[MAXPGPATH];
char lastRestartPointFname[MAXPGPATH];
char *dp = NULL;
char *endp = NULL;
const char *sp = NULL;
int rc = 0;
bool signaled = false;
struct stat stat_buf;
XLogSegNo restartSegNo;
errno_t errorno = EOK;
if (t_thrd.xlog_cxt.recoveryRestoreCommand == NULL) {
goto not_available;
}
* When doing archive recovery, we always prefer an archived log file even
* if a file of the same name exists in XLOGDIR. The reason is that the
* file in XLOGDIR could be an old, un-filled or partly-filled version
* that was copied and restored as part of backing up $PGDATA.
*
* We could try to optimize this slightly by checking the local copy
* lastchange timestamp against the archived copy, but we have no API to
* do this, nor can we guarantee that the lastchange timestamp was
* preserved correctly when we copied to archive. Our aim is robustness,
* so we elect not to do this.
*
* If we cannot obtain the log file from the archive, however, we will try
* to use the XLOGDIR file if it exists. This is so that we can make use
* of log segments that weren't yet transferred to the archive.
*
* Notice that we don't actually overwrite any files when we copy back
* from archive because the recoveryRestoreCommand may inadvertently
* restore inappropriate xlogs, or they may be corrupt, so we may wish to
* fallback to the segments remaining in current XLOGDIR later. The
* copy-from-archive filename is always the same, ensuring that we don't
* run out of disk space on long recoveries.
*/
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, recovername);
securec_check_ss(errorno, "", "");
* Make sure there is no existing file named recovername.
*/
if (stat(xlogpath, &stat_buf) != 0) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not stat file \"%s\": %s", xlogpath, TRANSLATE_ERRNO)));
}
} else {
if (unlink(xlogpath) != 0) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not remove file \"%s\": %s", xlogpath, TRANSLATE_ERRNO)));
}
}
* Calculate the archive file cutoff point for use during log shipping
* replication. All files earlier than this point can be deleted from the
* archive, though there is no requirement to do so.
*
* We initialise this with the filename of an InvalidXLogRecPtr, which
* will prevent the deletion of any WAL files from the archive because of
* the alphabetic sorting property of WAL filenames.
*
* Once we have successfully located the redo pointer of the checkpoint
* from which we start recovery we never request a file prior to the redo
* pointer of the last restartpoint. When redo begins we know that we have
* successfully located it, so there is no need for additional status
* flags to signify the point when we can begin deleting WAL files from
* the archive.
*/
if (t_thrd.xlog_cxt.InRedo) {
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, restartSegNo);
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X",
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID,
(uint32)((restartSegNo) / XLogSegmentsPerXLogId),
(uint32)((restartSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
} else {
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", 0,
(uint32)((0L) / XLogSegmentsPerXLogId), (uint32)((0L) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
}
* construct the command to be executed
*/
dp = xlogRestoreCmd;
endp = xlogRestoreCmd + MAXPGPATH - 1;
*endp = '\0';
for (sp = t_thrd.xlog_cxt.recoveryRestoreCommand; *sp; sp++) {
if (*sp == '%') {
switch (sp[1]) {
case 'p':
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), xlogpath, strlen(xlogpath));
securec_check(errorno, "", "");
make_native_path(dp);
dp += strlen(dp);
break;
case 'f':
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), xlogfname, strlen(xlogfname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case 'r':
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), lastRestartPointFname,
strlen(lastRestartPointFname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case '%':
sp++;
if (dp < endp) {
*dp++ = *sp;
}
break;
default:
if (dp < endp) {
*dp++ = *sp;
}
break;
}
} else {
if (dp < endp) {
*dp++ = *sp;
}
}
}
*dp = '\0';
ereport(DEBUG3, (errmsg_internal("executing restore command \"%s\"", xlogRestoreCmd)));
PreRestoreCommand();
rc = gs_popen_security(xlogRestoreCmd);
PostRestoreCommand();
if (rc == 0) {
if (stat(xlogpath, &stat_buf) == 0) {
if (expectedSize > 0 && stat_buf.st_size != expectedSize) {
* If we find a partial file in standby mode, we assume it's
* because it's just being copied to the archive, and keep
* trying.
*
* Otherwise treat a wrong-sized file as FATAL to ensure the
* DBA would notice it, but is that too strong? We could try
* to plow ahead with a local copy of the file ... but the
* problem is that there probably isn't one, and we'd
* incorrectly conclude we've reached the end of WAL and we're
* done recovering ...
*/
int elevel = (t_thrd.xlog_cxt.StandbyMode && stat_buf.st_size < expectedSize) ? DEBUG1 : FATAL;
ereport(elevel, (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu", xlogfname,
(unsigned long)stat_buf.st_size, (unsigned long)expectedSize)));
return false;
} else {
ereport(LOG, (errmsg("restored log file \"%s\" from archive", xlogfname)));
errorno = strcpy_s(path, MAXPGPATH, xlogpath);
securec_check(errorno, "", "");
return true;
}
} else if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", xlogpath, TRANSLATE_ERRNO)));
}
}
* Remember, we rollforward UNTIL the restore fails so failure here is
* just part of the process... that makes it difficult to determine
* whether the restore failed because there isn't an archive to restore,
* or because the administrator has specified the restore program
* incorrectly. We have to assume the former.
*
* However, if the failure was due to any sort of signal, it's best to
* punt and abort recovery. (If we "return false" here, upper levels will
* assume that recovery is complete and start up the database!) It's
* essential to abort on child SIGINT and SIGQUIT, because per spec
* system() ignores SIGINT and SIGQUIT while waiting; if we see one of
* those it's a good bet we should have gotten it too.
*
* On SIGTERM, assume we have received a fast shutdown request, and exit
* cleanly. It's pure chance whether we receive the SIGTERM first, or the
* child process. If we receive it first, the signal handler will call
* proc_exit, otherwise we do it here. If we or the child process received
* SIGTERM for any other reason than a fast shutdown request, postmaster
* will perform an immediate shutdown when it sees us exiting
* unexpectedly.
*
* Per the Single Unix Spec, shells report exit status > 128 when a called
* command died on a signal. Also, 126 and 127 are used to report
* problems such as an unfindable command; treat those as fatal errors
* too.
*/
if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) {
proc_exit(1);
}
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport(signaled ? FATAL : DEBUG2,
(errmsg("could not restore file \"%s\" from archive: return code %d", xlogfname, rc)));
not_available:
* if an archived file is not available, there might still be a version of
* this file in XLOGDIR, so return that as the filename to open.
*
* In many recovery scenarios we expect this to fail also, but if so that
* just means we've reached the end of WAL.
*/
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlogfname);
securec_check_ss(errorno, "", "");
return false;
}
* Attempt to execute an external shell command during recovery.
*
* 'command' is the shell command to be executed, 'commandName' is a
* human-readable name describing the command emitted in the logs. If
* 'failOnSignal' is true and the command is killed by a signal, a FATAL
* error is thrown. Otherwise a WARNING is emitted.
*
* This is currently used for recovery_end_command and archive_cleanup_command.
*/
static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
{
char xlogRecoveryCmd[MAXPGPATH];
char lastRestartPointFname[MAXPGPATH];
char *dp = NULL;
char *endp = NULL;
const char *sp = NULL;
int rc = 0;
bool signaled = false;
XLogSegNo restartSegNo;
errno_t errorno = EOK;
Assert(command && commandName);
* Calculate the archive file cutoff point for use during log shipping
* replication. All files earlier than this point can be deleted from the
* archive, though there is no requirement to do so.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, restartSegNo);
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X",
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID,
(uint32)((restartSegNo) / XLogSegmentsPerXLogId),
(uint32)((restartSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
LWLockRelease(ControlFileLock);
dp = xlogRecoveryCmd;
endp = xlogRecoveryCmd + MAXPGPATH - 1;
*endp = '\0';
for (sp = command; *sp; sp++) {
if (*sp == '%') {
switch (sp[1]) {
case 'r':
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRecoveryCmd), lastRestartPointFname,
strlen(lastRestartPointFname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case '%':
sp++;
if (dp < endp) {
*dp++ = *sp;
}
break;
default:
if (dp < endp) {
*dp++ = *sp;
}
break;
}
} else {
if (dp < endp) {
*dp++ = *sp;
}
}
}
*dp = '\0';
ereport(DEBUG3, (errmsg_internal("executing %s \"%s\"", commandName, command)));
rc = gs_popen_security(xlogRecoveryCmd);
if (rc != 0) {
* If the failure was due to any sort of signal, it's best to punt and
* abort recovery. See also detailed comments on signals in RestoreArchivedFile().
*/
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport((signaled && failOnSignal) ? FATAL : WARNING,
(errmsg("%s \"%s\": return code %d", commandName, command, rc)));
}
}
* Preallocate log files beyond the specified log endpoint.
*
* XXX this is currently extremely conservative, since it forces only one
* future log segment to exist, and even that only if we are 75% done with
* the current one. This is only appropriate for very low-WAL-volume systems.
* High-volume systems will be OK once they've built up a sufficient set of
* recycled log segments, but the startup transient is likely to include
* a lot of segment creations by foreground processes, which is not so good.
*/
static void PreallocXlogFiles(XLogRecPtr endptr)
{
* These condition do not need preallocate xlog files:
* 1. In ss repplication dorado cluster, standby cluster sync primary xlog
* 2. In ondemand recovery, we do not preallocate xlog files for better rto
*/
if (SS_DORADO_STANDBY_CLUSTER || SS_IN_ONDEMAND_RECOVERY || SS_STANDBY_MODE) {
return;
}
XLogSegNo _logSegNo;
int lf;
bool use_existent = false;
gstrace_entry(GS_TRC_ID_PreallocXlogFiles);
XLByteToPrevSeg(endptr, _logSegNo);
if ((endptr - 1) % XLogSegSize >= (uint32)(0.75 * XLogSegSize)) {
_logSegNo++;
use_existent = true;
lf = XLogFileInit(_logSegNo, &use_existent, true);
close(lf);
if (!use_existent) {
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added++;
}
}
gstrace_exit(GS_TRC_ID_PreallocXlogFiles);
}
* Throws an error if the given log segment has already been removed or
* recycled. The caller should only pass a segment that it knows to have
* existed while the server has been running, as this function always
* succeeds if no WAL segments have been removed since startup.
* 'tli' is only used in the error message.
*/
void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo lastRemovedSegNo;
errno_t errorno = EOK;
SpinLockAcquire(&xlogctl->info_lck);
lastRemovedSegNo = xlogctl->lastRemovedSegNo;
SpinLockRelease(&xlogctl->info_lck);
if (segno <= lastRemovedSegNo) {
char filename[MAXFNAMELEN];
errorno = snprintf_s(filename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
WalSegmemtRemovedhappened = true;
ereport(ERROR,
(errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename)));
}
}
* Return the last WAL segment removed, or 0 if no segment has been removed
* since startup.
*
* NB: the result can be out of date arbitrarily fast, the caller has to deal
* with that.
*/
XLogSegNo XLogGetLastRemovedSegno(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo lastRemovedSegNo;
SpinLockAcquire(&xlogctl->info_lck);
lastRemovedSegNo = xlogctl->lastRemovedSegNo;
SpinLockRelease(&xlogctl->info_lck);
return lastRemovedSegNo;
}
static void remove_xlogtemp_files(void)
{
if (SS_DORADO_STANDBY_CLUSTER || SS_STANDBY_MODE) {
ereport(LOG, (errmsg("[SS] Standby node skip remove xlogtemp files")));
return;
}
DIR *dir = NULL;
char fullpath[MAXPGPATH] = {0};
struct dirent *de = NULL;
struct stat st;
errno_t errorno = EOK;
if ((dir = opendir(SS_XLOGDIR)) != NULL) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, ".", 1) == 0 || strncmp(de->d_name, "..", 2) == 0) {
continue;
}
if (!strstr(de->d_name, "xlogtemp")) {
continue;
}
errorno = snprintf_s(fullpath, sizeof(fullpath), sizeof(fullpath) - 1, "%s/%s", SS_XLOGDIR, de->d_name);
securec_check_ss(errorno, "\0", "\0");
if (lstat(fullpath, &st) != 0) {
if (!FILE_POSSIBLY_DELETED(errno)) {
ereport(WARNING, (errmsg("could not stat file or directory : %s", fullpath)));
}
continue;
}
if (S_ISDIR(st.st_mode)) {
continue;
}
if (S_ISREG(st.st_mode)) {
(void)unlink(fullpath);
}
}
(void)closedir(dir);
}
}
* Update the last removed segno pointer in shared memory, to reflect
* that the given XLOG file has been removed.
*/
static void UpdateLastRemovedPtr(const char *filename)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
uint32 tli;
uint32 log_temp = 0;
uint32 seg_temp = 0;
XLogSegNo segno;
if (sscanf_s(filename, "%08X%08X%08X", &tli, &log_temp, &seg_temp) != 3) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not parse xlog segment file \"%s\"", filename)));
}
segno = (uint64)log_temp * XLogSegmentsPerXLogId + seg_temp;
SpinLockAcquire(&xlogctl->info_lck);
if (segno > xlogctl->lastRemovedSegNo) {
xlogctl->lastRemovedSegNo = segno;
}
SpinLockRelease(&xlogctl->info_lck);
}
* Recycle or remove all log files older or equal to passed segno#
*
* endptr is current (or recent) end of xlog; this is used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
*/
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
{
if (SS_DORADO_STANDBY_CLUSTER || SS_STANDBY_MODE) {
ereport(LOG, (errmsg("[SS] Standby node skip remove old xlog files")));
return;
}
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char lastoff[MAXFNAMELEN];
errno_t errorno = EOK;
xldir = AllocateDir(SS_XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %s",
SS_XLOGDIR, TRANSLATE_ERRNO)));
}
errorno = snprintf_s(lastoff, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
g_instance.comm_cxt.predo_cxt.redoPf.oldest_segment = segno + 1;
ereport(LOG, (errmsg("attempting to remove WAL segments older than log file %s", lastoff)));
gstrace_entry(GS_TRC_ID_RemoveOldXlogFiles);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
* We ignore the timeline part of the XLOG segment identifiers in
* deciding whether a segment is still needed. This ensures that we
* won't prematurely remove a segment from a parent timeline. We could
* probably be a little more proactive about removing segments of
* non-parent timelines, but that would be a whole lot more
* complicated.
*
* We use the alphanumeric sorting property of the filenames to decide
* which ones are earlier than the lastoff segment.
*/
if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0) {
if (XLogArchiveCheckDone(xlde->d_name)) {
UpdateLastRemovedPtr((const char *)xlde->d_name);
RemoveXlogFile(xlde->d_name, endptr);
}
}
}
FreeDir(xldir);
gstrace_data(1, GS_TRC_ID_RemoveOldXlogFiles, TRC_DATA_FMT_DFLT, lastoff, MAXFNAMELEN);
gstrace_exit(GS_TRC_ID_RemoveOldXlogFiles);
}
* Recycle or remove a log file that's no longer needed.
*
* endptr is current (or recent) end of xlog; this is used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
*/
static void RemoveXlogFile(const char *segname, XLogRecPtr endptr)
{
char path[MAXPGPATH];
#ifdef WIN32
char newpath[MAXPGPATH];
#endif
struct stat statbuf;
XLogSegNo endLogSegNo;
int max_advance;
* Initialize info about where to try to recycle to. We allow recycling
* segments up to XLOGfileslop segments beyond the current XLOG location.
*/
XLByteToPrevSeg(endptr, endLogSegNo);
max_advance = XLOGfileslop;
errno_t errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, segname);
securec_check_ss(errorno, "", "");
* Before deleting the file, see if it can be recycled as a future log
* segment. Only recycle normal files, pg_standby for example can create
* symbolic links pointing to a separate archive directory.
*/
if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true, SS_XLOGDIR)) {
ereport(DEBUG2, (errmsg("recycled transaction log file \"%s\"", segname)));
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled++;
if (max_advance > 0) {
endLogSegNo++;
max_advance--;
}
} else {
int rc;
ereport(DEBUG2, (errmsg("removing transaction log file \"%s\"", segname)));
#ifdef WIN32
* On Windows, if another process (e.g another backend) holds the file
* open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
* will still show up in directory listing until the last handle is
* closed. To avoid confusing the lingering deleted file for a live
* WAL file that needs to be archived, rename it before deleting it.
*
* If another process holds the file open without FILE_SHARE_DELETE
* flag, rename will fail. We'll try again at the next checkpoint.
*/
errorno = snprintf_s(newpath, MAXPGPATH, MAXPGPATH - 1, "%s.deleted", path);
securec_check_ss(errorno, "", "");
if (rename(path, newpath) != 0) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not rename old transaction log file \"%s\": %s",
path, TRANSLATE_ERRNO)));
return;
}
rc = unlink(newpath);
#else
rc = unlink(path);
#endif
if (rc != 0) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not remove old transaction log file \"%s\": %s",
path, TRANSLATE_ERRNO)));
return;
}
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed++;
}
XLogArchiveCleanup(segname);
}
* Verify whether pg_xlog and pg_xlog/archive_status exist.
* If the latter does not exist, recreate it.
*
* It is not the goal of this function to verify the contents of these
* directories, but to help in cases where someone has performed a cluster
* copy for PITR purposes but omitted pg_xlog from the copy.
*
* We could also recreate pg_xlog if it doesn't exist, but a deliberate
* policy decision was made not to. It is fairly common for pg_xlog to be
* a symlink, and if that was the DBA's intent then automatically making a
* plain directory would result in degraded performance with no notice.
*/
static void ValidateXLOGDirectoryStructure(void)
{
struct stat stat_buf;
if (stat(SS_XLOGDIR, &stat_buf) != 0 || !S_ISDIR(stat_buf.st_mode))
ereport(FATAL, (errmsg("required WAL directory \"%s\" does not exist", SS_XLOGDIR)));
if (stat(ARCHIVEDIR, &stat_buf) == 0) {
if (!S_ISDIR(stat_buf.st_mode)) {
ereport(FATAL, (errmsg("required WAL directory \"%s\" does not exist", ARCHIVEDIR)));
}
} else {
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", ARCHIVEDIR)));
if (mkdir(ARCHIVEDIR, S_IRWXU) < 0) {
ereport(FATAL, (errmsg("could not create missing directory \"%s\": %s", ARCHIVEDIR, TRANSLATE_ERRNO)));
}
}
}
* Remove previous backup history files. This also retries creation of
* .ready files for any backup history files for which XLogArchiveNotify
* failed earlier.
*/
static void CleanupBackupHistory(void)
{
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char path[MAXPGPATH];
errno_t errorno = EOK;
xldir = AllocateDir(SS_XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %s",
SS_XLOGDIR, TRANSLATE_ERRNO)));
}
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
if (strlen(xlde->d_name) > 24 && strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"), ".backup") == 0) {
if (XLogArchiveCheckDone(xlde->d_name)) {
ereport(DEBUG2, (errmsg("removing transaction log backup history file \"%s\"", xlde->d_name)));
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlde->d_name);
securec_check_ss(errorno, "", "");
unlink(path);
XLogArchiveCleanup(xlde->d_name);
}
}
}
FreeDir(xldir);
}
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position. Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
 * record is available.
 *
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
 *
 * xlogreader    - reader state; its private_data is treated as an
 *                 XLogPageReadPrivate and filled in from the arguments below.
 * RecPtr        - position to read from, or InvalidXLogRecPtr to continue
 *                 after the previous record.
 * emode         - error level passed through emode_for_corrupt_record().
 * fetching_ckpt - true when the caller is reading a checkpoint record; this
 *                 disables the switch from crash recovery to archive recovery.
 *
 * A failed read is retried up to XLOG_STREAM_READREC_MAXTRY times, sleeping
 * XLOG_STREAM_READREC_INTERVAL between attempts, before the current read
 * source is added to failedSources.
 */
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt)
{
    XLogRecord *record = NULL;
    uint32 streamFailCount = 0; /* failed attempts counted so far in this call */
    XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;

    /* publish the caller's options through the reader's private data */
    readprivate->fetching_ckpt = fetching_ckpt;
    readprivate->emode = emode;
    readprivate->randAccess = !XLByteEQ(RecPtr, InvalidXLogRecPtr);

    t_thrd.xlog_cxt.failedSources = 0;

    for (;;) {
        char *errormsg = NULL;

        record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
        /* mirror the reader's positions into thread and instance state, whether or not the read succeeded */
        t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr;
        t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr;
        g_instance.comm_cxt.predo_cxt.redoPf.read_ptr = t_thrd.xlog_cxt.ReadRecPtr;
        if (record == NULL) {
            if (t_thrd.xlog_cxt.readFile >= 0) {
                close(t_thrd.xlog_cxt.readFile);
                t_thrd.xlog_cxt.readFile = -1;
            }

            /*
             * We only end up here without a message when XLogPageRead() failed
             * - in that case we already logged something.
             * In StandbyMode that only happens if we have been triggered, so
             * we shouldn't loop anymore in that case.
             */
            if (errormsg != NULL) {
                ereport(emode_for_corrupt_record(emode, XLByteEQ(RecPtr, InvalidXLogRecPtr) ? t_thrd.xlog_cxt.EndRecPtr
                                                                                            : RecPtr),
                        (errmsg_internal("%s", errormsg) ));
            }
        } else if ((!timeLineInHistory(xlogreader->latestPageTLI, t_thrd.xlog_cxt.expectedTLIs)) &&
                   (!(g_instance.attr.attr_storage.IsRoachStandbyCluster && dummyStandbyMode))) {
            /*
             * A record was read, but the timeline of the latest page is not in
             * the expected timeline list: report it and treat the record as
             * invalid.
             */
            char fname[MAXFNAMELEN];
            XLogSegNo targetSegNo;
            uint32 offset;
            errno_t errorno = EOK;

            XLByteToSeg(xlogreader->latestPagePtr, targetSegNo);
            offset = xlogreader->latestPagePtr % XLogSegSize;

            errorno = snprintf_s(fname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", xlogreader->readPageTLI,
                                 (uint32)((targetSegNo) / XLogSegmentsPerXLogId),
                                 (uint32)((targetSegNo) % XLogSegmentsPerXLogId));
            securec_check_ss(errorno, "", "");

            ereport(emode_for_corrupt_record(emode,
                                             XLByteEQ(RecPtr, InvalidXLogRecPtr) ? t_thrd.xlog_cxt.EndRecPtr : RecPtr),
                    (errmsg("unexpected timeline ID %u in log segment %s, offset %u", xlogreader->latestPageTLI, fname,
                            offset)));
            record = NULL;
        }

        if (record != NULL) {
            /* Great, got a record: remember its position, CRC and length */
            latestValidRecord = t_thrd.xlog_cxt.ReadRecPtr;
            latestRecordCrc = record->xl_crc;
            latestRecordLen = record->xl_tot_len;
            if (SS_DORADO_CLUSTER) {
                t_thrd.xlog_cxt.ssXlogReadFailedTimes = 0;
            }
            return record;
        } else {
            /* No valid record available from this source */
            if (SS_DORADO_CLUSTER) {
                t_thrd.xlog_cxt.ssXlogReadFailedTimes++;
                xlogreader->preReadStartPtr = InvalidXlogPreReadStartPtr;
            }

            if (streamFailCount < XLOG_STREAM_READREC_MAXTRY) {
                /* still within the retry budget: count the failure and sleep */
                streamFailCount++;
                if (u_sess->attr.attr_storage.HaModuleDebug) {
                    ereport(LOG,
                            (errmsg("reread xlog record for %uth times at %X/%X", streamFailCount,
                                    (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr)));
                }
                pg_usleep(XLOG_STREAM_READREC_INTERVAL);
                ProcTxnWorkLoad(true);
            } else {
                /* retry budget exhausted: mark the current read source as failed */
                t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
                if (u_sess->attr.attr_storage.HaModuleDebug) {
                    ereport(LOG,
                            (errmsg("xlog record failed failedsource %u at %X/%X", t_thrd.xlog_cxt.failedSources,
                                    (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr)));
                }
            }

            if (t_thrd.xlog_cxt.readFile >= 0) {
                close(t_thrd.xlog_cxt.readFile);
                t_thrd.xlog_cxt.readFile = -1;
            }

            /* invalidate the pre-read start position when a pre-read buffer exists */
            if (xlogreader->preReadBuf != NULL) {
                xlogreader->preReadStartPtr = InvalidXlogPreReadStartPtr;
            }

            /*
             * If archive recovery was requested, but we were still doing
             * crash recovery, switch to archive recovery and retry using the
             * offline archive. We have now replayed all the valid WAL in
             * pg_xlog, so we are presumably now consistent.
             *
             * We require that there's at least some valid WAL present in
             * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
             * from the archive, even if pg_xlog is completely empty, but we'd
             * have no idea how far we'd have to replay to reach consistency.
             * So err on the safe side and give up.
             */
            if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested && !fetching_ckpt) {
                ProcTxnWorkLoad(true);
                volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
                XLogRecPtr newMinRecoveryPoint;
                ereport(DEBUG1, (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
                t_thrd.xlog_cxt.InArchiveRecovery = true;
                if (t_thrd.xlog_cxt.StandbyModeRequested) {
                    t_thrd.xlog_cxt.StandbyMode = true;
                }

                /* initialize minRecoveryPoint to the last replayed end position if it is behind it */
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;

                SpinLockAcquire(&xlogctl->info_lck);
                newMinRecoveryPoint = xlogctl->lastReplayedEndRecPtr;
                SpinLockRelease(&xlogctl->info_lck);

                if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, newMinRecoveryPoint)) {
                    t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = newMinRecoveryPoint;
                }

                /* update local copy */
                t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
                SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);

                UpdateControlFile();
                LWLockRelease(ControlFileLock);

                CheckRecoveryConsistency();

                ereport(LOG, (errmsg("update minrecovery point to %X/%X in archive recovery",
                                     (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
                                     (uint32)(t_thrd.xlog_cxt.minRecoveryPoint))));

                /*
                 * Before we retry, reset failedSources
                 * so that we will check the archive next.
                 */
                t_thrd.xlog_cxt.failedSources = 0;

                continue;
            }

            ProcTxnWorkLoad(false);

            /* In standby mode (not dummy, not triggered) or ondemand realtime build, loop back to retry */
            if ((t_thrd.xlog_cxt.StandbyMode && !dummyStandbyMode && !t_thrd.xlog_cxt.recoveryTriggered) ||
                SS_ONDEMAND_REALTIME_BUILD_NORMAL) {
                continue;
            } else {
                if (u_sess->attr.attr_storage.HaModuleDebug) {
                    ereport(LOG,
                        (errmsg("startup replay xlog ready to exit, standbymode is %s, dummyStandbyMode is %s, recoveryTriggered is %s",
                            t_thrd.xlog_cxt.StandbyMode ? "true" : "false",
                            dummyStandbyMode ? "true" : "false",
                            t_thrd.xlog_cxt.recoveryTriggered ? "true" : "false")));
                }
                return NULL;
            }
        }
    }
}
void UpdateMinrecoveryInAchive()
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr newMinRecoveryPoint;
* we only push g_GlobalLsnForwarder once in ondemand extreme rto,
* and we already do that in OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach,
* so skit it.
*/
if (IsDefaultExtremeRtoMode()) {
extreme_rto::PushToWorkerLsn(true);
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
SpinLockAcquire(&xlogctl->info_lck);
newMinRecoveryPoint = xlogctl->lastReplayedEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, newMinRecoveryPoint)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = newMinRecoveryPoint;
}
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point = t_thrd.xlog_cxt.minRecoveryPoint;
UpdateControlFile();
LWLockRelease(ControlFileLock);
MultiRedoUpdateMinRecovery(t_thrd.xlog_cxt.minRecoveryPoint);
t_thrd.xlog_cxt.updateMinRecoveryPoint = true;
ereport(LOG,
(errmsg("update minrecovery point to %X/%X in archive recovery",
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32), (uint32)(t_thrd.xlog_cxt.minRecoveryPoint))));
}
* Try to read a timeline's history file.
*
* If successful, return the list of component TLIs (the given TLI followed by
* its ancestor TLIs). If we can't find the history file, assume that the
* timeline has no parents, and return a list of just the specified timeline
* ID.
*/
List *readTimeLineHistory(TimeLineID targetTLI)
{
List *result = NIL;
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
char fline[MAXPGPATH];
FILE *fd = NULL;
bool fromArchive = false;
errno_t errorno = EOK;
if (targetTLI == 1) {
return list_make1_int((int)targetTLI);
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
errorno = snprintf_s(histfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X.history", targetTLI);
securec_check_ss(errorno, "", "");
fromArchive = RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
} else {
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X.history", SS_XLOGDIR, targetTLI);
securec_check_ss(errorno, "", "");
}
fd = AllocateFile(path, "r");
if (fd == NULL) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\": %s", path, TRANSLATE_ERRNO)));
}
return list_make1_int((int)targetTLI);
}
result = NIL;
* Parse the file...
*/
while (fgets(fline, sizeof(fline), fd) != NULL) {
char *ptr = NULL;
char *endptr = NULL;
TimeLineID tli;
for (ptr = fline; *ptr; ptr++) {
if (!isspace((unsigned char)*ptr)) {
break;
}
}
if (*ptr == '\0' || *ptr == '#') {
continue;
}
tli = (TimeLineID)strtoul(ptr, &endptr, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_timeline is not a valid number: \"%s\"", ptr)));
}
if (endptr == ptr) {
(void)FreeFile(fd);
fd = NULL;
ereport(FATAL,
(errmsg("syntax error in history file: %s", fline), errhint("Expected a numeric timeline ID.")));
}
if (result != NIL && tli <= (TimeLineID)linitial_int(result)) {
(void)FreeFile(fd);
fd = NULL;
ereport(FATAL, (errmsg("invalid data in history file: %s", fline),
errhint("Timeline IDs must be in increasing sequence.")));
}
result = lcons_int((int)tli, result);
}
(void)FreeFile(fd);
fd = NULL;
if (result != NULL && targetTLI <= (TimeLineID)linitial_int(result)) {
ereport(FATAL, (errmsg("invalid data in history file \"%s\"", path),
errhint("Timeline IDs must be less than child timeline's ID.")));
}
result = lcons_int((int)targetTLI, result);
ereport(DEBUG3, (errmsg_internal("history of timeline %u is %s", targetTLI, nodeToString(result))));
if (fromArchive) {
KeepFileRestoredFromArchive((const char *)path, (const char *)histfname);
}
return result;
}
* Probe whether a timeline history file exists for the given timeline ID
*/
static bool existsTimeLineHistory(TimeLineID probeTLI)
{
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
FILE *fd = NULL;
errno_t errorno = EOK;
if (probeTLI == 1) {
return false;
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
errorno = snprintf_s(histfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X.history", probeTLI);
securec_check_ss(errorno, "", "");
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
} else {
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X.history", SS_XLOGDIR, probeTLI);
securec_check_ss(errorno, "", "");
}
fd = AllocateFile(path, "r");
if (fd != NULL) {
(void)FreeFile(fd);
fd = NULL;
return true;
} else {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\": %s", path, TRANSLATE_ERRNO)));
}
return false;
}
}
* Scan for new timelines that might have appeared in the archive since we
* started recovery.
*
* If there are any, the function changes recovery target TLI to the latest
* one and returns 'true'.
*/
bool rescanLatestTimeLine(void)
{
TimeLineID newtarget;
newtarget = findNewestTimeLine(t_thrd.xlog_cxt.recoveryTargetTLI);
if (newtarget != t_thrd.xlog_cxt.recoveryTargetTLI) {
* Determine the list of expected TLIs for the new TLI
*/
List *newExpectedTLIs = readTimeLineHistory(newtarget);
* If the current timeline is not part of the history of the new
* timeline, we cannot proceed to it.
*
* XXX This isn't foolproof: The new timeline might have forked from
* the current one, but before the current recovery location. In that
* case we will still switch to the new timeline and proceed replaying
* from it even though the history doesn't match what we already
* replayed. That's not good. We will likely notice at the next online
* checkpoint, as the TLI won't match what we expected, but it's not
* guaranteed. The admin needs to make sure that doesn't happen.
*/
if (!list_member_int(newExpectedTLIs, (int)t_thrd.xlog_cxt.recoveryTargetTLI)) {
ereport(LOG, (errmsg("new timeline %u is not a child of database system timeline %u", newtarget,
t_thrd.xlog_cxt.ThisTimeLineID)));
} else {
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
t_thrd.xlog_cxt.recoveryTargetTLI = newtarget;
list_free(t_thrd.xlog_cxt.expectedTLIs);
t_thrd.xlog_cxt.expectedTLIs = newExpectedTLIs;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RecoveryTargetTLI = t_thrd.xlog_cxt.recoveryTargetTLI;
SpinLockRelease(&xlogctl->info_lck);
ereport(LOG, (errmsg("new target timeline is %u", t_thrd.xlog_cxt.recoveryTargetTLI)));
return true;
}
}
return false;
}
* Find the newest existing timeline, assuming that startTLI exists.
*
* Note: while this is somewhat heuristic, it does positively guarantee
* that (result + 1) is not a known timeline, and therefore it should
* be safe to assign that ID to a new timeline.
*/
static TimeLineID findNewestTimeLine(TimeLineID startTLI)
{
TimeLineID newestTLI;
TimeLineID probeTLI;
* The algorithm is just to probe for the existence of timeline history
* files. XXX is it useful to allow gaps in the sequence?
*/
newestTLI = startTLI;
for (probeTLI = startTLI + 1;; probeTLI++) {
if (existsTimeLineHistory(probeTLI)) {
newestTLI = probeTLI;
} else {
break;
}
}
return newestTLI;
}
/*
 * Write 'wsize' bytes from 'buffer' to the control file 'fd' and fsync it.
 *
 * fd     - open descriptor of the control file.
 * buffer - data to write.
 * id     - page slot to write to; the write position is BLCKSZ * id. When
 *          that position lies beyond the current size of XLOG_CONTROL_FILE
 *          the file is first extended with dss_fallocate_file(). Pass -1 to
 *          write at the descriptor's current position without seeking.
 * wsize  - number of bytes to write.
 *
 * A failed seek, a short or failed write, and a failed fsync all raise PANIC.
 */
void SSWriteInstanceControlFile(int fd, const char* buffer, int id, off_t wsize)
{
    off_t seekpos = 0;
    off_t actual_size = 0;

    if (id != -1) {
        seekpos = (off_t)BLCKSZ * id;
        off_t fileSize = dss_get_file_size(XLOG_CONTROL_FILE);
        ereport(LOG, (errmsg_internal("start fileSize = %ld", fileSize)));
        if ((off_t)seekpos > fileSize) {
            /* grow the file up to the target offset; the result is not checked here */
            (void)dss_fallocate_file(fd, 0, fileSize, seekpos - fileSize);
        }
        actual_size = lseek(fd, seekpos, SEEK_SET);
        if (actual_size == -1) {
            ereport(PANIC, (errcode_for_file_access(),
                            errmsg("write position %ld is larger than max write size in dss", seekpos)));
        }
    }

    pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE);
    /*
     * Clear errno right before write(): the calls above may have set it, and
     * the ENOSPC fallback below relies on errno still being 0 after a short
     * write that reported no error.
     */
    errno = 0;
    if (write(fd, buffer, wsize) != wsize) {
        /* if write didn't set errno, assume problem is no disk space */
        if (errno == 0) {
            errno = ENOSPC;
        }
        ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %s", TRANSLATE_ERRNO)));
    }
    pgstat_report_waitevent(WAIT_EVENT_END);

    pgstat_report_waitevent((uint32)WAIT_EVENT_CONTROL_FILE_SYNC);
    if (pg_fsync(fd) != 0) {
        ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %s", TRANSLATE_ERRNO)));
    }
    pgstat_report_waitevent(WAIT_EVENT_END);
}
* I/O routines for pg_control
*
* *ControlFile is a buffer in shared memory that holds an image of the
* contents of pg_control. WriteControlFile() initializes pg_control
* given a preloaded buffer, ReadControlFile() loads the buffer from
* the pg_control file (during postmaster or standalone-backend startup),
* and UpdateControlFile() rewrites pg_control after we modify xlog state.
*
* For simplicity, WriteControlFile() initializes the fields of pg_control
* that are related to checking backend/database compatibility, and
* ReadControlFile() verifies they are correct. We could split out the
* I/O and compatibility-check functions, but there seems no need currently.
*/
/*
 * Create pg_control from the shared-memory ControlFile image.
 *
 * Fills in the version/compatibility fields, computes the CRC over everything
 * preceding the crc field, and writes a zero-padded PG_CONTROL_SIZE image to a
 * newly created XLOG_CONTROL_FILE (O_EXCL: the file must not exist yet).
 * Any I/O failure is reported at PANIC level.
 */
STATIC void WriteControlFile(void)
{
    ControlFileData *ctl = t_thrd.shemem_ptr_cxt.ControlFile;
    char buffer[PG_CONTROL_SIZE] __attribute__((__aligned__(ALIGNOF_BUFFER)));
    errno_t rc = EOK;
    int fd = -1;

    /*
     * Initialize version and compatibility-check fields
     */
    ctl->pg_control_version = PG_CONTROL_VERSION;
    ctl->catalog_version_no = CATALOG_VERSION_NO;

    ctl->maxAlign = MAXIMUM_ALIGNOF;
    ctl->floatFormat = FLOATFORMAT_VALUE;

    ctl->blcksz = BLCKSZ;
    ctl->relseg_size = RELSEG_SIZE;
    ctl->xlog_blcksz = XLOG_BLCKSZ;
    ctl->xlog_seg_size = XLogSegSize;

    ctl->nameDataLen = NAMEDATALEN;
    ctl->indexMaxKeys = INDEX_MAX_KEYS;

    ctl->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

#ifdef HAVE_INT64_TIMESTAMP
    ctl->enableIntTimes = true;
#else
    ctl->enableIntTimes = false;
#endif
    ctl->float4ByVal = FLOAT4PASSBYVAL;
    ctl->float8ByVal = FLOAT8PASSBYVAL;
    ctl->timeline = 0;

    /* The checksum covers every byte that precedes the crc field itself. */
    INIT_CRC32C(ctl->crc);
    COMP_CRC32C(ctl->crc, (char *)ctl, offsetof(ControlFileData, crc));
    FIN_CRC32C(ctl->crc);

    /*
     * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
     * excess over sizeof(ControlFileData). This reduces the odds of
     * premature-EOF errors when reading pg_control. We'll still fail when we
     * check the contents of the file, but hopefully with a more specific
     * error than "couldn't read pg_control".
     */
    if (sizeof(ControlFileData) > PG_CONTROL_SIZE) {
        ereport(PANIC, (errmsg("sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one")));
    }

    rc = memset_s(buffer, PG_CONTROL_SIZE, 0, PG_CONTROL_SIZE);
    securec_check(rc, "", "");
    rc = memcpy_s(buffer, PG_CONTROL_SIZE, ctl, sizeof(ControlFileData));
    securec_check(rc, "", "");

#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
    /* Debug aid only: the working directory is captured but never read by the code. */
    char current_absolute_path[MAX_SIZE] = {0};
    getcwd(current_absolute_path, MAX_SIZE);
#endif

    fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
    if (fd < 0) {
        ereport(PANIC,
                (errcode_for_file_access(), errmsg("could not create control file \"%s\": %s", XLOG_CONTROL_FILE, TRANSLATE_ERRNO)));
    }

    if (!(ENABLE_DSS && SS_OFFICIAL_PRIMARY)) {
        /* Plain file: write the whole image, then force it to disk. */
        errno = 0;
        pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE);
        if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE) {
            /* A short write that left errno untouched is reported as out-of-space. */
            if (errno == 0) {
                errno = ENOSPC;
            }
            ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %s", TRANSLATE_ERRNO)));
        }
        pgstat_report_waitevent(WAIT_EVENT_END);

        pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_SYNC);
        if (pg_fsync(fd) != 0) {
            ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %s", TRANSLATE_ERRNO)));
        }
        pgstat_report_waitevent(WAIT_EVENT_END);
    } else {
        /* DSS official primary: the instance control page is written by the SS helper. */
        SSWriteInstanceControlFile(fd, buffer, SS_CTRL_PAGE, PG_CONTROL_SIZE);
    }

    if (close(fd)) {
        ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %s", TRANSLATE_ERRNO)));
    }
}
void ReadControlFile(void)
{
pg_crc32c crc;
int fd = -1;
char *fname = NULL;
bool retry = false;
errno_t errorno = EOK;
#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
char current_absolute_path[MAX_SIZE] = {0};
getcwd(current_absolute_path, MAX_SIZE);
#endif
fname = XLOG_CONTROL_FILE;
loop:
fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %s", fname, TRANSLATE_ERRNO)));
}
if (ENABLE_DSS) {
int read_size = BUFFERALIGN(sizeof(ControlFileData));
char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER)));
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_READ);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (read(fd, buffer, read_size) != read_size) {
LWLockRelease(ControlFileLock);
ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %s", TRANSLATE_ERRNO)));
}
LWLockRelease(ControlFileLock);
pgstat_report_waitevent(WAIT_EVENT_END);
errorno = memcpy_s(t_thrd.shemem_ptr_cxt.ControlFile,
sizeof(ControlFileData), buffer, sizeof(ControlFileData));
securec_check_c(errorno, "\0", "\0");
if (XLByteLE(g_instance.dms_cxt.ckptRedo, t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo)) {
g_instance.dms_cxt.ckptRedo = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
}
} else {
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_READ);
if (read(fd, t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData)) !=
sizeof(ControlFileData)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %s", TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %s", TRANSLATE_ERRNO)));
}
INIT_CRC32C(crc);
COMP_CRC32C(crc, (char *)t_thrd.shemem_ptr_cxt.ControlFile, offsetof(ControlFileData, crc));
FIN_CRC32C(crc);
if (!EQ_CRC32C(crc, t_thrd.shemem_ptr_cxt.ControlFile->crc)) {
if (retry == false) {
ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
fname = XLOG_CONTROL_FILE_BAK;
retry = true;
goto loop;
} else {
ereport(FATAL, (errmsg("incorrect checksum in control file")));
}
}
* Check for expected pg_control format version. If this is wrong, the
* CRC check will likely fail because we'll be checking the wrong number
* of bytes. Complaining about wrong version will probably be more
* enlightening than complaining about wrong CRC.
*/
if (t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version != PG_CONTROL_VERSION &&
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version % 65536 == 0 &&
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version / 65536 != 0) {
ereport(
FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %u (0x%08x),"
" but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version,
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version, PG_CONTROL_VERSION, PG_CONTROL_VERSION),
errhint("This could be a problem of mismatched byte ordering. It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version != PG_CONTROL_VERSION) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %u,"
" but the server was compiled with PG_CONTROL_VERSION %d.",
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version, PG_CONTROL_VERSION),
errhint("It looks like you need to gs_initdb.")));
}
* Do compatibility checking immediately. If the database isn't
* compatible with the backend executable, we want to abort before we can
* possibly do any damage.
*/
if (t_thrd.shemem_ptr_cxt.ControlFile->catalog_version_no != CATALOG_VERSION_NO) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with CATALOG_VERSION_NO %u,"
" but the server was compiled with CATALOG_VERSION_NO %d.",
t_thrd.shemem_ptr_cxt.ControlFile->catalog_version_no, CATALOG_VERSION_NO),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->maxAlign != MAXIMUM_ALIGNOF) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with MAXALIGN %u,"
" but the server was compiled with MAXALIGN %d.",
t_thrd.shemem_ptr_cxt.ControlFile->maxAlign, MAXIMUM_ALIGNOF),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->floatFormat != FLOATFORMAT_VALUE) {
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster appears to use a different floating-point number format than the "
"server executable."),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->blcksz != BLCKSZ) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with BLCKSZ %u"
" but the server was compiled with BLCKSZ %d.",
t_thrd.shemem_ptr_cxt.ControlFile->blcksz, BLCKSZ),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->relseg_size != RELSEG_SIZE) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with RELSEG_SIZE %u,"
" but the server was compiled with RELSEG_SIZE %d.",
t_thrd.shemem_ptr_cxt.ControlFile->relseg_size, RELSEG_SIZE),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->xlog_blcksz != XLOG_BLCKSZ) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with XLOG_BLCKSZ %u,"
" but the server was compiled with XLOG_BLCKSZ %d.",
t_thrd.shemem_ptr_cxt.ControlFile->xlog_blcksz, XLOG_BLCKSZ),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->xlog_seg_size != XLogSegSize) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with XLOG_SEG_SIZE %u,"
" but the server was compiled with XLOG_SEG_SIZE %lu.",
t_thrd.shemem_ptr_cxt.ControlFile->xlog_seg_size, XLogSegSize),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->nameDataLen != NAMEDATALEN) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with NAMEDATALEN %u,"
" but the server was compiled with NAMEDATALEN %d.",
t_thrd.shemem_ptr_cxt.ControlFile->nameDataLen, NAMEDATALEN),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->indexMaxKeys != INDEX_MAX_KEYS) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with INDEX_MAX_KEYS %u"
" but the server was compiled with INDEX_MAX_KEYS %d.",
t_thrd.shemem_ptr_cxt.ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %u,"
" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
t_thrd.shemem_ptr_cxt.ControlFile->toast_max_chunk_size, (int)TOAST_MAX_CHUNK_SIZE),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#ifdef HAVE_INT64_TIMESTAMP
if (t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
" but the server was compiled with HAVE_INT64_TIMESTAMP."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
" but the server was compiled without HAVE_INT64_TIMESTAMP."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
#ifdef USE_FLOAT4_BYVAL
if (t_thrd.shemem_ptr_cxt.ControlFile->float4ByVal != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
" but the server was compiled with USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->float4ByVal != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
" but the server was compiled without USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
#ifdef USE_FLOAT8_BYVAL
if (t_thrd.shemem_ptr_cxt.ControlFile->float8ByVal != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
" but the server was compiled with USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->float8ByVal != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
" but the server was compiled without USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
if (retry == true) {
RecoverControlFile();
}
}
/*
 * Copy ControlFile into a local buffer, then calculate the CRC over that copy
 * and write the copy to pg_control.
 * If all of the work were done on the shared ControlFile directly, there would
 * be a time window between calculating the CRC and writing the file in which
 * another thread could modify ControlFile, leaving an invalid CRC in the file.
 */
void UpdateControlFile(void)
{
if (SS_STANDBY_MODE) {
return;
}
int fd = -1;
int len;
errno_t err = EOK;
char *fname[2];
len = sizeof(ControlFileData);
int write_size;
write_size = (int)BUFFERALIGN(len);
char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = {0};
err = memcpy_s(&buffer, write_size, t_thrd.shemem_ptr_cxt.ControlFile, len);
securec_check(err, "\0", "\0");
INIT_CRC32C(((ControlFileData *)buffer)->crc);
COMP_CRC32C(((ControlFileData *)buffer)->crc, (char *)buffer, offsetof(ControlFileData, crc));
FIN_CRC32C(((ControlFileData *)buffer)->crc);
#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
char current_absolute_path[MAX_SIZE] = {0};
getcwd(current_absolute_path, MAX_SIZE);
#endif
fname[0] = XLOG_CONTROL_FILE_BAK;
fname[1] = XLOG_CONTROL_FILE;
for (int i = 0; i < 2; i++) {
if (i == 0) {
fd = BasicOpenFile(fname[i], O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
} else {
fd = BasicOpenFile(fname[i], O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
}
if (fd < 0) {
if (ENABLE_DSS && g_instance.status == FastShutdown && errno == ERR_DSS_CONNECT_FAILED
&& !g_instance.dms_cxt.SSReformInfo.in_reform) {
ereport(WARNING, (errmsg("could not open control file due to lost connection to DSS,"
" and CM may not have ability to control exit process, so shutdown directly.")));
exit(0);
}
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %s", fname[i], TRANSLATE_ERRNO)));
}
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
if (write(fd, buffer, write_size) != write_size) {
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %s", TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
if (fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %s", TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %s", TRANSLATE_ERRNO)));
}
}
#ifndef ENABLE_MULTIPLE_NODES
* Truncate DCF Log before Min(flushedIdx, minAppliedIdx) to avoid space waste,
* flushedIdx is the DCF index corresponding to flushed XLog LSN,
* minAppliedIdx is used to avoid full build caused by slow DCF follower.
*/
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
DcfLogTruncate();
}
#endif
}
/*
 * When an incorrect checksum is detected in the control file,
 * recover the control file using the content of the backup file.
 */
static void RecoverControlFile(void)
{
int fd;
char buffer[PG_CONTROL_SIZE];
errno_t errorno = EOK;
ereport(WARNING, (errmsg("recover the control file due to incorrect checksum")));
errorno = memset_s(buffer, PG_CONTROL_SIZE, 0, PG_CONTROL_SIZE);
securec_check(errorno, "", "");
errorno = memcpy_s(buffer, PG_CONTROL_SIZE, t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData));
securec_check(errorno, "", "");
fd = BasicOpenFile(XLOG_CONTROL_FILE, O_TRUNC | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("recover failed could not open control file \"%s\": %s", XLOG_CONTROL_FILE, TRANSLATE_ERRNO)));
}
errno = 0;
if (write(fd, &buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE) {
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not write to control file: %s", TRANSLATE_ERRNO)));
}
if (pg_fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not fsync control file: %s", TRANSLATE_ERRNO)));
}
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not close control file: %s", TRANSLATE_ERRNO)));
}
}
/*
 * Returns the unique system identifier from control file.
 */
/* Return the system_identifier stored in the shared-memory ControlFile image. */
uint64 GetSystemIdentifier(void)
{
    const ControlFileData *ctl = t_thrd.shemem_ptr_cxt.ControlFile;

    Assert(ctl != NULL);
    return ctl->system_identifier;
}
/*
 * Store system_identifier in the shared-memory ControlFile image.
 * Only the in-memory image is changed; nothing is written to disk here.
 */
void SetSystemIdentifier(uint64 system_identifier)
{
    ControlFileData *ctl = t_thrd.shemem_ptr_cxt.ControlFile;

    Assert(ctl != NULL);
    ctl->system_identifier = system_identifier;
}
/* Return the ThisTimeLineID recorded in the ControlFile's checkpoint copy. */
TimeLineID GetThisTimeID(void)
{
    const ControlFileData *ctl = t_thrd.shemem_ptr_cxt.ControlFile;

    Assert(ctl != NULL);
    return ctl->checkPointCopy.ThisTimeLineID;
}
/*
 * Store timelineID as ThisTimeLineID in the ControlFile's checkpoint copy.
 * Only the in-memory image is changed; nothing is written to disk here.
 */
void SetThisTimeID(TimeLineID timelineID)
{
    ControlFileData *ctl = t_thrd.shemem_ptr_cxt.ControlFile;

    Assert(ctl != NULL);
    ctl->checkPointCopy.ThisTimeLineID = timelineID;
}
/*
 * Auto-tune the number of XLOG buffers.
 *
 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
 * a maximum of one XLOG segment (there is little reason to think that more
 * is helpful, at least so long as we force an fsync when switching log files)
 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
 * 9.1, when auto-tuning was added).
 *
 * This should not be called until NBuffers has received its final value.
 */
/*
 * Pick the auto-tuned wal_buffers value: NBuffers / 32, limited to the range
 * [8, number of XLOG blocks in one segment]. The lower bound is applied last.
 */
static int XLOGChooseNumBuffers(void)
{
    const int blocksPerSegment = (int)XLogSegSize / XLOG_BLCKSZ;
    int nbuffers = g_instance.attr.attr_storage.NBuffers / 32;

    if (nbuffers > blocksPerSegment) {
        nbuffers = blocksPerSegment;
    }
    if (nbuffers < 8) {
        nbuffers = 8;
    }
    return nbuffers;
}
/*
 * GUC check_hook for wal_buffers
 */
bool check_wal_buffers(int *newval, void **extra, GucSource source)
{
if (*newval == -1) {
* If we haven't yet changed the boot_val default of -1, just let it
* be. We'll fix it when XLOGShmemSize is called.
*/
if (g_instance.attr.attr_storage.XLOGbuffers == -1) {
return true;
}
*newval = XLOGChooseNumBuffers();
}
* We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
* 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
* the case, we just silently treat such values as a request for the
* minimum. (We could throw an error instead, but that doesn't seem very
* helpful.)
*/
if (*newval < 4) {
*newval = 4;
}
return true;
}
/*
 * Initialization of shared memory for XLOG
 */
/*
 * Compute the shared-memory size needed for the "XLOG Ctl" allocation.
 *
 * As a side effect, when wal_buffers is still -1 it is replaced (through
 * SetConfigOption) with the auto-tuned value from XLOGChooseNumBuffers().
 */
Size XLOGShmemSize(void)
{
    /*
     * If the value of wal_buffers is -1, use the preferred auto-tune value.
     * This isn't an amazingly clean place to do this, but we must wait till
     * NBuffers has received its final value, and must do it before using the
     * value of XLOGbuffers to do anything important.
     */
    if (g_instance.attr.attr_storage.XLOGbuffers == -1) {
        char buf[32];
        errno_t errorno = snprintf_s(buf, sizeof(buf), sizeof(buf) - 1, "%d", XLOGChooseNumBuffers());
        securec_check_ss(errorno, "", "");
        SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
    }
    Assert(g_instance.attr.attr_storage.XLOGbuffers > 0);

    /* Read the buffer count only now: the auto-tune step above may have changed it. */
    const int nbuffers = g_instance.attr.attr_storage.XLOGbuffers;

    /* XLogCtl, plus 16 bytes of slack */
    Size size = sizeof(XLogCtlData) + 16;
    /* one XLogRecPtr per buffer */
    size = add_size(size, mul_size(sizeof(XLogRecPtr), nbuffers));
    /* one extra XLOG block */
    size = add_size(size, XLOG_BLCKSZ);
    /* and the page buffers themselves */
    size = add_size(size, mul_size(XLOG_BLCKSZ, nbuffers));

    /*
     * Note: we don't count ControlFileData, it comes out of the "slop factor"
     * added by CreateSharedMemoryAndSemaphores.  This lets us use this
     * routine again below to compute the actual allocation size.
     */
    return size;
}
void XLOGShmemInit(void)
{
bool foundCFile = false;
bool foundDCFData = false;
bool foundXLog = false;
char *allocptr = NULL;
int i;
errno_t errorno = EOK;
int nNumaNodes = g_instance.shmem_cxt.numaNodeNum;
if (!IsUnderPostmaster && g_instance.attr.attr_storage.num_xloginsert_locks % nNumaNodes != 0) {
ereport(FATAL,
(errmsg("XLOGShmemInit num_xloginsert_locks should be multiple of NUMA node number in the system.")));
}
g_instance.wal_cxt.num_locks_in_group = g_instance.attr.attr_storage.num_xloginsert_locks / nNumaNodes;
t_thrd.shemem_ptr_cxt.ControlFile = (ControlFileData *)ShmemInitStruct("Control File", sizeof(ControlFileData),
&foundCFile);
t_thrd.shemem_ptr_cxt.dcfData = NULL;
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
t_thrd.shemem_ptr_cxt.dcfData = (DCFData *)ShmemInitStruct("DCF data", sizeof(DCFData), &foundDCFData);
}
#endif
t_thrd.shemem_ptr_cxt.XLogCtl = (XLogCtlData *)ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
t_thrd.shemem_ptr_cxt.XLogCtl = (XLogCtlData *)TYPEALIGN(16, t_thrd.shemem_ptr_cxt.XLogCtl);
if (foundCFile || foundXLog || foundDCFData) {
Assert(foundCFile && foundXLog);
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
Assert(foundDCFData);
}
#endif
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.WALInsertLocks;
t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks =
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[t_thrd.proc->nodeno];
if (ENABLE_DMS && t_thrd.role == STARTUP && (((SS_STANDBY_PROMOTING || SS_PRIMARY_DEMOTED) &&
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag == true) ||
SS_STANDBY_FAILOVER)) {
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag = false;
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers, 0,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers);
securec_check(errorno, "", "");
ereport(LOG, (errmsg("[SS %s] Successfully reset xlblocks when thrd:%lu with role:%d started",
SS_PERFORMING_SWITCHOVER ? "switchover" : "failover", t_thrd.proc->pid, (int)t_thrd.role)));
}
return;
}
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl, sizeof(XLogCtlData), 0, sizeof(XLogCtlData));
securec_check(errorno, "", "");
* Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
* multiple of the alignment for same, so no extra alignment padding is
* needed here.
*/
allocptr = ((char *)t_thrd.shemem_ptr_cxt.XLogCtl) + sizeof(XLogCtlData);
t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks = (XLogRecPtr *)allocptr;
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers, 0,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers);
securec_check(errorno, "", "");
allocptr += sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers;
WALInsertLockPadded **insertLockGroupPtr = (WALInsertLockPadded **)CACHELINEALIGN(
palloc0(nNumaNodes * sizeof(WALInsertLockPadded *) + PG_CACHE_LINE_SIZE));
#ifdef __USE_NUMA
if (nNumaNodes > 1) {
size_t allocSize = sizeof(WALInsertLockPadded) * g_instance.wal_cxt.num_locks_in_group + PG_CACHE_LINE_SIZE;
for (int i = 0; i < nNumaNodes; i++) {
char *pInsertLock = (char *)numa_alloc_onnode(allocSize, i);
if (pInsertLock == NULL) {
ereport(PANIC, (errmsg("XLOGShmemInit could not alloc memory on node %d", i)));
}
add_numa_alloc_info(pInsertLock, allocSize);
insertLockGroupPtr[i] = (WALInsertLockPadded *)(CACHELINEALIGN(pInsertLock));
}
} else {
#endif
char *pInsertLock = (char *)CACHELINEALIGN(palloc(
sizeof(WALInsertLockPadded) * g_instance.attr.attr_storage.num_xloginsert_locks + PG_CACHE_LINE_SIZE));
insertLockGroupPtr[0] = (WALInsertLockPadded *)(CACHELINEALIGN(pInsertLock));
#ifdef __USE_NUMA
}
#endif
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.WALInsertLocks =
insertLockGroupPtr;
t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks = t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[0];
for (int processorIndex = 0; processorIndex < nNumaNodes; processorIndex++) {
for (i = 0; i < g_instance.wal_cxt.num_locks_in_group; i++) {
LWLockInitialize(&t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.lock,
LWTRANCHE_WAL_INSERT);
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.insertingAt = InvalidXLogRecPtr;
#ifdef __aarch64__
pg_atomic_init_u32(&t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.xlogGroupFirst,
INVALID_PGPROCNO);
#endif
}
}
* Align the start of the page buffers to a full xlog block size boundary.
* This simplifies some calculations in XLOG insertion. It is also required
* for O_DIRECT.
*/
allocptr = (char *)TYPEALIGN(XLOG_BLCKSZ, allocptr);
t_thrd.shemem_ptr_cxt.XLogCtl->pages = allocptr;
MemSet(t_thrd.shemem_ptr_cxt.XLogCtl->pages, 0, (Size)XLOG_BLCKSZ * g_instance.attr.attr_storage.XLOGbuffers);
#ifdef ENABLE_BBOX
if (BBOX_BLACKLIST_XLOG_BUFFER) {
bbox_blacklist_add(XLOG_BUFFER, t_thrd.shemem_ptr_cxt.XLogCtl->pages,
(uint64)XLOG_BLCKSZ * g_instance.attr.attr_storage.XLOGbuffers);
}
#endif
* Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
* in additional info.)
*/
t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck = g_instance.attr.attr_storage.XLOGbuffers - 1;
t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true;
t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRedoDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->SharedHotStandbyActive = false;
t_thrd.shemem_ptr_cxt.XLogCtl->WalWriterSleeping = false;
t_thrd.shemem_ptr_cxt.XLogCtl->xlogFlushPtrForPerRead = InvalidXLogRecPtr;
if (!IsInitdb && !dummyStandbyMode) {
t_thrd.shemem_ptr_cxt.XLogCtl->lastRemovedSegNo = GetOldestXLOGSegNo(t_thrd.proc_cxt.DataDir);
}
#if (!defined __x86_64__) && (!defined __aarch64__)
SpinLockInit(&t_thrd.shemem_ptr_cxt.XLogCtl->Insert.insertpos_lck);
#endif
SpinLockInit(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
InitSharedLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
InitSharedLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
if (IsExtremeRedo()) {
InitSharedLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch);
}
* If we are not in bootstrap mode, pg_control should already exist. Read
* and validate it immediately (see comments in ReadControlFile() for the
* reasons why).
*/
if (!IsBootstrapProcessingMode()) {
ReadControlFile();
}
}
/*
 * Scan the xlog directory for the lexically smallest WAL segment file name and
 * return the segment number immediately preceding it.
 *
 * workingPath: data directory; the directory scanned is workingPath/XLOGDIR,
 *              or SS_XLOGDIR when DSS is enabled.
 *
 * Only entries whose name is exactly XLOGFILENAMELEN upper-case hex digits are
 * considered. Raises ERROR if the directory cannot be opened or no parsable
 * segment name is found, except in an SS Dorado standby cluster, where 0 is
 * returned in the latter case.
 */
static XLogSegNo GetOldestXLOGSegNo(const char *workingPath)
{
    DIR *xlogDir = NULL;
    struct dirent *dirEnt = NULL;
    char xlogDirStr[MAXPGPATH] = {0};
    char oldestXLogFileName[MAXPGPATH] = {0};
    TimeLineID tli = 0;
    uint32 xlogReadLogid = -1;
    uint32 xlogReadLogSeg = -1;
    XLogSegNo segno;
    errno_t rc = EOK;

    if (ENABLE_DSS) {
        rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s", SS_XLOGDIR);
    } else {
        rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s/%s", workingPath, XLOGDIR);
    }
    securec_check_ss(rc, "", "");

    xlogDir = opendir(xlogDirStr);
    if (!xlogDir) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not open xlog dir in GetOldestXLOGSegNo.")));
    }

    while ((dirEnt = readdir(xlogDir)) != NULL) {
        const bool looksLikeSegment = strlen(dirEnt->d_name) == XLOGFILENAMELEN &&
                                      strspn(dirEnt->d_name, "0123456789ABCDEF") == XLOGFILENAMELEN;
        if (!looksLikeSegment) {
            continue;
        }
        if (oldestXLogFileName[0] == '\0' || strcmp(dirEnt->d_name, oldestXLogFileName) < 0) {
            /*
             * strncpy_s returns an errno_t, so it must be checked with
             * securec_check(); securec_check_ss() only suits the sprintf
             * family. The copy is NUL-terminated on success, and the name
             * length was verified above to be XLOGFILENAMELEN.
             */
            rc = strncpy_s(oldestXLogFileName, sizeof(oldestXLogFileName), dirEnt->d_name, XLOGFILENAMELEN);
            securec_check(rc, "", "");
        }
    }

    (void)closedir(xlogDir);

    /* An empty name (no segment found) fails the parse as well. */
    if (sscanf_s(oldestXLogFileName, "%08X%08X%08X", &tli, &xlogReadLogid, &xlogReadLogSeg) != 3) {
        if (SS_DORADO_STANDBY_CLUSTER) {
            return (XLogSegNo)0;
        }

        ereport(ERROR, (errcode_for_file_access(), errmsg("failed to translate name to xlog in GetOldestXLOGSegNo.")));
    }

    /*
     * One less than the oldest existing segment.
     * NOTE(review): this wraps around if the oldest file is segment 0/0 — confirm that cannot occur.
     */
    segno = (uint64)xlogReadLogid * XLogSegmentsPerXLogId + xlogReadLogSeg - 1;

    return segno;
}
/*
 * Scan the xlog directory for the lexically largest WAL segment file name and
 * return its segment number.
 *
 * workingPath: data directory; the directory scanned is workingPath/XLOGDIR,
 *              or SS_XLOGDIR when DSS is enabled.
 *
 * Only entries whose name is exactly XLOGFILENAMELEN upper-case hex digits are
 * considered. Raises ERROR if the directory cannot be opened or no parsable
 * segment name is found.
 */
XLogSegNo GetNewestXLOGSegNo(const char *workingPath)
{
    DIR *xlogDir = NULL;
    struct dirent *dirEnt = NULL;
    char xlogDirStr[MAXPGPATH] = {0};
    char newestXLogFileName[MAXPGPATH] = {0};
    TimeLineID tli = 0;
    uint32 xlogReadLogid = -1;
    uint32 xlogReadLogSeg = -1;
    XLogSegNo segno;
    errno_t rc = EOK;

    if (ENABLE_DSS) {
        rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s", SS_XLOGDIR);
    } else {
        rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s/%s", workingPath, XLOGDIR);
    }
    securec_check_ss(rc, "", "");

    xlogDir = opendir(xlogDirStr);
    if (!xlogDir) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not open xlog dir in GetNewestXLOGSegNo.")));
    }

    while ((dirEnt = readdir(xlogDir)) != NULL) {
        const bool looksLikeSegment = strlen(dirEnt->d_name) == XLOGFILENAMELEN &&
                                      strspn(dirEnt->d_name, "0123456789ABCDEF") == XLOGFILENAMELEN;
        if (!looksLikeSegment) {
            continue;
        }
        if (newestXLogFileName[0] == '\0' || strcmp(dirEnt->d_name, newestXLogFileName) > 0) {
            /*
             * strncpy_s returns an errno_t, so it must be checked with
             * securec_check(); securec_check_ss() only suits the sprintf
             * family. The copy is NUL-terminated on success, and the name
             * length was verified above to be XLOGFILENAMELEN.
             */
            rc = strncpy_s(newestXLogFileName, sizeof(newestXLogFileName), dirEnt->d_name, XLOGFILENAMELEN);
            securec_check(rc, "", "");
        }
    }

    (void)closedir(xlogDir);

    /* An empty name (no segment found) fails the parse as well. */
    if (sscanf_s(newestXLogFileName, "%08X%08X%08X", &tli, &xlogReadLogid, &xlogReadLogSeg) != 3) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("failed to translate name to xlog in GetNewestXLOGSegNo.")));
    }

    segno = (uint64)xlogReadLogid * XLogSegmentsPerXLogId + xlogReadLogSeg;

    return segno;
}
/*
 * Return the hardware (MAC) address of the first non-loopback network
 * interface, packed into the low 48 bits of a uint64 (first octet in the
 * highest of those bits).
 *
 * Returns 0 when no socket can be created, the interface list cannot be
 * fetched, or no non-loopback interface yields a hardware address.
 */
static uint64 GetMACAddr(void)
{
    macaddr mac;
    uint64 macAddr;
    int sockFd = NO_SOCKET;
    struct ifconf ifconfInfo;
    struct ifreq ifreqInfo;
    char *buf = NULL;
    errno_t ss_rc = EOK;
    uint32 i;

    /* Start from an all-zero address so every failure path yields 0. */
    ss_rc = memset_s((void *)&mac, sizeof(macaddr), 0, sizeof(macaddr));
    securec_check(ss_rc, "\0", "\0");

    sockFd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
    if (sockFd != NO_SOCKET) {
        buf = (char *)palloc(MaxMacAddrList * sizeof(ifreq));
        ifconfInfo.ifc_len = MaxMacAddrList * sizeof(ifreq);
        ifconfInfo.ifc_buf = buf;

        if (ioctl(sockFd, SIOCGIFCONF, &ifconfInfo) != -1) {
            struct ifreq *ifrepTmp = ifconfInfo.ifc_req;
            for (i = 0; i < (ifconfInfo.ifc_len / sizeof(struct ifreq)); i++) {
                /*
                 * Bound the copy by the destination's capacity. Passing the
                 * source length as the capacity would defeat strcpy_s's
                 * overflow check.
                 */
                ss_rc = strcpy_s(ifreqInfo.ifr_name, sizeof(ifreqInfo.ifr_name), ifrepTmp->ifr_name);
                securec_check(ss_rc, "\0", "\0");

                if (ioctl(sockFd, SIOCGIFFLAGS, &ifreqInfo) == 0) {
                    if (!(ifreqInfo.ifr_flags & IFF_LOOPBACK)) {
                        if (ioctl(sockFd, SIOCGIFHWADDR, &ifreqInfo) == 0) {
                            mac.a = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[0];
                            mac.b = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[1];
                            mac.c = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[2];
                            mac.d = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[3];
                            mac.e = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[4];
                            mac.f = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[5];
                            /* The first usable interface wins. */
                            break;
                        }
                    }
                }
                ifrepTmp++;
            }
        }
        pfree_ext(buf);
        close(sockFd);
    }

    macAddr = ((uint64)mac.a << 40) | ((uint64)mac.b << 32) | ((uint64)mac.c << 24) | ((uint64)mac.d << 16) |
              ((uint64)mac.e << 8) | (uint64)mac.f;
    return macAddr;
}
/*
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
 */
void BootStrapXLOG(void)
{
CheckPoint checkPoint;
char *buffer = NULL;
XLogPageHeader page;
XLogLongPageHeader longpage;
XLogRecord *record = NULL;
char *recptr = NULL;
bool use_existent = false;
uint64 sysidentifier;
uint64 macAddr;
pg_crc32c crc;
errno_t ret = EOK;
TransactionId latestCompletedXid;
int fd = 0;
int len = 0;
int randomNum;
* Select a hopefully-unique system identifier code for this installation.
* We use the result of GetMACAddr() to make the first 48 bits.
* And the last is a random number from linux system to make sure the difference.
*/
macAddr = GetMACAddr();
sysidentifier = macAddr << 16;
fd = open("/dev/urandom", O_RDONLY, 0);
if (fd == -1) {
ereport(ERROR, (errcode_for_file_access(), errmsg("cann't read a random number from file \"/dev/urandom\".")));
}
if (fd > 0) {
len = read(fd, &randomNum, sizeof(randomNum));
if (len != sizeof(randomNum)) {
close(fd);
ereport(ERROR,
(errcode_for_file_access(), errmsg("cann't read a random number from file \"/dev/urandom\".")));
}
}
close(fd);
sysidentifier |= (uint32)randomNum & 0xFFFF;
ereport(WARNING, (errmsg_internal("macAddr is %u/%u, sysidentifier is %u/%u, randomNum is %u",
(uint32)(macAddr >> 32), (uint32)macAddr, (uint32)(sysidentifier >> 32),
(uint32)sysidentifier, (uint32)randomNum)));
t_thrd.xlog_cxt.ThisTimeLineID = 1;
buffer = (char *)palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
page = (XLogPageHeader)TYPEALIGN(XLOG_BLCKSZ, buffer);
ret = memset_s(page, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(ret, "", "");
* Set up information for the initial checkpoint record
*
* The initial checkpoint record is written to the beginning of the WAL
* segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
* used, so that we can use 0/0 to mean "before any valid WAL segment".
*/
if (ENABLE_INCRE_CKPT) {
u_sess->attr.attr_storage.fullPageWrites = false;
}
checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
checkPoint.ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
checkPoint.fullPageWrites = u_sess->attr.attr_storage.fullPageWrites;
checkPoint.nextXid = FirstNormalTransactionId + 1;
checkPoint.nextOid = FirstBootstrapObjectId;
checkPoint.nextMulti = FirstMultiXactId + 1;
checkPoint.nextMultiOffset = 0;
checkPoint.oldestXid = FirstNormalTransactionId;
checkPoint.oldestXidDB = TemplateDbOid;
checkPoint.time = (pg_time_t)time(NULL);
checkPoint.oldestActiveXid = InvalidTransactionId;
checkPoint.remove_seg = InvalidXLogSegPtr;
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1;
t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo - 1;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
t_thrd.xact_cxt.ShmemVariableCache->xmin = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin = latestCompletedXid;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid);
page->xlp_magic = XLOG_PAGE_MAGIC;
page->xlp_info = XLP_LONG_HEADER;
page->xlp_tli = t_thrd.xlog_cxt.ThisTimeLineID;
page->xlp_pageaddr = XLogSegSize;
longpage = (XLogLongPageHeader)page;
longpage->xlp_sysid = sysidentifier;
longpage->xlp_seg_size = XLogSegSize;
longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
recptr = ((char *)page + SizeOfXLogLongPHD);
record = (XLogRecord *)recptr;
record->xl_prev = 0;
record->xl_xid = InvalidTransactionId;
record->xl_tot_len = CHECKPOINT_LEN;
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
record->xl_rmid = RM_XLOG_ID;
recptr += SizeOfXLogRecord;
*(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
*(recptr++) = sizeof(checkPoint);
ret = memcpy_s(recptr, sizeof(checkPoint), &checkPoint, sizeof(checkPoint));
securec_check(ret, "", "");
recptr += sizeof(checkPoint);
Assert(recptr - (char *)record == record->xl_tot_len);
INIT_CRC32C(crc);
COMP_CRC32C(crc, ((char *)record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
COMP_CRC32C(crc, (char *)record, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(crc);
record->xl_crc = crc;
use_existent = false;
t_thrd.xlog_cxt.openLogFile = XLogFileInit(1, &use_existent, false);
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
if (write(t_thrd.xlog_cxt.openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) {
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write bootstrap transaction log file: %s", TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
if (pg_fsync(t_thrd.xlog_cxt.openLogFile) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync bootstrap transaction log file: %s", TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(t_thrd.xlog_cxt.openLogFile)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close bootstrap transaction log file: %s", TRANSLATE_ERRNO)));
}
t_thrd.xlog_cxt.openLogFile = -1;
if (ENABLE_DSS) {
SSInitReformerControlPages();
}
ret = memset_s(t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData), 0, sizeof(ControlFileData));
securec_check(ret, "", "");
t_thrd.shemem_ptr_cxt.ControlFile->system_identifier = sysidentifier;
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
t_thrd.shemem_ptr_cxt.ControlFile->time = checkPoint.time;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = checkPoint.redo;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = g_instance.shmem_cxt.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = g_instance.attr.attr_storage.wal_level;
t_thrd.shemem_ptr_cxt.ControlFile->bootstrap_segment = EnableInitDBSegment;
WriteControlFile();
if (IS_SHARED_STORAGE_MODE) {
ShareStorageXLogCtl *ctlInfo = g_instance.xlog_cxt.shareStorageXLogCtl;
InitShareStorageCtlInfo(ctlInfo, sysidentifier);
int writeLen = WriteXlogToShareStorage(XLogSegSize, (char *)page, XLOG_BLCKSZ);
if (writeLen != XLOG_BLCKSZ) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("write checkpoint info faild: len:%u, version:%u, start:%lu, end:%lu",
ctlInfo->length, ctlInfo->version, ctlInfo->insertHead, ctlInfo->insertTail)));
}
ctlInfo->insertHead = MAXALIGN((recptr - (char *)page) + XLogSegSize);
ctlInfo->crc = CalShareStorageCtlInfoCrc(ctlInfo);
UpdateShareStorageCtlInfo(ctlInfo);
FsyncXlogToShareStorage();
ereport(LOG, (errcode_for_file_access(),
errmsg("update dorado ctl info: len:%u, version:%u, start:%lu, end:%lu", ctlInfo->length,
ctlInfo->version, ctlInfo->insertHead, ctlInfo->insertTail)));
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
ret = memset_s(t_thrd.shemem_ptr_cxt.dcfData, sizeof(DCFData), 0, sizeof(DCFData));
securec_check(ret, "", "");
t_thrd.shemem_ptr_cxt.dcfData->dcfDataVersion = DCF_DATA_VERSION;
t_thrd.shemem_ptr_cxt.dcfData->appliedIndex = 0;
t_thrd.shemem_ptr_cxt.dcfData->realMinAppliedIdx = 0;
INIT_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc);
COMP_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc, (char *)t_thrd.shemem_ptr_cxt.dcfData, offsetof(DCFData, crc));
FIN_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc);
if (!SaveAppliedIndex()) {
ereport(PANIC, (errmsg("Write paxos index failed!")));
}
}
#endif
MemoryContext mem_cxt = AllocSetContextCreate(INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE),
"DoubleWriteContext", ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT);
MemoryContext old_mem_cxt = MemoryContextSwitchTo(mem_cxt);
dw_bootstrap();
(void)MemoryContextSwitchTo(old_mem_cxt);
BootStrapCLOG();
BootStrapCSNLOG();
BootStrapMultiXact();
pfree_ext(buffer);
}
/*
 * Render 'tnow' as "YYYY-MM-DD HH:MM:SS TZ", broken down by pg_localtime()
 * with log_timezone.
 *
 * The text is written into t_thrd.xlog_cxt.buf and that same buffer is
 * returned, so every call overwrites the result of the previous one.
 */
static char *str_time(pg_time_t tnow)
{
    char *out = t_thrd.xlog_cxt.buf;
    const size_t outSize = sizeof(t_thrd.xlog_cxt.buf);

    pg_strftime(out, outSize, "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&tnow, log_timezone));
    return out;
}
* See if there is a recovery command file (recovery.conf), and if so
* read in parameters for archive recovery and XLOG streaming.
*
* The file is parsed using the main configuration parser.
*/
static void readRecoveryCommandFile(void)
{
FILE *fd = NULL;
TimeLineID rtli = 0;
bool rtliGiven = false;
ConfigVariable *item = NULL;
ConfigVariable *head = NULL;
ConfigVariable *tail = NULL;
fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
if (fd == NULL) {
if (errno == ENOENT) {
return;
}
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not open recovery command file \"%s\": %s", RECOVERY_COMMAND_FILE, TRANSLATE_ERRNO)));
}
* Since we're asking ParseConfigFp() to report errors as FATAL, there's
* no need to check the return value.
*/
(void)ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
(void)FreeFile(fd);
fd = NULL;
for (item = head; item; item = item->next) {
if (strcmp(item->name, "restore_command") == 0 && t_thrd.xlog_cxt.server_mode != STANDBY_MODE) {
t_thrd.xlog_cxt.recoveryRestoreCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("restore_command = '%s'", t_thrd.xlog_cxt.recoveryRestoreCommand)));
} else if (strcmp(item->name, "recovery_end_command") == 0) {
t_thrd.xlog_cxt.recoveryEndCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("recovery_end_command = '%s'", t_thrd.xlog_cxt.recoveryEndCommand)));
} else if (strcmp(item->name, "archive_cleanup_command") == 0) {
t_thrd.xlog_cxt.archiveCleanupCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("archive_cleanup_command = '%s'", t_thrd.xlog_cxt.archiveCleanupCommand)));
} else if (strcmp(item->name, "pause_at_recovery_target") == 0) {
if (ENABLE_DSS) {
ereport(WARNING, (errmsg("pause_at_recovery_target is ignored in shared storage")));
} else {
if (!parse_bool(item->value, &t_thrd.xlog_cxt.recoveryPauseAtTarget)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
}
ereport(DEBUG2, (errmsg_internal("pause_at_recovery_target = '%s'", item->value)));
}
} else if (strcmp(item->name, "recovery_target_timeline") == 0) {
rtliGiven = true;
if (strcmp(item->value, "latest") == 0) {
rtli = 0;
} else {
errno = 0;
rtli = (TimeLineID)strtoul(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_timeline is not a valid number: \"%s\"", item->value)));
}
}
if (rtli) {
ereport(DEBUG2, (errmsg_internal("recovery_target_timeline = %u", rtli)));
} else {
ereport(DEBUG2, (errmsg_internal("recovery_target_timeline = latest")));
}
} else if (strcmp(item->name, "recovery_target_xid") == 0) {
errno = 0;
t_thrd.xlog_cxt.recoveryTargetXid = (TransactionId)pg_strtouint64(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_xid is not a valid number: \"%s\"", item->value)));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_xid = " XID_FMT, t_thrd.xlog_cxt.recoveryTargetXid)));
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_XID;
} else if (strcmp(item->name, "recovery_target_time") == 0) {
* if recovery_target_xid or recovery_target_name specified, then
* this overrides recovery_target_time
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_TIME;
* Convert the time string given by the user to TimestampTz form.
*/
t_thrd.xlog_cxt.recoveryTargetTime =
DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)));
ereport(DEBUG2, (errmsg_internal("recovery_target_time = '%s'",
timestamptz_to_str(t_thrd.xlog_cxt.recoveryTargetTime))));
#ifdef PGXC
} else if (strcmp(item->name, "recovery_target_barrier") == 0) {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_BARRIER;
t_thrd.xlog_cxt.recoveryTargetBarrierId = pstrdup(item->value);
#endif
} else if (strcmp(item->name, "recovery_target_time_obs") == 0) {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_TIME_OBS;
g_instance.roach_cxt.targetTimeInPITR = pstrdup(item->value);
* Convert the time string given by the user to TimestampTz form.
*/
t_thrd.xlog_cxt.obsRecoveryTargetTime = pstrdup(item->value);
ereport(LOG, (errmsg_internal("recovery_target_time_obs = '%s'", t_thrd.xlog_cxt.obsRecoveryTargetTime)));
} else if (strcmp(item->name, "recovery_target_name") == 0) {
* if recovery_target_xid specified, then this overrides
* recovery_target_name
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_NAME;
t_thrd.xlog_cxt.recoveryTargetName = pstrdup(item->value);
if (strlen(t_thrd.xlog_cxt.recoveryTargetName) >= MAXFNAMELEN) {
ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("recovery_target_name is too long (maximum %d characters)", MAXFNAMELEN - 1)));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_name = '%s'", t_thrd.xlog_cxt.recoveryTargetName)));
} else if (strcmp(item->name, "recovery_target_lsn") == 0) {
* if recovery_target_xid or recovery_target_name or recovery_target_time
* specified, then this overrides recovery_target_lsn
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_LSN;
* Convert the LSN string given by the user to XLogRecPtr form.
*/
t_thrd.xlog_cxt.recoveryTargetLSN = DatumGetLSN(DirectFunctionCall3(pg_lsn_in, CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1)));
ereport(DEBUG2,
(errmsg_internal("recovery_target_lsn = '%X/%X'", (uint32)(t_thrd.xlog_cxt.recoveryTargetLSN >> 32),
(uint32)t_thrd.xlog_cxt.recoveryTargetLSN)));
} else if (strcmp(item->name, "recovery_target_inclusive") == 0) {
if (!parse_bool(item->value, &t_thrd.xlog_cxt.recoveryTargetInclusive)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_inclusive = %s", item->value)));
} else if (strcmp(item->name, "standby_mode") == 0) {
if (!parse_bool(item->value, &t_thrd.xlog_cxt.StandbyModeRequested)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
}
ereport(DEBUG2, (errmsg_internal("standby_mode = '%s'", item->value)));
} else if (strcmp(item->name, "primary_conninfo") == 0) {
t_thrd.xlog_cxt.PrimaryConnInfo = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("primary_conninfo = '%s'", t_thrd.xlog_cxt.PrimaryConnInfo)));
} else if (strcmp(item->name, "primary_slotname") == 0) {
ReplicationSlotValidateName(item->value, ERROR);
u_sess->attr.attr_storage.PrimarySlotName = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("primary_slotname = '%s'", u_sess->attr.attr_storage.PrimarySlotName)));
} else if (strcmp(item->name, "trigger_file") == 0) {
t_thrd.xlog_cxt.TriggerFile = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("trigger_file = '%s'", t_thrd.xlog_cxt.TriggerFile)));
} else {
ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", item->name)));
}
}
* Check for compulsory parameters
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
if (t_thrd.xlog_cxt.PrimaryConnInfo == NULL && t_thrd.xlog_cxt.recoveryRestoreCommand == NULL) {
ereport(WARNING,
(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
RECOVERY_COMMAND_FILE),
errhint("The database server will regularly poll the pg_xlog subdirectory to check for files "
"placed there.")));
}
} else {
if (t_thrd.xlog_cxt.recoveryRestoreCommand == NULL && XLogInsertAllowed()) {
ereport(
FATAL,
(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
RECOVERY_COMMAND_FILE)));
}
}
t_thrd.xlog_cxt.ArchiveRestoreRequested = true;
* If user specified recovery_target_timeline, validate it or compute the
* "latest" value. We can't do this until after we've gotten the restore
* command and set InArchiveRecovery, because we need to fetch timeline
* history files from the archive.
*/
if (rtliGiven) {
if (rtli) {
if (rtli != 1 && !existsTimeLineHistory(rtli)) {
ereport(FATAL, (errmsg("recovery target timeline %u does not exist", rtli)));
}
t_thrd.xlog_cxt.recoveryTargetTLI = rtli;
t_thrd.xlog_cxt.recoveryTargetIsLatest = false;
} else {
t_thrd.xlog_cxt.recoveryTargetTLI = findNewestTimeLine(t_thrd.xlog_cxt.recoveryTargetTLI);
t_thrd.xlog_cxt.recoveryTargetIsLatest = true;
}
}
FreeConfigVariables(head);
}
/*
 * Close the descriptor held in t_thrd.xlog_cxt.readFile, if any, and mark it
 * as closed by storing -1.  Does nothing when the stored value is negative.
 */
void close_readFile_if_open()
{
    if (t_thrd.xlog_cxt.readFile < 0) {
        return;
    }

    close(t_thrd.xlog_cxt.readFile);
    t_thrd.xlog_cxt.readFile = -1;
}
* Exit archive-recovery state
*/
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
{
struct stat statbuf;
char recoveryPath[MAXPGPATH];
char xlogpath[MAXPGPATH];
errno_t errorno = EOK;
t_thrd.xlog_cxt.InArchiveRecovery = false;
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
* If the ending log segment is still open, close it (to avoid problems on
* Windows with trying to rename or delete an open file).
*/
close_readFile_if_open();
* If we are establishing a new timeline, we have to copy data from the
* last WAL segment of the old timeline to create a starting WAL segment
* for the new timeline.
*
* Notify the archiver that the last WAL segment of the old timeline is
* ready to copy to archival storage. Otherwise, it is not archived for a
* while.
*/
if (endTLI != t_thrd.xlog_cxt.ThisTimeLineID) {
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, endTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
errorno = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", SS_XLOGDIR, gs_thread_self());
securec_check_ss(errorno, "\0", "\0");
XLogFileCopy(path, tmppath);
* Now move the segment into place with its final name.
*/
if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false, SS_XLOGDIR))
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("InstallXLogFileSegment should not have failed")));
if (XLogArchivingActive()) {
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", endTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveNotify(xlogpath);
}
}
* Let's just make real sure there are not .ready or .done flags posted
* for the new segment.
*/
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveCleanup(xlogpath);
errorno = snprintf_s(recoveryPath, MAXPGPATH, MAXPGPATH - 1, "%s/RECOVERYXLOG", SS_XLOGDIR);
securec_check_ss(errorno, "", "");
unlink(recoveryPath);
errorno = snprintf_s(recoveryPath, MAXPGPATH, MAXPGPATH - 1, "%s/RECOVERYHISTORY", SS_XLOGDIR);
securec_check_ss(errorno, "", "");
unlink(recoveryPath);
if (stat(RECOVERY_COMMAND_FILE, &statbuf) != 0) {
ereport(LOG, (errmsg("archive recovery complete")));
return;
}
* Rename the config file out of the way, so that we don't accidentally
* re-enter archive recovery mode in a subsequent crash.
*/
unlink(RECOVERY_COMMAND_DONE);
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
ereport(LOG, (errmsg("archive recovery complete")));
}
* Remove WAL files that are not part of the given timeline's history.
*
* This is called during recovery, whenever we switch to follow a new
* timeline, and at the end of recovery when we create a new timeline. We
* wouldn't otherwise care about extra WAL files lying in pg_xlog, but they
* can be pre-allocated or recycled WAL segments on the old timeline that we
* haven't used yet, and contain garbage. If we just leave them in pg_xlog,
* they will eventually be archived, and we can't let that happen. Files that
* belong to our timeline history are valid, because we have successfully
* replayed them, but from others we can't be sure.
*
* 'switchpoint' is the current point in WAL where we switch to new timeline,
* and 'newTLI' is the new timeline we switch to.
*/
static void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char switchseg[MAXFNAMELEN];
XLogSegNo endLogSegNo;
errno_t errorno = EOK;
XLByteToPrevSeg(switchpoint, endLogSegNo);
xldir = AllocateDir(SS_XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %s",
SS_XLOGDIR, TRANSLATE_ERRNO)));
}
errorno = snprintf_s(switchseg, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", newTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
ereport(DEBUG2, (errmsg("attempting to remove WAL segments newer than log file %s", switchseg)));
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
* Remove files that are on a timeline older than the new one we're
* switching to, but with a segment number >= the first segment on
* the new timeline.
*/
if (strncmp(xlde->d_name, switchseg, 8) < 0 && strcmp(xlde->d_name + 8, switchseg + 8) > 0) {
* If the file has already been marked as .ready, however, don't
* remove it yet. It should be OK to remove it - files that are
* not part of our timeline history are not required for recovery
* - but seems safer to let them be archived and removed later.
*/
if (!XLogArchiveIsReady(xlde->d_name)) {
RemoveXlogFile(xlde->d_name, switchpoint);
}
}
}
FreeDir(xldir);
xldir = NULL;
}
/*
 * Sanity check performed before XLOG is truncated at 'endRecPtr'.
 *
 * After calling ProcTxnWorkLoad(true), raises FATAL when any of these holds:
 *   - minRecoveryPoint is invalid,
 *   - endRecPtr lies before minRecoveryPoint,
 *   - the control file still carries a valid backupStartPoint.
 */
void CheckBeforeTruncateXLog(XLogRecPtr endRecPtr)
{
    const uint32 hiShift = 32;

    ProcTxnWorkLoad(true);

    /* Single short-circuit expression: later operands are read only if the earlier ones are false. */
    const bool truncateUnsafe = XLogRecPtrIsInvalid(t_thrd.xlog_cxt.minRecoveryPoint) ||
                                XLByteLT(endRecPtr, t_thrd.xlog_cxt.minRecoveryPoint) ||
                                XLogRecPtrIsValid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint);
    if (!truncateUnsafe) {
        return;
    }

    ereport(FATAL, (errmsg("truncate xlog LSN is before consistent recovery point WAL location (LSN) %X/%X",
                           (uint32)(endRecPtr >> hiShift), (uint32)endRecPtr)));
}
* @Description: After date restore, truncate XLOG after barrier lsn of standby DN.
* off - The offset of the XLOG log needs to be cleared .
*/
void TruncateAndRemoveXLogForRoachRestore(XLogReaderState *record)
{
CheckBeforeTruncateXLog(record->EndRecPtr);
ShutdownWalRcv();
uint32 xlogOff;
XLogSegNo xlogsegno;
char xlogFileName[1024] = {0};
XLByteToSeg(record->EndRecPtr, xlogsegno);
XLogFileName(xlogFileName, MAXFNAMELEN, record->readPageTLI, xlogsegno);
xlogOff = (uint32)(record->EndRecPtr) % XLogSegSize;
int fd = 0;
char *writeContent = NULL;
uint32 truncateLength = 0;
char XLogFilePath[MAX_PATH_LEN] = {0};
errno_t rc = EOK;
rc = snprintf_s(XLogFilePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s", SS_XLOGDIR, xlogFileName);
securec_check_ss(rc, "", "");
if (xlogOff > 0) {
fd = BasicOpenFile(XLogFilePath,
O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\"", XLogFilePath)));
}
if (lseek(fd, (off_t)xlogOff, SEEK_SET) < 0) {
fprintf(stderr, "lseek error !\n");
close(fd);
ereport(ERROR, (errcode_for_file_access(), errmsg("lseek file error \"%s\" ", XLogFilePath)));
return;
}
truncateLength = XLogSegSize - xlogOff;
writeContent = (char *)palloc0(truncateLength);
if (write(fd, writeContent, truncateLength) != truncateLength) {
close(fd);
pfree(writeContent);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\" ", XLogFilePath)));
return;
}
if (fsync(fd) != 0) {
close(fd);
pfree(writeContent);
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\" ", XLogFilePath)));
return;
}
if (close(fd)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\" ", XLogFilePath)));
}
pfree(writeContent);
}
if (((IS_OBS_DISASTER_RECOVER_MODE == false) && (IS_MULTI_DISASTER_RECOVER_MODE == false)) ||
(t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME_OBS))
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
DIR *xldir = NULL;
struct dirent *xlde = NULL;
xldir = AllocateDir(SS_XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open transaction log directory \"%s\": %s", SS_XLOGDIR, TRANSLATE_ERRNO)));
}
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
if (strcmp(xlde->d_name, xlogFileName) > 0 || (xlogOff == 0 && strcmp(xlde->d_name, xlogFileName) == 0)) {
rc = snprintf_s(XLogFilePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s", SS_XLOGDIR, xlde->d_name);
securec_check_ss(rc, "", "");
rc = unlink(XLogFilePath);
if (rc != 0) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not remove old transaction log file \"%s\": %s", XLogFilePath, TRANSLATE_ERRNO)));
}
}
}
FreeDir(xldir);
return;
}
/*
 * Returns true when the record's resource manager is none of RM_XACT_ID,
 * RM_BARRIER_ID and RM_XLOG_ID, and false for those three.
 *
 * Note the polarity: recoveryStopsHere() returns false straight away when
 * this function returns true.
 */
static bool xlogCanStop(XLogReaderState *record)
{
    switch (XLogRecGetRmid(record)) {
        case RM_XACT_ID:
        case RM_BARRIER_ID:
        case RM_XLOG_ID:
            return false;
        default:
            return true;
    }
}
#ifndef ENABLE_LITE_MODE
static void DropAllRecoverySlotForPitr()
{
List *all_archive_slots = NIL;
all_archive_slots = GetAllRecoverySlotsName();
if (all_archive_slots == NIL || all_archive_slots->length == 0) {
list_free_deep(all_archive_slots);
return;
}
foreach_cell(cell, all_archive_slots) {
char* slotname = (char*)lfirst(cell);
char path[MAXPGPATH];
struct stat stat_buf;
int nRet = snprintf_s(path, sizeof(path), MAXPGPATH - 1, "pg_replslot/%s", slotname);
securec_check_ss(nRet, "\0", "\0");
ReplicationSlotDrop(slotname, false);
if (stat(path, &stat_buf) != 0 && errno == ENOENT) {
ereport(LOG, (errmsg("last step in PITR recovery, delete recovery slot %s successfully.", slotname)));
} else {
ereport(WARNING, (errmsg("last step in PITR recovery, delete recovery slot %s failed.", slotname)));
}
}
list_free_deep(all_archive_slots);
return;
}
#endif
/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG at or after the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
 * *includeThis is set TRUE if we should apply this record before stopping.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT
 * record in XLogCtl->recoveryLastXTime, for logging purposes.
 * Also, some information is saved in recoveryStopXid et al for use in
 * annotating the new timeline's history file.
 */
static bool recoveryStopsHere(XLogReaderState *record, bool *includeThis)
{
bool stopsHere = false;
char recordRPName[MAXFNAMELEN] = {0};
uint8 record_info;
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
errno_t rc = EOK;
#ifdef PGXC
bool stopsAtThisBarrier = false;
#endif
#ifdef PGXC
TimestampTz recordXtime = 0;
#else
TimestampTz recordXtime;
#endif
char *recordBarrierId = NULL;
char *stopBarrierId = NULL;
if (xlogCanStop(record)) {
return false;
}
record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
#ifdef ENABLE_MULTIPLE_NODES
if (XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
if (XLogRecGetRmid(record) == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT) {
xl_xact_commit_compact *recordXactCommitData = (xl_xact_commit_compact *)XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
} else if (XLogRecGetRmid(record) == RM_XACT_ID && record_info == XLOG_XACT_COMMIT) {
xl_xact_commit *recordXactCommitData = (xl_xact_commit *)XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
} else if (XLogRecGetRmid(record) == RM_XACT_ID &&
(record_info == XLOG_XACT_ABORT || record_info == XLOG_XACT_ABORT_WITH_XID)) {
xl_xact_abort *recordXactAbortData = (xl_xact_abort *)XLogRecGetData(record);
recordXtime = recordXactAbortData->xact_time;
}
#ifdef ENABLE_MULTIPLE_NODES
}
#endif
else if (XLogRecGetRmid(record) == RM_BARRIER_ID) {
#ifndef ENABLE_LITE_MODE
#ifdef ENABLE_OBS
if (record_info == XLOG_BARRIER_CREATE) {
recordBarrierId = (char *)XLogRecGetData(record);
ereport(u_sess->attr.attr_storage.HaModuleDebug ? LOG : DEBUG4,
(errmsg("processing barrier xlog record for %s", recordBarrierId)));
if (IS_OBS_DISASTER_RECOVER_MODE && t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)
update_recovery_barrier();
}
#else
FEATURE_ON_LITE_MODE_NOT_SUPPORTED();
#endif
#endif
} else if (XLogRecGetRmid(record) == RM_XLOG_ID) {
if (record_info == XLOG_RESTORE_POINT) {
xl_restore_point *recordRestorePointData = (xl_restore_point *)XLogRecGetData(record);
recordXtime = recordRestorePointData->rp_time;
rc = strncpy_s(recordRPName, MAXFNAMELEN, recordRestorePointData->rp_name, MAXFNAMELEN - 1);
securec_check(rc, "", "");
}
} else {
return false;
}
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_LSN &&
record->ReadRecPtr >= t_thrd.xlog_cxt.recoveryTargetLSN) {
*includeThis = t_thrd.xlog_cxt.recoveryTargetInclusive;
t_thrd.xlog_cxt.recoveryStopAfter = *includeThis;
t_thrd.xlog_cxt.recoveryStopXid = InvalidTransactionId;
t_thrd.xlog_cxt.recoveryStopLSN = record->ReadRecPtr;
t_thrd.xlog_cxt.recoveryStopTime = 0;
t_thrd.xlog_cxt.recoveryStopName[0] = '\0';
if (t_thrd.xlog_cxt.recoveryStopAfter) {
ereport(LOG,
(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryStopLSN >> 32), (uint32)t_thrd.xlog_cxt.recoveryStopLSN)));
} else {
ereport(LOG,
(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryStopLSN >> 32), (uint32)t_thrd.xlog_cxt.recoveryStopLSN)));
}
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
}
return true;
}
if ((IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE) &&
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET) {
if (strlen((char *)walrcv->recoveryStopBarrierId) == 0 &&
strlen((char *)walrcv->recoverySwitchoverBarrierId) == 0) {
if (XLogRecGetRmid(record) == RM_XACT_ID) {
SetLatestXTime(recordXtime);
}
return false;
} else {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_BARRIER;
}
}
#ifdef ENABLE_MULTIPLE_NODES
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
#else
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID && XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
* There can be only one transaction end record with this exact
* transactionid
*
* when testing for an xid, we MUST test for equality only, since
* transactions are numbered in the order they start, not the order
* they complete. A higher numbered xid will complete before you about
* 50% of the time...
*/
stopsHere = (XLogRecGetXid(record) == t_thrd.xlog_cxt.recoveryTargetXid);
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
*includeThis = t_thrd.xlog_cxt.recoveryTargetInclusive;
}
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME_OBS) {
#ifndef ENABLE_LITE_MODE
stopsHere = false;
if (XLogRecGetRmid(record) == RM_XACT_ID && !g_instance.roach_cxt.isGtmFreeCsn) {
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
if (g_instance.roach_cxt.globalBarrierRecordForPITR != NULL && (info == XLOG_XACT_COMMIT ||
info == XLOG_XACT_COMMIT_COMPACT || info == XLOG_XACT_COMMIT_PREPARED)) {
char* token = NULL;
char* lastPtr = NULL;
uint64 targetCsn = InvalidCommitSeqNo;
uint64 currCsn = InvalidCommitSeqNo;
long ts = 0;
if (strncmp(g_instance.roach_cxt.globalBarrierRecordForPITR, "csn_", 4) == 0) {
token = strtok_s(g_instance.roach_cxt.globalBarrierRecordForPITR, "-", &lastPtr);
if (sscanf_s(token, "csn_%021lu_%013ld", &targetCsn, &ts) != 2) {
ereport(PANIC, (errmsg("recovery target csn could not find!")));
}
if (info == XLOG_XACT_COMMIT) {
xl_xact_commit* xlrec = (xl_xact_commit *)XLogRecGetData(record);
currCsn = xlrec->csn;
} else if (info == XLOG_XACT_COMMIT_COMPACT) {
xl_xact_commit_compact* xlrec = (xl_xact_commit_compact *)XLogRecGetData(record);
currCsn = xlrec->csn;
} else if (info == XLOG_XACT_COMMIT_PREPARED) {
xl_xact_commit_prepared* xlrec = (xl_xact_commit_prepared *)XLogRecGetData(record);
currCsn = xlrec->crec.csn;
} else {
ereport(PANIC, (errmsg("recovery target csn xact content error!")));
}
if (currCsn > targetCsn) {
ereport(LOG, (errmsg("For PITR recovery, reach target csn %lu, current csn is %lu,"
" and current xlog location is %08X/%08X.", targetCsn, currCsn,
(uint32)(record->EndRecPtr >> 32), (uint32)record->EndRecPtr)));
stopsHere = true;
*includeThis = true;
uint32 xlogOff;
pfree_ext(g_instance.roach_cxt.globalBarrierRecordForPITR);
pfree_ext(g_instance.roach_cxt.targetRestoreTimeFromMedia);
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
DropAllRecoverySlotForPitr();
}
}
}
} else if ((XLogRecGetRmid(record) == RM_BARRIER_ID) && (record_info == XLOG_BARRIER_CREATE)) {
ereport(DEBUG2, (errmsg("checking if barrier record matches the target obs time")));
Assert(t_thrd.xlog_cxt.obsRecoveryTargetTime != NULL);
Assert(recordBarrierId != NULL);
if (t_thrd.xlog_cxt.obsRecoveryTargetTime == NULL || recordBarrierId == NULL) {
ereport(PANIC,
(errmsg("recovery target is RECOVERY_TARGET_TIME_OBS, but obsRecoveryTargetTime not set")));
}
if (strncmp(recordBarrierId, "hadr_", 5) == 0 || strncmp(recordBarrierId, "csn_", 4) == 0) {
char* barrierTime = strrchr(recordBarrierId, '_');
barrierTime += 1;
if (g_instance.roach_cxt.targetRestoreTimeFromMedia != NULL &&
strcmp(g_instance.roach_cxt.targetRestoreTimeFromMedia, barrierTime) == 0) {
stopsAtThisBarrier = true;
*includeThis = true;
uint32 xlogOff;
ereport(u_sess->attr.attr_storage.HaModuleDebug ? LOG : DEBUG4,
(errmsg("stop barrier success xlog record for %s", recordBarrierId)));
pfree_ext(g_instance.roach_cxt.globalBarrierRecordForPITR);
pfree_ext(g_instance.roach_cxt.targetRestoreTimeFromMedia);
if (IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE) {
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
DropAllRecoverySlotForPitr();
}
}
}
}
#else
FEATURE_ON_LITE_MODE_NOT_SUPPORTED();
#endif
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER) {
stopsHere = false;
if ((XLogRecGetRmid(record) == RM_BARRIER_ID) && (record_info == XLOG_BARRIER_CREATE)) {
ereport(DEBUG2, (errmsg("checking if barrier record matches the target barrier")));
if ((t_thrd.xlog_cxt.recoveryTargetBarrierId != NULL && recordBarrierId != NULL &&
strcmp(t_thrd.xlog_cxt.recoveryTargetBarrierId, recordBarrierId) == 0) ||
(recordBarrierId != NULL && strcmp((char *)walrcv->recoveryStopBarrierId, recordBarrierId) == 0)) {
stopsAtThisBarrier = true;
*includeThis = true;
uint32 xlogOff;
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE || IS_OBS_DISASTER_RECOVER_MODE) {
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
}
} else if (recordBarrierId != NULL &&
strcmp((char *)walrcv->recoverySwitchoverBarrierId, recordBarrierId) == 0) {
stopsAtThisBarrier = true;
ereport(LOG, (errmsg("reach switchover barrierID %s", (char *)walrcv->recoverySwitchoverBarrierId)));
}
}
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
* There can be many restore points that share the same name, so we
* stop at the first one
*/
stopsHere = (strcmp(recordRPName, t_thrd.xlog_cxt.recoveryTargetName) == 0);
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
}
*includeThis = false;
#ifdef ENABLE_MULTIPLE_NODES
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
#else
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME && XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
* There can be many transactions that share the same commit time, so
* we stop after the last one, if we are inclusive, or stop at the
* first one if we are exclusive
*/
if (t_thrd.xlog_cxt.recoveryTargetInclusive) {
stopsHere = (recordXtime > t_thrd.xlog_cxt.recoveryTargetTime);
} else {
stopsHere = (recordXtime >= t_thrd.xlog_cxt.recoveryTargetTime);
}
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
*includeThis = false;
}
}
if (stopsHere) {
t_thrd.xlog_cxt.recoveryStopXid = XLogRecGetXid(record);
t_thrd.xlog_cxt.recoveryStopTime = recordXtime;
t_thrd.xlog_cxt.recoveryStopAfter = *includeThis;
t_thrd.xlog_cxt.recoveryStopLSN = InvalidXLogRecPtr;
if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT) {
if (t_thrd.xlog_cxt.recoveryStopAfter)
ereport(LOG, (errmsg("recovery stopping after commit of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
else
ereport(LOG, (errmsg("recovery stopping before commit of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
} else if (record_info == XLOG_XACT_ABORT || record_info == XLOG_XACT_ABORT_WITH_XID) {
if (t_thrd.xlog_cxt.recoveryStopAfter)
ereport(LOG, (errmsg("recovery stopping after abort of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
else
ereport(LOG, (errmsg("recovery stopping before abort of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
} else {
rc = strncpy_s(t_thrd.xlog_cxt.recoveryStopName, MAXFNAMELEN, recordRPName, MAXFNAMELEN - 1);
securec_check(rc, "", "");
ereport(LOG, (errmsg("recovery stopping at restore point \"%s\", time %s", t_thrd.xlog_cxt.recoveryStopName,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
}
* Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
* restore point since they are timestamped, though the latest
* transaction time is not updated.
*/
if (XLogRecGetRmid(record) == RM_XACT_ID && t_thrd.xlog_cxt.recoveryStopAfter) {
SetLatestXTime(recordXtime);
}
#ifdef PGXC
} else if (stopsAtThisBarrier) {
t_thrd.xlog_cxt.recoveryStopTime = recordXtime;
if (strlen((char *)walrcv->recoveryStopBarrierId) != 0) {
stopBarrierId = (char *)walrcv->recoveryStopBarrierId;
} else {
stopBarrierId = (char *)walrcv->recoverySwitchoverBarrierId;
}
ereport(LOG, (errmsg("recovery stopping at barrier %s, time %s",
IS_OBS_DISASTER_RECOVER_MODE ? stopBarrierId : t_thrd.xlog_cxt.recoveryTargetBarrierId,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
return true;
#endif
} else if (XLogRecGetRmid(record) == RM_XACT_ID) {
SetLatestXTime(recordXtime);
}
return stopsHere;
}
bool RecoveryIsSuspend(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool recoverySuspend = false;
SpinLockAcquire(&xlogctl->info_lck);
recoverySuspend = xlogctl->recoverySusPend;
SpinLockRelease(&xlogctl->info_lck);
return recoverySuspend;
}
void SetRecoverySuspend(bool recoverySuspend)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->recoverySusPend = recoverySuspend;
SpinLockRelease(&xlogctl->info_lck);
}
/*
 * Wait until shared recoveryPause flag is cleared.
 *
 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
 * Probably not worth the trouble though. This state shouldn't be one that
 * anyone cares about server power consumption in.
 */
static void recoveryPausesHere(void)
{
    /* Nothing is done unless LocalHotStandbyActive is set for this thread. */
    if (t_thrd.xlog_cxt.LocalHotStandbyActive) {
        ereport(LOG, (errmsg("recovery has paused"), errhint("Execute pg_xlog_replay_resume() to continue.")));

        /* Re-check the shared pause flag after each nap, running the redo interrupt callback every round. */
        for (;;) {
            if (!RecoveryIsPaused()) {
                break;
            }
            pg_usleep(1000000L);
            RedoInterruptCallBack();
        }
    }
}
bool RecoveryIsPaused(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool recoveryPause = false;
SpinLockAcquire(&xlogctl->info_lck);
recoveryPause = xlogctl->recoveryPause;
SpinLockRelease(&xlogctl->info_lck);
return recoveryPause;
}
void SetRecoveryPause(bool recoveryPause)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->recoveryPause = recoveryPause;
SpinLockRelease(&xlogctl->info_lck);
}
#ifndef ENABLE_MULTIPLE_NODES
/*
 * Decide whether delayed WAL apply should be considered at all.
 *
 * Returns true only when recovery_min_apply_delay is positive, consistency
 * has been reached and archive recovery was requested.
 */
static bool CheckApplyDelayReady(void)
{
    const bool delayConfigured = (u_sess->attr.attr_storage.recovery_min_apply_delay > 0);

    return delayConfigured && t_thrd.xlog_cxt.reachedConsistency && t_thrd.xlog_cxt.ArchiveRecoveryRequested;
}
static void KeepWalrecvAliveWhenRecoveryDelay()
{
static TimestampTz lastTestWalrecvTime = (TimestampTz)0;
TimestampTz now = GetCurrentTimestamp();
if (!TimestampDifferenceExceeds(lastTestWalrecvTime, now, 1000)) {
return;
}
lastTestWalrecvTime = now;
if (WalRcvInProgress()) {
return;
}
if (g_instance.pid_cxt.BarrierPreParsePID != 0) {
return;
}
g_instance.csn_barrier_cxt.pre_parse_started = false;
}
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool RecoveryApplyDelay(const XLogReaderState *record)
{
uint8 xactInfo;
TimestampTz xtime;
long secs;
int microsecs;
long waitTime = 0;
if (!CheckApplyDelayReady()) {
return false;
}
KeepWalrecvAliveWhenRecoveryDelay();
* Is it a COMMIT record?
* We deliberately choose not to delay aborts since they have no effect on
* MVCC. We already allow replay of records that don't have a timestamp,
* so there is already opportunity for issues caused by early conflicts on
* standbys
*/
if (XLogRecGetRmid(record) != RM_XACT_ID) {
return false;
}
xactInfo = XLogRecGetInfo(record) & (~XLR_INFO_MASK);
if ((xactInfo == XLOG_XACT_COMMIT) || (xactInfo == XLOG_STANDBY_CSN_COMMITTING)) {
xtime = ((xl_xact_commit *)XLogRecGetData(record))->xact_time;
} else if (xactInfo == XLOG_XACT_COMMIT_COMPACT) {
xtime = ((xl_xact_commit_compact *)XLogRecGetData(record))->xact_time;
} else {
return false;
}
u_sess->attr.attr_storage.recoveryDelayUntilTime =
TimestampTzPlusMilliseconds(xtime, u_sess->attr.attr_storage.recovery_min_apply_delay);
TimestampDifference(GetCurrentTimestamp(), u_sess->attr.attr_storage.recoveryDelayUntilTime, &secs, µsecs);
if (secs <= 0 && microsecs <= 0) {
return false;
}
while (true) {
ResetRecoveryDelayLatch();
RedoInterruptCallBack();
KeepWalrecvAliveWhenRecoveryDelay();
if (CheckForFailoverTrigger() || CheckForSwitchoverTrigger() || CheckForStandbyTrigger()) {
break;
}
u_sess->attr.attr_storage.recoveryDelayUntilTime =
TimestampTzPlusMilliseconds(xtime, u_sess->attr.attr_storage.recovery_min_apply_delay);
TimestampDifference(GetCurrentTimestamp(), u_sess->attr.attr_storage.recoveryDelayUntilTime, &secs, µsecs);
ProcTxnWorkLoad(false);
if (secs <= 0 && microsecs / 1000 <= 0) {
break;
}
waitTime = (secs >= 1) ? 1000 : (microsecs / 1000);
WaitRecoveryDelayLatch(waitTime);
}
return true;
}
#endif
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process. Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 */
void SetLatestXTime(TimestampTz xtime)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    /* recoveryLastXTime lives in shared XLogCtl, so it is written under info_lck. */
    SpinLockAcquire(&ctl->info_lck);
    ctl->recoveryLastXTime = xtime;
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Fetch timestamp of latest processed commit/abort record.
 */
TimestampTz GetLatestXTime(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    TimestampTz latest;

    /* Copy the shared value out while holding info_lck. */
    SpinLockAcquire(&ctl->info_lck);
    latest = ctl->recoveryLastXTime;
    SpinLockRelease(&ctl->info_lck);

    return latest;
}
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
static void SetCurrentChunkStartTime(TimestampTz xtime)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    /* currentChunkStartTime lives in shared XLogCtl, so it is written under info_lck. */
    SpinLockAcquire(&ctl->info_lck);
    ctl->currentChunkStartTime = xtime;
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Fetch the start timestamp stored for the current chunk of WAL records
 * (the value saved by SetCurrentChunkStartTime).
 * Startup process maintains an accurate local copy in XLogReceiptTime
 */
TimestampTz GetCurrentChunkReplayStartTime(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    TimestampTz chunkStart;

    /* Copy the shared value out while holding info_lck. */
    SpinLockAcquire(&ctl->info_lck);
    chunkStart = ctl->currentChunkStartTime;
    SpinLockRelease(&ctl->info_lck);

    return chunkStart;
}
/*
 * Record the receipt time of the current chunk of XLOG data in this thread's
 * xlog context.
 *
 * time: value stored into t_thrd.xlog_cxt.XLogReceiptTime.
 */
void SetXLogReceiptTime(TimestampTz time)
{
    t_thrd.xlog_cxt.XLogReceiptTime = time;
}
/*
 * Returns time of receipt of current chunk of XLOG data, as well as
 * whether it was received from streaming replication or from archives.
 */
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
    /*
     * This must be executed in the startup process, since we don't export the
     * relevant state to shared memory.
     */
    Assert(t_thrd.xlog_cxt.InRecovery);

    /* Both outputs come from this thread's xlog context. */
    *rtime = t_thrd.xlog_cxt.XLogReceiptTime;
    *fromStream = (t_thrd.xlog_cxt.XLogReceiptSource == XLOG_FROM_STREAM);
}
/*
 * Return the XLOG receipt time stored in this thread's xlog context,
 * without the source information that GetXLogReceiptTime() also reports.
 */
TimestampTz GetXLogReceiptTimeOnly()
{
    const TimestampTz receiptTime = t_thrd.xlog_cxt.XLogReceiptTime;

    return receiptTime;
}
/*
 * Record where the current chunk of XLOG data came from.
 *
 * source: value stored into t_thrd.xlog_cxt.XLogReceiptSource
 *         (compared against XLOG_FROM_STREAM by GetXLogReceiptTime).
 */
void SetXLogReceiptSource(int source)
{
    t_thrd.xlog_cxt.XLogReceiptSource = source;
}
/*
 * Return the XLOG receipt source stored in this thread's xlog context.
 */
int GetXLogReceiptSource()
{
    const int receiptSource = t_thrd.xlog_cxt.XLogReceiptSource;

    return receiptSource;
}
/*
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) do { \
if ((currValue) < (minValue)) { \
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
errmsg("hot standby is not possible because " \
"%s = %d is a lower setting than on the master server " \
"(its value was %d)", \
param_name, currValue, minValue), \
errhint("You might need to increase the value of %s ", param_name))); \
} \
} while (0)
/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 */
static void CheckRequiredParameterValues(bool DBStateShutdown)
{
* For archive recovery, the WAL must be generated with at least 'archive'
* wal_level.
*/
if (t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.shemem_ptr_cxt.ControlFile->wal_level == WAL_LEVEL_MINIMAL) {
ereport(WARNING,
(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
}
* For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
* we must have at least as many backend slots as the primary.
*/
if (t_thrd.xlog_cxt.InArchiveRecovery && g_instance.attr.attr_storage.EnableHotStandby && !DBStateShutdown) {
if (t_thrd.shemem_ptr_cxt.ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on "
"the master server"),
errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
}
RecoveryRequiresIntParameter("max_locks_per_transaction", g_instance.attr.attr_storage.max_locks_per_xact,
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact);
}
}
/*
 * Startup loop of a secondary ("dummy") standby.
 *
 * Reads the control file, adopts its timeline, sends
 * PMSIGNAL_BEGIN_HOT_STANDBY to the postmaster and then loops forever,
 * requesting XLOG streaming (and, in some configurations, data streaming)
 * whenever the corresponding receiver is not in progress.  The outer loop is
 * `while (1)`, so this function does not return normally.
 *
 * NOTE(review): conninfo is only filled by connect_dn_str() inside the
 * `!WalRcvInProgress()` branch.  If a WAL receiver is already in progress on
 * the first pass, RequestDataStreaming() below appears to receive an
 * uninitialised buffer -- confirm whether that path is reachable.
 */
void StartupDummyStandby(void)
{
char conninfo[MAXCONNINFO];
int replIdx = 1;
ereport(LOG, (errmsg("database system startup Secondary Standby")));
ReadControlFile();
/* Start from the timeline recorded locally; it may be replaced by the primary's below. */
TimeLineID timeline = t_thrd.xlog_cxt.ThisTimeLineID = GetThisTimeID();
DummyStandbySetRecoveryTargetTLI(timeline);
sync_system_identifier = GetSystemIdentifier();
ereport(LOG, (errmsg("timline from control file = %u, systemid = %lu", (uint32)timeline, sync_system_identifier)));
SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
do {
RedoInterruptCallBack();
/* No WAL receiver in progress: find a connection that yields a timeline, then request XLOG streaming. */
if (!WalRcvInProgress()) {
while (1) {
int replIdxFromFile = -1;
/* A sender with role SNDROLE_DUMMYSTANDBY_STANDBY is in progress: skip straight to the sleep at retry. */
if (WalSndInProgress(SNDROLE_DUMMYSTANDBY_STANDBY)) {
goto retry;
}
get_failover_host_conninfo_for_dummy(&replIdxFromFile);
/* Choose the connection index to try; both formulas keep it within 1..REP_CONN_ARRAY. */
if (replIdxFromFile < 0) {
replIdx = ((replIdx - 1) % REP_CONN_ARRAY) + 1;
} else {
replIdx = (replIdxFromFile % REP_CONN_ARRAY) + 1;
}
connect_dn_str(conninfo, replIdx);
/* Advance so that the next attempt (when no index comes from the file) uses the following slot. */
replIdx++;
RedoInterruptCallBack();
/* On success, adopt the timeline reported through `timeline` and persist it via the control file. */
if (libpqrcv_connect_for_TLI(&timeline, conninfo)) {
ereport(LOG, (errmsg("timline from primary = %u", (uint32)timeline)));
DummyStandbySetRecoveryTargetTLI(timeline);
SetThisTimeID(timeline);
UpdateControlFile();
break;
}
sleep(DUMMYSTANDBY_CONNECT_INTERVAL);
}
ShutdownWalRcv();
/* Streaming is requested with a start position of 0. */
XLogRecPtr startpos = 0;
RequestXLogStreaming(&startpos, conninfo, REPCONNTARGET_DEFAULT, u_sess->attr.attr_storage.PrimarySlotName);
}
/* Data streaming is only requested when mix replication is off and IS_DN_MULTI_STANDYS_MODE() is false. */
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE()) {
if (!DataRcvInProgress()) {
if (DataSndInProgress(SNDROLE_DUMMYSTANDBY_STANDBY)) {
goto retry;
}
RequestDataStreaming(conninfo, REPCONNTARGET_DEFAULT);
}
}
retry:
/* Every pass of the outer loop ends with this sleep, including the goto paths above. */
sleep(DUMMYSTANDBY_CHECK_INTERVAL);
} while (1);
}
void ResourceManagerStartup(void)
{
for (uint32 rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_startup != NULL) {
RmgrTable[rmid].rm_startup();
}
}
}
void ResourceManagerStop(void)
{
for (uint32 rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_cleanup != NULL) {
RmgrTable[rmid].rm_cleanup();
}
}
}
#define RecoveryXlogReader(_oldXlogReader, _xlogreader) do { \
if (get_real_recovery_parallelism() > 1) { \
if (GetRedoWorkerCount() > 0) { \
errno_t errorno; \
(_oldXlogReader)->ReadRecPtr = (_xlogreader)->ReadRecPtr; \
(_oldXlogReader)->EndRecPtr = (_xlogreader)->EndRecPtr; \
(_oldXlogReader)->readSegNo = (_xlogreader)->readSegNo; \
(_oldXlogReader)->readOff = (_xlogreader)->readOff; \
(_oldXlogReader)->readLen = (_xlogreader)->readLen; \
(_oldXlogReader)->readPageTLI = (_xlogreader)->readPageTLI; \
(_oldXlogReader)->curReadOff = (_xlogreader)->curReadOff; \
(_oldXlogReader)->curReadSegNo = (_xlogreader)->curReadSegNo; \
(_oldXlogReader)->currRecPtr = (_xlogreader)->currRecPtr; \
(_oldXlogReader)->latestPagePtr = (_xlogreader)->latestPagePtr; \
(_oldXlogReader)->latestPageTLI = (_xlogreader)->latestPageTLI; \
(_oldXlogReader)->isPRProcess = false; \
errorno = memcpy_s((_oldXlogReader)->readBuf, XLOG_BLCKSZ, (_xlogreader)->readBuf, \
(_oldXlogReader)->readLen); \
securec_check(errorno, "", ""); \
(_oldXlogReader)->preReadStartPtr = InvalidXLogRecPtr; \
(_oldXlogReader)->preReadBuf = NULL; \
ResetDecoder(_oldXlogReader); \
(_xlogreader) = (_oldXlogReader); \
} \
EndDispatcherContext(); \
} \
} while (0)
/*
 * Wrap up WAL redo.
 *
 * Steps, in order: run the extreme-RTO committing-CSN check when extreme RTO
 * is running; when parallel recovery is in use and a dispatcher exists,
 * refresh incomplete_actions and free the allocated redo items inside the
 * dispatcher context; run every resource manager's cleanup hook; finally
 * wait for XactLastRecEnd to be flushed.
 */
static void EndRedoXlog()
{
    if (IsExtremeRtoRunning()) {
        ExtremeCheckCommittingCsnList();
    }

    if (get_real_recovery_parallelism() > 1) {
        if (!parallel_recovery::DispatchPtrIsNull()) {
            SwitchToDispatcherContext();
            t_thrd.xlog_cxt.incomplete_actions =
                parallel_recovery::CheckImcompleteAction(t_thrd.xlog_cxt.incomplete_actions);
            FreeAllocatedRedoItem();
            EndDispatcherContext();
        }
    }

    ResourceManagerStop();
    XLogWaitFlush(t_thrd.xlog_cxt.XactLastRecEnd);
}
/*
 * Return the semaphore count computed for XLOG: a fixed 4 plus 2 for each
 * configured max_wal_senders.
 */
int XLogSemas(void)
{
    return (g_instance.attr.attr_storage.max_wal_senders * 2) + 4;
}
/*
 * Log the old, new and previous checkpoint locations, together with the old
 * and new redo pointers.
 */
inline void PrintCkpXctlControlFile(XLogRecPtr oldCkpLoc, CheckPoint *oldCkp, XLogRecPtr newCkpLoc, CheckPoint *newCkp,
                                    XLogRecPtr preCkpLoc, char *name)
{
    /* Each 64-bit location is printed as its high and low 32-bit halves ("%X/%X"). */
    const XLogRecPtr oldRedo = oldCkp->redo;
    const XLogRecPtr newRedo = newCkp->redo;

    ereport(LOG,
            (errmsg("%s PrintCkpXctlControlFile: [checkPoint] oldCkpLoc:%X/%X, oldRedo:%X/%X, newCkpLoc:%X/%X, "
                    "newRedo:%X/%X, preCkpLoc:%X/%X",
                    name,
                    (uint32)(oldCkpLoc >> 32), (uint32)oldCkpLoc,
                    (uint32)(oldRedo >> 32), (uint32)oldRedo,
                    (uint32)(newCkpLoc >> 32), (uint32)newCkpLoc,
                    (uint32)(newRedo >> 32), (uint32)newRedo,
                    (uint32)(preCkpLoc >> 32), (uint32)preCkpLoc)));
}
void CheckForRestartPoint()
{
if (SS_IN_ONDEMAND_RECOVERY || SS_DISASTER_CLUSTER) {
return;
}
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
if (XLByteLT(xlogctl->lastCheckPointRecPtr, g_instance.comm_cxt.predo_cxt.newestCheckpointLoc)) {
Assert(XLByteLT(xlogctl->lastCheckPoint.redo, g_instance.comm_cxt.predo_cxt.newestCheckpoint.redo));
Assert(g_instance.comm_cxt.predo_cxt.newestCheckpoint.nextXid >= xlogctl->lastCheckPoint.nextXid);
Assert(g_instance.comm_cxt.predo_cxt.newestCheckpoint.nextOid >= xlogctl->lastCheckPoint.nextOid);
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->lastCheckPointRecPtr = g_instance.comm_cxt.predo_cxt.newestCheckpointLoc;
xlogctl->lastCheckPoint = const_cast<CheckPoint &>(g_instance.comm_cxt.predo_cxt.newestCheckpoint);
SpinLockRelease(&xlogctl->info_lck);
}
* Perform a checkpoint to update all our recovery activity to disk.
*
* Note that we write a shutdown checkpoint rather than an on-line
* one. This is not particularly critical, but since we may be
* assigning a new TLI, using a shutdown checkpoint allows us to have
* the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
*
* In most scenarios (except new timeline switch), the checkpoint
* seems unnecessary 'cause we need to improve the performance of
* switchover. We transfer the checkpoint to the checkpoint process
* which seems effective. However this would lead to new issue
* about dirty pages without checkpoint before we do the first
* checkpoint. So we temporarily switch to full-page write mode,
* and turn it off after the first checkpoint.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
if (t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) {
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
} else {
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
}
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->FpwBeforeFirstCkpt = true;
if (t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) {
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE);
}
}
}
/*
 * Expand an object with spcNode/dbNode/relNode/bucketNode members into those
 * four values as a comma-separated argument list, in that order.
 */
#define REL_NODE_FORMAT(rnode) \
    (rnode).spcNode,           \
    (rnode).dbNode,            \
    (rnode).relNode,           \
    (rnode).bucketNode
/*
 * Write usedLen bytes from buffer to fd and fsync the file.
 *
 * Any failure -- a write error, a short write, or an fsync error -- is
 * reported with PANIC.
 */
void WriteRemainSegsFile(int fd, const char *buffer, uint32 usedLen)
{
    /*
     * Clear errno first: a short write may leave errno untouched, and the
     * ENOSPC fallback below must not be defeated by a stale value left over
     * from an earlier call.
     */
    errno = 0;
    ssize_t written = write(fd, buffer, usedLen);
    if (written < 0 || (size_t)written != (size_t)usedLen) {
        /* If write didn't set errno, assume the problem is lack of disk space. */
        if (errno == 0) {
            errno = ENOSPC;
        }
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not write remain segs check start point to file: %s", TRANSLATE_ERRNO)));
    }

    if (pg_fsync(fd) != 0) {
        ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync remain segs file: %s", TRANSLATE_ERRNO)));
    }
}
void XLogCopyRemainSegsToBuf(int fd)
{
uint32 remainSegsNum = 0;
if (t_thrd.xlog_cxt.remain_segs == NULL) {
WriteRemainSegsFile(fd, (char *)&remainSegsNum, sizeof(uint32));
return;
}
remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
WriteRemainSegsFile(fd, (char *)&remainSegsNum, sizeof(uint32));
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
ereport(WARNING, (errmsg("Segment [%u, %u, %u, %d] is remained segment, remain extent type %s.",
REL_NODE_FORMAT(extentTag->remainExtentHashTag.rnode),
XlogGetRemainExtentTypeName(extentTag->remainExtentType))));
WriteRemainSegsFile(fd, (char *)extentTag, sizeof(ExtentTag));
}
}
void RecoveryRemainSegsFromRemainSegsFile(struct HTAB *remainSegsHtbl, uint32 remainSegNum, char *buf)
{
if (remainSegNum == 0) {
return;
}
while (remainSegNum-- > 0) {
bool found = false;
ExtentTag* extentTag = (ExtentTag *)buf;
ExtentTag* new_entry = (ExtentTag *)hash_search(remainSegsHtbl, (void *)&(extentTag->remainExtentHashTag),
HASH_ENTER, &found);
if (found) {
ereport(WARNING, (errmsg("Extent [%u, %u, %u, %d] should not repeatedly found in alloc segs htbl.",
REL_NODE_FORMAT(extentTag->remainExtentHashTag.rnode))));
} else {
new_entry->forkNum = extentTag->forkNum;
new_entry->remainExtentType = extentTag->remainExtentType;
new_entry->xid = extentTag->xid;
new_entry->lsn = extentTag->lsn;
}
buf += sizeof(ExtentTag);
}
}
/*
 * Tell whether XLOG_REMAIN_SEGS_FILE_PATH exists and is a regular file.
 *
 * A stat() failure other than ENOENT is reported with ERROR; ENOENT simply
 * yields false.
 */
bool IsRemainSegsFileExist()
{
    struct stat fileInfo;

    if (stat(XLOG_REMAIN_SEGS_FILE_PATH, &fileInfo) >= 0) {
        return S_ISREG(fileInfo.st_mode) ? true : false;
    }

    if (errno != ENOENT) {
        ereport(ERROR, (errcode_for_file_access(),
                        errmsg("could not stat directory \"%s\": %s", XLOG_REMAIN_SEGS_FILE_PATH, TRANSLATE_ERRNO)));
    }
    return false;
}
/*
 * Move the remain-segs file aside to XLOG_REMAIN_SEGS_BACKUP_FILE_PATH.
 *
 * A failed rename is only reported with WARNING; success is logged at LOG.
 */
static void RenameRemainSegsFile()
{
    int renameRes = rename(XLOG_REMAIN_SEGS_FILE_PATH, XLOG_REMAIN_SEGS_BACKUP_FILE_PATH);
    if (renameRes != 0) {
        ereport(WARNING, (errcode_for_file_access(), errmsg("Failed to rename file \"%s\":%m.",
                                                            XLOG_REMAIN_SEGS_FILE_PATH)));
    } else {
        /* No %m here: after a successful rename errno carries no information about this call. */
        ereport(LOG, (errmodule(MOD_SEGMENT_PAGE), errmsg("Rename file \"%s\" to \"%s\".",
                                                          XLOG_REMAIN_SEGS_FILE_PATH, XLOG_REMAIN_SEGS_BACKUP_FILE_PATH)));
    }
}
static bool ReadRemainSegsFileContentToBuffer(int fd, char**contentBuffer)
{
uint32 magicNumber = 0;
XLogRecPtr oldStartPoint;
uint32_t readLen = read(fd, &magicNumber, sizeof(const uint32));
if ((readLen < sizeof(const uint32)) || (magicNumber != REMAIN_SEG_MAGIC_NUM)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read magic number res %u, len %u from remain segs file"
"\"%s\" is less than %lu: %m.", magicNumber, readLen, XLOG_REMAIN_SEGS_FILE_PATH,
sizeof(const uint32))));
return false;
}
readLen = read(fd, &oldStartPoint, sizeof(XLogRecPtr));
if (readLen < sizeof(XLogRecPtr)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read startpoint len %u from remain segs file \"%s\""
" is less than %lu: %m.", readLen, XLOG_REMAIN_SEGS_FILE_PATH, sizeof(XLogRecPtr))));
return false;
}
uint32 remainEntryNum = 0;
readLen = read(fd, &remainEntryNum, sizeof(uint32));
if (readLen < sizeof(uint32)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read remain entry num len %u from remain segs file"
"\"%s\" is less than %lu: %m.",
readLen, XLOG_REMAIN_SEGS_FILE_PATH, sizeof(uint32))));
return false;
}
uint32 contentLen = sizeof(const uint32) + sizeof(XLogRecPtr) + sizeof(uint32);
contentLen += remainEntryNum * sizeof(ExtentTag);
contentLen += sizeof(const uint32);
char *buffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c)));
uint32 usedLen = 0;
*(uint32 *)buffer = REMAIN_SEG_MAGIC_NUM;
usedLen += sizeof(uint32);
*(XLogRecPtr *)(buffer + usedLen) = oldStartPoint;
usedLen += sizeof(XLogRecPtr);
*(uint32 *)(buffer + usedLen) = remainEntryNum;
usedLen += sizeof(uint32);
readLen = read(fd, buffer + usedLen, remainEntryNum * sizeof(ExtentTag));
if (readLen < (remainEntryNum * sizeof(ExtentTag))) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read extenttag len %u from remain segs file"
"\"%s\" is less than %lu:%m.", readLen, XLOG_REMAIN_SEGS_FILE_PATH,
remainEntryNum * sizeof(ExtentTag))));
pfree_ext(buffer);
return false;
}
usedLen += (remainEntryNum * sizeof(ExtentTag));
readLen = read(fd, &magicNumber, sizeof(const uint32));
if (readLen < sizeof(const uint32) || (magicNumber != REMAIN_SEG_MAGIC_NUM)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read magic number %u, len %u from remain segs file"
"\"%s\" is less than %lu: %m.", magicNumber, readLen, XLOG_REMAIN_SEGS_FILE_PATH,
sizeof(const uint32))));
pfree_ext(buffer);
return false;
}
*(uint32 *)(buffer + usedLen) = magicNumber;
usedLen += sizeof(const uint32);
pg_crc32c oldCrc32c;
readLen = read(fd, &oldCrc32c, sizeof(pg_crc32c));
if (readLen < sizeof(pg_crc32c)) {
pfree_ext(buffer);
return false;
}
pg_crc32c curCrc32c;
INIT_CRC32C(curCrc32c);
COMP_CRC32C(curCrc32c, buffer, usedLen);
FIN_CRC32C(curCrc32c);
if (oldCrc32c != curCrc32c) {
ereport(WARNING, (errcode_for_file_access(), errmsg("Read crc isn't same with content crc.")));
pfree_ext(buffer);
return false;
}
*contentBuffer = buffer;
return true;
}
static bool ReadRemainSegsFileContent(int fd, HTAB* remainSegsHtbl, XLogRecPtr *oldStartPoint)
{
if (remainSegsHtbl == NULL || oldStartPoint == NULL) {
ereport(DEBUG5, (errmodule(MOD_SEGMENT_PAGE), errmsg("oldStartPoint or remainSegsHtbl is null")));
return true;
}
if (oldStartPoint != NULL) {
*oldStartPoint = InvalidXLogRecPtr;
}
char *contentBuffer = NULL;
bool readRes = ReadRemainSegsFileContentToBuffer(fd, &contentBuffer);
if (readRes == false || contentBuffer == NULL) {
return false;
}
uint32 usedLen = sizeof(const uint32);
*oldStartPoint = *(XLogRecPtr *)(contentBuffer + usedLen);
usedLen += sizeof(XLogRecPtr);
uint32 remainEntryNum = *(uint32 *)(contentBuffer + usedLen);
usedLen += sizeof(uint32);
RecoveryRemainSegsFromRemainSegsFile(remainSegsHtbl, remainEntryNum, contentBuffer + usedLen);
pfree_ext(contentBuffer);
return true;
}
static void ReadRemainSegsFile()
{
bool isFileExist = IsRemainSegsFileExist();
if (!isFileExist) {
return;
}
if (t_thrd.xlog_cxt.remain_segs == NULL) {
t_thrd.xlog_cxt.remain_segs = redo_create_remain_segs_htbl();
}
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
int fd = BasicOpenFile(XLOG_REMAIN_SEGS_FILE_PATH, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not open remain segs file \"%s\": %s", XLOG_REMAIN_SEGS_FILE_PATH, TRANSLATE_ERRNO)));
return;
}
XLogRecPtr oldStartPoint = InvalidXLogRecPtr;
bool readRightContent = ReadRemainSegsFileContent(fd, t_thrd.xlog_cxt.remain_segs, &oldStartPoint);
if (readRightContent) {
SetRemainSegsStartPoint(oldStartPoint);
} else {
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
}
if (close(fd)) {
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close remain segs file \"%s\": %s", XLOG_REMAIN_SEGS_FILE_PATH, TRANSLATE_ERRNO)));
}
if (!readRightContent) {
RenameRemainSegsFile();
}
}
/*
 * Compute the byte length of the remain-segs file content, excluding the
 * CRC: leading magic number, start point, entry count, one ExtentTag per
 * entry currently in remain_segs (none when the table is absent) and the
 * trailing magic number.
 */
static uint32 XLogGetRemainContentLen()
{
    /* Fixed-size parts: two magic numbers, the start point and the entry count. */
    uint32 total = sizeof(const uint32) + sizeof(XLogRecPtr) + sizeof(uint32) + sizeof(const uint32);

    if (t_thrd.xlog_cxt.remain_segs != NULL) {
        total += hash_get_num_entries(t_thrd.xlog_cxt.remain_segs) * sizeof(ExtentTag);
    }
    return total;
}
static void XLogMakeUpRemainSegsContent(char *contentBuffer)
{
uint32 usedLen = 0;
const uint32 magicNum = REMAIN_SEG_MAGIC_NUM;
errno_t errc = memcpy_s(contentBuffer, sizeof(const uint32), &magicNum, sizeof(const uint32));
securec_check(errc, "\0", "\0");
usedLen += sizeof(uint32);
XLogRecPtr recPtr = GetRemainSegsStartPoint();
errc = memcpy_s(contentBuffer + usedLen, sizeof(XLogRecPtr), &recPtr, sizeof(XLogRecPtr));
securec_check(errc, "\0", "\0");
usedLen += sizeof(XLogRecPtr);
uint32 remainSegsNum = 0;
if (t_thrd.xlog_cxt.remain_segs != NULL) {
remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
}
errc = memcpy_s(contentBuffer + usedLen, sizeof(uint32), &remainSegsNum, sizeof(uint32));
securec_check(errc, "\0", "\0");
usedLen += sizeof(uint32);
if (remainSegsNum != 0) {
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
errc = memcpy_s(contentBuffer + usedLen, sizeof(ExtentTag), extentTag, sizeof(ExtentTag));
securec_check(errc, "\0", "\0");
usedLen += sizeof(ExtentTag);
}
}
errc = memcpy_s(contentBuffer + usedLen, sizeof(const uint32), &magicNum, sizeof(const uint32));
securec_check(errc, "\0", "\0");
return;
}
void XLogCheckRemainSegs()
{
if (SS_PRIMARY_ONDEMAND_RECOVERY) {
return;
}
uint32 contentLen = XLogGetRemainContentLen();
pg_crc32c crc;
char* contentBuffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c)));
XLogMakeUpRemainSegsContent(contentBuffer);
INIT_CRC32C(crc);
COMP_CRC32C(crc, contentBuffer, contentLen);
FIN_CRC32C(crc);
*(pg_crc32c *)(contentBuffer + contentLen) = crc;
int fd = BasicOpenFile(XLOG_REMAIN_SEGS_FILE_PATH, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
pfree_ext(contentBuffer);
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not create remain segs file \"%s\": %s", XLOG_REMAIN_SEGS_FILE_PATH, TRANSLATE_ERRNO)));
return;
}
WriteRemainSegsFile(fd, contentBuffer, contentLen + sizeof(pg_crc32c));
if (close(fd)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("could not close remain segs file \"%s\": %s",
XLOG_REMAIN_SEGS_FILE_PATH, TRANSLATE_ERRNO)));
}
pfree_ext(contentBuffer);
if (t_thrd.xlog_cxt.remain_segs != NULL) {
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
}
}
void XLogUpdateRemainCommitLsn()
{
if (t_thrd.xlog_cxt.remain_segs == NULL) {
return;
}
uint32 remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
if (remainSegsNum == 0) {
return;
}
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
if (extentTag->remainExtentType == DROP_SEGMENT) {
UpdateRemainCommitLsn(extentTag->lsn);
}
}
XLogRecPtr remaincommit = GetRemainCommitLsn();
if (!XLogRecPtrIsInvalid(remaincommit)) {
ereport(WARNING, (errmsg("There is dropped extent leaking, incremental checkpoint is not safe before %X/%X.",
(uint32)(remaincommit >> 32), (uint32)remaincommit)));
}
}
* When adding or decreasing semaphores used by xlog in the following function
* please update XLogSemas accordingly
*/
void InitWalSemaphores()
{
g_instance.wal_context = AllocSetContextCreate(g_instance.instance_context, "WalContext", ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT);
MemoryContext old_context = MemoryContextSwitchTo(g_instance.wal_context);
* Lock structure used for XLog local flush notification
*/
g_instance.wal_cxt.walFlushWaitLock = (WALFlushWaitLockPadded *)palloc(sizeof(WALFlushWaitLockPadded) * 2);
g_instance.wal_cxt.walFlushWaitLock = (WALFlushWaitLockPadded *)TYPEALIGN(sizeof(WALFlushWaitLockPadded),
g_instance.wal_cxt.walFlushWaitLock);
g_instance.wal_cxt.walFlushWaitLock->l.lock = LWLockAssign(LWTRANCHE_WAL_FLUSH_WAIT);
PGSemaphoreCreate(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
* Lock structure used for WAL buffer initialization notification
*/
g_instance.wal_cxt.walBufferInitWaitLock =
(WALBufferInitWaitLockPadded *)palloc(sizeof(WALBufferInitWaitLockPadded) * 2);
g_instance.wal_cxt.walBufferInitWaitLock =
(WALBufferInitWaitLockPadded *)TYPEALIGN(sizeof(WALBufferInitWaitLockPadded),
g_instance.wal_cxt.walBufferInitWaitLock);
g_instance.wal_cxt.walBufferInitWaitLock->l.lock = LWLockAssign(LWTRANCHE_WAL_BUFFER_INIT_WAIT);
PGSemaphoreCreate(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
* Lock structure used for XLog init-ahead notification
*/
g_instance.wal_cxt.walInitSegLock = (WALInitSegLockPadded *)palloc(sizeof(WALInitSegLockPadded) * 2);
g_instance.wal_cxt.walInitSegLock = (WALInitSegLockPadded *)TYPEALIGN(sizeof(WALInitSegLockPadded),
g_instance.wal_cxt.walInitSegLock);
g_instance.wal_cxt.walInitSegLock->l.lock = LWLockAssign(LWTRANCHE_WAL_INIT_SEGMENT);
PGSemaphoreCreate(&g_instance.wal_cxt.walInitSegLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walInitSegLock->l.sem);
* Lock structure used for backedns lsn waitting.
*/
g_instance.wal_cxt.walSyncRepWaitLock = (WALSyncRepWaitLockPadded*)palloc(sizeof(WALSyncRepWaitLockPadded) * 2);
g_instance.wal_cxt.walSyncRepWaitLock = (WALSyncRepWaitLockPadded*)TYPEALIGN(sizeof(WALSyncRepWaitLockPadded),
g_instance.wal_cxt.walSyncRepWaitLock);
g_instance.wal_cxt.walSyncRepWaitLock->l.lock = LWLockAssign(LWTRANCHE_SYNCREP_WAIT);
g_instance.wal_cxt.cbmWaitTaskLock = (CBMTaskLockPadded*)palloc(sizeof(CBMTaskLockPadded) * 2);
g_instance.wal_cxt.cbmWaitTaskLock = (CBMTaskLockPadded*)TYPEALIGN(sizeof(CBMTaskLockPadded),
g_instance.wal_cxt.cbmWaitTaskLock);
g_instance.wal_cxt.cbmWaitTaskLock->l.lock = LWLockAssign(LWTRANCHE_CBM_TASK_WAIT);
PGSemaphoreCreate(&g_instance.wal_cxt.cbmWaitTaskLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.cbmWaitTaskLock->l.sem);
(void)MemoryContextSwitchTo(old_context);
}
/*
 * Probe for a file named "cluster_maintance" (path is relative, so it is
 * resolved against the current working directory). If it can be opened for
 * reading, g_instance.streaming_dr_cxt.isInSwitchover is set to true; in both
 * cases the outcome is logged. The flag is left untouched when the file
 * cannot be opened.
 */
static void checkHadrInSwitchover()
{
    const char *markerFile = "cluster_maintance";
    FILE *marker = fopen(markerFile, "r");

    if (marker != NULL) {
        ereport(LOG, (errmsg("===In the streaming DR switchover===")));
        g_instance.streaming_dr_cxt.isInSwitchover = true;
        fclose(marker);
        return;
    }

    ereport(LOG, (errmsg("===Not in the streaming DR switchover===")));
}
/*
 * If recovery is currently suspended, publish lsn in
 * g_instance.startup_cxt.suspend_lsn and poll until RecoveryIsSuspend()
 * turns false, then reset suspend_lsn to InvalidXLogRecPtr. Returns at once
 * when recovery is not suspended.
 *
 * lsn: value stored in suspend_lsn for the duration of the wait.
 */
void ExtremeRedoWaitRecoverySusPendFinish(XLogRecPtr lsn)
{
    if (!RecoveryIsSuspend()) {
        return;
    }

    pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, lsn);
    pg_memory_barrier();

    /* sleep, run the redo interrupt callback, then look again */
    for (;;) {
        if (!RecoveryIsSuspend()) {
            break;
        }
        pg_usleep(ONE_SECOND_TO_MICROSECOND);
        RedoInterruptCallBack();
    }

    pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, InvalidXLogRecPtr);
}
/*
 * Park recovery while RecoveryIsSuspend() is true.
 *
 * Steps, in order: notify and wait for the redo workers of the active redo
 * mode (extreme or parallel), close all smgr relations, publish lsn in
 * g_instance.startup_cxt.suspend_lsn, send SIGUSR2 to the page repair thread
 * when its id is non-zero, request a CHECKPOINT_FLUSH_DIRTY|CHECKPOINT_WAIT
 * checkpoint, call CheckNeedRenameFile(), and close the xlog read file if it
 * is open. The function then polls until the suspension ends, requesting
 * xlog streaming from the primary whenever no wal receiver is in progress,
 * and finally resets suspend_lsn to InvalidXLogRecPtr.
 *
 * Returns immediately when recovery is not suspended.
 */
void handleRecoverySusPend(XLogRecPtr lsn)
{
    if (!RecoveryIsSuspend()) {
        return;
    }

    if (IsExtremeRedo()) {
        ExtremeDispatchClosefdMarkToAllRedoWorker();
        ExtremeWaitAllReplayWorkerIdle();
    } else if (IsParallelRedo()) {
        if (AmStartupProcess()) {
            ProcTxnWorkLoad(true);
        }
        parallel_recovery::SendClosefdMarkToAllWorkers();
        parallel_recovery::WaitAllPageWorkersQueueEmpty();
    }
    /* neither extreme nor parallel redo: no workers to notify */

    smgrcloseall();

    pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, lsn);
    pg_memory_barrier();
    ereport(LOG, (errmsg("recovery suspend, set suspend lsn is %X/%X",
                         (uint32)(lsn >> XLOG_LSN_SWAP), (uint32)lsn)));

    ThreadId pageRepairThread = g_instance.pid_cxt.PageRepairPID;
    if (pageRepairThread != 0) {
        gs_signal_send(pageRepairThread, SIGUSR2);
    }

    RequestCheckpoint(CHECKPOINT_FLUSH_DIRTY|CHECKPOINT_WAIT);
    CheckNeedRenameFile();

    if (t_thrd.xlog_cxt.readFile >= 0) {
        close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
    }

    while (RecoveryIsSuspend()) {
        const bool receiverAbsent = !WalRcvInProgress() && g_instance.pid_cxt.WalReceiverPID == 0;
        if (receiverAbsent) {
            /* clear receivedUpto under the spinlock before asking for streaming again */
            volatile WalRcvData *rcvState = t_thrd.walreceiverfuncs_cxt.WalRcv;
            SpinLockAcquire(&rcvState->mutex);
            rcvState->receivedUpto = 0;
            SpinLockRelease(&rcvState->mutex);

            RequestXLogStreaming(&lsn, t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY,
                                 u_sess->attr.attr_storage.PrimarySlotName);
        }
        pg_usleep(ONE_SECOND_TO_MICROSECOND);
        RedoInterruptCallBack();
    }

    pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, InvalidXLogRecPtr);
    ereport(LOG, (errmsg("cancle recovery suspend")));
}
* Read term from xlog and update term_from_xlog.
*/
static inline void UpdateTermFromXLog(uint32 xlTerm)
{
uint32 readTerm = xlTerm & XLOG_MASK_TERM;
if (readTerm > g_instance.comm_cxt.localinfo_cxt.term_from_xlog) {
g_instance.comm_cxt.localinfo_cxt.term_from_xlog = readTerm;
}
}
/*
 * Fill g_instance.comm_cxt.predo_cxt.exrto_snapshot with an initial snapshot.
 * Only done when IsExtremeRedo() is true and EnableHotStandby is set.
 *
 * gen_snap_time is zeroed, read_lsn is set to checkpoint_loc, snapshot_csn
 * to ShmemVariableCache->nextCommitSeqNo and xmax to latestCompletedXid.
 * xmin is standbyXmin when that is a valid xid not greater than
 * latestCompletedXid, otherwise latestCompletedXid.
 */
void init_extreme_rto_standby_read_first_snapshot(const XLogRecPtr checkpoint_loc)
{
    if (!IsExtremeRedo() || !g_instance.attr.attr_storage.EnableHotStandby) {
        return;
    }

    g_instance.comm_cxt.predo_cxt.exrto_snapshot->gen_snap_time = 0;
    g_instance.comm_cxt.predo_cxt.exrto_snapshot->read_lsn = checkpoint_loc;
    g_instance.comm_cxt.predo_cxt.exrto_snapshot->snapshot_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
    g_instance.comm_cxt.predo_cxt.exrto_snapshot->xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;

    /* prefer standbyXmin as the lower bound when it is usable */
    const bool useStandbyXmin =
        TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin) &&
        t_thrd.xact_cxt.ShmemVariableCache->standbyXmin <= t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    if (useStandbyXmin) {
        g_instance.comm_cxt.predo_cxt.exrto_snapshot->xmin = t_thrd.xact_cxt.ShmemVariableCache->standbyXmin;
    } else {
        g_instance.comm_cxt.predo_cxt.exrto_snapshot->xmin = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    }
}
/*
 * Convert a string of upper-case hexadecimal digits ('0'-'9', 'A'-'F') to a
 * uint32.
 *
 * hex_string: NUL-terminated input; no prefix, sign or whitespace is accepted
 *             and lower-case 'a'-'f' are rejected.
 * result:     receives the value on success; it is left untouched on failure.
 *
 * Returns true on success, false when either pointer is NULL, when a
 * character is not an accepted digit, or when the value does not fit in
 * 32 bits. Strings longer than eight digits are accepted as long as every
 * digit above the low eight is '0'. An empty string yields true with
 * *result set to 0.
 *
 * The value is built in a local and stored only at the end, so the outcome
 * does not depend on what *result held on entry and a failed parse never
 * leaves a partial sum behind.
 */
static bool hex_string_to_int(char* hex_string, uint32* result)
{
    /* number of hex digits that fit into a uint32 */
    const uint32 max_digits = (uint32)(sizeof(uint32) * 2);
    uint32 value = 0;
    uint32 len = 0;
    uint32 index = 0;

    if (hex_string == NULL || result == NULL) {
        return false;
    }

    while (hex_string[len] != '\0') {
        len++;
    }

    /* walk from the least significant digit towards the most significant one */
    while (len--) {
        char ch = hex_string[len];
        uint32 digit = 0;

        if (ch >= 'A' && ch <= 'F') {
            digit = (uint32)((ch - 'A') + 10);
        } else if (ch >= '0' && ch <= '9') {
            digit = (uint32)(ch - '0');
        } else {
            return false;
        }

        if (index < max_digits) {
            value |= digit << (index * 4);
        } else if (digit != 0) {
            /* shifting by 32 bits or more is undefined; such a value cannot be represented anyway */
            return false;
        }
        index++;
    }

    *result = value;
    return true;
}
static inline void set_hot_standby_recycle_xid()
{
DIR *dir = NULL;
struct dirent *ptr = NULL;
uint32 segnum = 0;
char *dir_name = "pg_clog";
if ((dir = opendir(dir_name)) == NULL) {
return;
}
while ((ptr = readdir(dir)) != NULL) {
if (ptr->d_type != DT_REG) {
continue;
}
if (!hex_string_to_int(ptr->d_name, &segnum)) {
closedir(dir);
return;
}
uint32 segnum_xid = BLCKSZ * CLOG_XACTS_PER_BYTE * SLRU_PAGES_PER_SEGMENT;
TransactionId xid = (uint64)segnum * segnum_xid;
pg_atomic_write_u64(&g_instance.undo_cxt.hotStandbyRecycleXid, xid);
ereport(LOG, (errmsg("Startup: write hotStandbyRecycleXid %lu", xid)));
closedir(dir);
return;
}
closedir(dir);
}
/*
 * This must be called ONCE during postmaster or standalone-backend startup.
 */
void StartupXLOG(void)
{
XLogCtlInsert *Insert = NULL;
CheckPoint checkPoint;
CheckPointNew checkPointNew;
CheckPointPlus checkPointPlus;
CheckPointUndo checkPointUndo;
uint32 recordLen = 0;
bool wasShutdown = false;
bool DBStateShutdown = false;
bool reachedStopPoint = false;
bool haveBackupLabel = false;
bool haveTblspcMap = false;
XLogRecPtr RecPtr, checkPointLoc, EndOfLog;
XLogSegNo endLogSegNo;
XLogRecord *record = NULL;
TransactionId oldestActiveXID;
bool backupEndRequired = false;
bool backupFromStandby = false;
bool backupFromRoach = false;
DBState dbstate_at_startup;
XLogReaderState *xlogreader = NULL;
XLogPageReadPrivate readprivate;
bool RecoveryByPending = false;
bool ArchiveRecoveryByPending = false;
bool AbnormalShutdown = true;
bool SSOndemandRecoveryExitNormal = true;
struct stat st;
errno_t rcm = 0;
TransactionId latestCompletedXid;
bool wasCheckpoint = false;
errno_t errorno = EOK;
XLogRecPtr max_page_flush_lsn = MAX_XLOG_REC_PTR;
checkPointNew.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.length = 0;
checkPointPlus.recent_global_xmin = InvalidTransactionId;
checkPointUndo.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointUndo.length = 0;
checkPointUndo.recent_global_xmin = InvalidTransactionId;
checkPointUndo.globalRecycleXid = InvalidTransactionId;
t_thrd.xlog_cxt.startup_processing = true;
t_thrd.xlog_cxt.RedoDone = false;
t_thrd.xlog_cxt.currentRetryTimes = 0;
t_thrd.xlog_cxt.ssXlogReadFailedTimes = 0;
t_thrd.xlog_cxt.forceFinishHappened = false;
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = 0;
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = 0;
pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
NotifyGscRecoveryStarted();
* Initialize WAL insert status array and the flush index - lastWalStatusEntryFlushed.
*/
if (g_instance.wal_cxt.walInsertStatusTable == NULL) {
MemoryContext old_context = MemoryContextSwitchTo(g_instance.wal_context);
InitXlogStatuEntryTblSize();
g_instance.wal_cxt.walInsertStatusTable = (WALInsertStatusEntry *)palloc(
sizeof(WALInsertStatusEntry) *
(GET_WAL_INSERT_STATUS_ENTRY_CNT() + 1));
g_instance.wal_cxt.walInsertStatusTable =
(WALInsertStatusEntry *)TYPEALIGN(sizeof(WALInsertStatusEntry), g_instance.wal_cxt.walInsertStatusTable);
g_instance.wal_cxt.xlogFlushStats = (XlogFlushStatistics *)palloc0(sizeof(XlogFlushStatistics));
(void)MemoryContextSwitchTo(old_context);
}
errorno =
memset_s(g_instance.wal_cxt.walInsertStatusTable,
sizeof(WALInsertStatusEntry) *
GET_WAL_INSERT_STATUS_ENTRY_CNT(),
0,
sizeof(WALInsertStatusEntry) *
GET_WAL_INSERT_STATUS_ENTRY_CNT());
securec_check(errorno, "", "");
for (int i = 0; i < GET_WAL_INSERT_STATUS_ENTRY_CNT();
i++) {
g_instance.wal_cxt.walInsertStatusTable[i].LRC = i;
}
g_instance.wal_cxt.lastWalStatusEntryFlushed = -1;
g_instance.wal_cxt.lastLRCScanned = WAL_SCANNED_LRC_INIT;
g_instance.wal_cxt.lastLRCWrited = WAL_SCANNED_LRC_INIT;
g_instance.wal_cxt.lastLRCFlushed = WAL_SCANNED_LRC_INIT;
* Read control file and check XLOG status looks valid.
*
* Note: in most control paths, *ControlFile is already valid and we need
* not do ReadControlFile() here, but might as well do it to be sure.
*/
ReadControlFile();
if (ENABLE_DMS && ENABLE_DSS) {
g_instance.dms_cxt.SSReformInfo.reform_ver_startup_wait = g_instance.dms_cxt.SSReformInfo.reform_ver;
SSReadReformerCtrl();
if (SS_CLUSTER_ONDEMAND_NOT_NORAML && SS_PRIMARY_MODE) {
SSOndemandRecoveryExitNormal = false;
if (SS_STANDBY_PROMOTING) {
ereport(FATAL, (errmsg("[SS reform][On-demand] Do not allow switchover if "
"ondemand recovery is not finish")));
} else {
ereport(LOG, (errmsg("[SS reform][On-demand] Ondemand recovery do not finish in last reform")));
}
}
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS reform] Standby:%d promoting, will become new primary later", SS_MY_INST_ID)));
} else if (SS_STANDBY_MODE) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
}
}
if (FORCE_FINISH_ENABLED) {
max_page_flush_lsn = mpfl_read_max_flush_lsn();
ereport(LOG,
(errcode_for_file_access(), errmsg("StartupXLOG: read page lsn info from file, LSN: %X/%X.",
(uint32)(max_page_flush_lsn >> 32), (uint32)max_page_flush_lsn)));
if (max_page_flush_lsn == MAX_XLOG_REC_PTR) {
mpfl_new_file();
} else {
SetMinRecoverPointForStats(max_page_flush_lsn);
}
} else {
mpfl_ulink_file();
}
ereport(LOG, (errcode(ERRCODE_LOG),
errmsg("StartupXLOG: biggest_lsn_in_page is set to %X/%X, enable_update_max_page_flush_lsn:%u",
(uint32)(max_page_flush_lsn >> 32), (uint32)max_page_flush_lsn,
g_instance.attr.attr_storage.enable_update_max_page_flush_lsn)));
t_thrd.xlog_cxt.max_page_flush_lsn = max_page_flush_lsn;
pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo), ATOMIC_FALSE);
set_global_max_page_flush_lsn(max_page_flush_lsn);
if (IsUnderPostmaster) {
t_thrd.shemem_ptr_cxt.ControlFile->timeline = t_thrd.shemem_ptr_cxt.ControlFile->timeline + 1;
ereport(LOG, (errmsg("database system timeline: %u", t_thrd.shemem_ptr_cxt.ControlFile->timeline)));
if (t_thrd.shemem_ptr_cxt.ControlFile->timeline == 0) {
t_thrd.shemem_ptr_cxt.ControlFile->timeline = 1;
}
}
if (t_thrd.shemem_ptr_cxt.ControlFile->state < DB_SHUTDOWNED ||
t_thrd.shemem_ptr_cxt.ControlFile->state > DB_IN_PRODUCTION ||
!XRecOffIsValid(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint)) {
ereport(FATAL, (errmsg("control file contains invalid data")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED) {
DBStateShutdown = true;
AbnormalShutdown = false;
ereport(LOG,
(errmsg("database system was shut down at %s", str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY) {
DBStateShutdown = true;
ereport(LOG, (errmsg("database system was shut down in recovery at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNING) {
ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_CRASH_RECOVERY) {
ereport(LOG, (errmsg("database system was interrupted while in recovery at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time)),
errhint("This probably means that some data is corrupted and"
" you will have to use the last backup for recovery.")));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_ARCHIVE_RECOVERY) {
ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
" and you might need to choose an earlier recovery target.")));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_PRODUCTION) {
ereport(LOG, (errmsg("database system was interrupted; last known up at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
}
#ifdef XLOG_REPLAY_DELAY
if (t_thrd.shemem_ptr_cxt.ControlFile->state != DB_SHUTDOWNED) {
pg_usleep(60000000L);
}
#endif
* Verify that pg_xlog and pg_xlog/archive_status exist. In cases where
* someone has performed a copy for PITR, these directories may have been
* excluded and need to be re-created.
*/
ValidateXLOGDirectoryStructure();
remove_xlogtemp_files();
* Clear out any old relcache cache files. This is *necessary* if we do
* any WAL replay, since that would probably result in the cache files
* being out of sync with database reality. In theory we could leave them
* in place if the database had been cleanly shut down, but it seems
* safest to just remove them always and let them be rebuilt during the
* first backend startup.
*/
RelationCacheInitFileRemove();
* Initialize on the assumption we want to recover to the same timeline
* that's active according to pg_control.
*/
t_thrd.xlog_cxt.recoveryTargetTLI = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID;
if (g_instance.attr.attr_storage.max_active_gtt > 0) {
RemovePgTempFiles();
}
readRecoveryCommandFile();
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET && g_instance.pid_cxt.PageRepairPID != 0) {
g_instance.repair_cxt.support_repair = true;
} else {
g_instance.repair_cxt.support_repair = false;
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
t_thrd.xlog_cxt.ArchiveRecoveryRequested = true;
}
load_server_mode();
if (t_thrd.xlog_cxt.server_mode == PENDING_MODE && !t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
ArchiveRecoveryByPending = true;
}
if (t_thrd.xlog_cxt.server_mode == PENDING_MODE || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
t_thrd.xlog_cxt.ArchiveRecoveryRequested = true;
t_thrd.xlog_cxt.StandbyModeRequested = true;
}
t_thrd.xlog_cxt.expectedTLIs = readTimeLineHistory(t_thrd.xlog_cxt.recoveryTargetTLI);
* If pg_control's timeline is not in expectedTLIs, then we cannot
* proceed: the backup is not part of the history of the requested
* timeline.
*/
if (!list_member_int(t_thrd.xlog_cxt.expectedTLIs,
(int)t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID)) {
ereport(FATAL, (errmsg("requested timeline %u is not a child of database system timeline %u",
t_thrd.xlog_cxt.recoveryTargetTLI,
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID)));
}
* Save the selected recovery target timeline ID and
* archive_cleanup_command in shared memory so that other processes can
* see them
*/
t_thrd.shemem_ptr_cxt.XLogCtl->RecoveryTargetTLI = t_thrd.xlog_cxt.recoveryTargetTLI;
errorno = strncpy_s(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand,
sizeof(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand) - 1,
t_thrd.xlog_cxt.archiveCleanupCommand ? t_thrd.xlog_cxt.archiveCleanupCommand : "",
sizeof(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand) - 1);
securec_check(errorno, "", "");
t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand[MAXPGPATH - 1] = '\0';
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
if (t_thrd.xlog_cxt.StandbyModeRequested) {
ereport(LOG, (errmsg("entering standby mode")));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
ereport(LOG,
(errmsg("starting point-in-time recovery to XID " XID_FMT, t_thrd.xlog_cxt.recoveryTargetXid)));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
ereport(LOG, (errmsg("starting point-in-time recovery to %s",
timestamptz_to_str(t_thrd.xlog_cxt.recoveryTargetTime))));
}
#ifdef PGXC
else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER) {
ereport(LOG, (errmsg("starting point-in-time recovery to barrier %s",
(t_thrd.xlog_cxt.recoveryTargetBarrierId))));
}
#endif
else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
ereport(LOG, (errmsg("starting point-in-time recovery to \"%s\"", t_thrd.xlog_cxt.recoveryTargetName)));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_LSN) {
ereport(LOG, (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryTargetLSN >> 32),
(uint32)t_thrd.xlog_cxt.recoveryTargetLSN)));
} else {
ereport(LOG, (errmsg("starting archive recovery")));
}
}
* Take ownership of the wakeup latch if we're going to sleep during
* recovery.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
if (IsExtremeRedo()) {
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch);
}
}
errorno = memset_s(&readprivate, sizeof(XLogPageReadPrivate), 0, sizeof(XLogPageReadPrivate));
securec_check(errorno, "", "");
if (ENABLE_DMS && ENABLE_DSS) {
xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER);
close_readFile_if_open();
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD) {
SSCSNLOGShmemClear();
SSCLOGShmemClear();
SSMultiXactShmemClear();
}
ereport(LOG, (errmsg("[SS reform] My instance %d, checkpoint record loc %X/%X, redo loc %X/%X",
g_instance.attr.attr_storage.dms_attr.instance_id,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo)));
} else {
xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate);
}
if (xlogreader == NULL) {
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
errdetail("Failed while allocating an XLog reading processor")));
}
xlogreader->system_identifier = t_thrd.shemem_ptr_cxt.ControlFile->system_identifier;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_time = GetCurrentTimestamp();
if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby, &backupFromRoach)) {
List *tablespaces = NIL;
* Archive recovery was requested, and thanks to the backup label file,
* we know how far we need to replay to reach consistency. Enter
* archive recovery directly.
*/
t_thrd.xlog_cxt.InArchiveRecovery = true;
if (t_thrd.xlog_cxt.StandbyModeRequested) {
t_thrd.xlog_cxt.StandbyMode = true;
}
ereport(LOG, (errmsg("request archive recovery due to backup label file")));
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
if (record != NULL) {
rcm = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(xlogreader), sizeof(CheckPoint));
securec_check(rcm, "", "");
recordLen = record->xl_tot_len;
if (record->xl_tot_len == CHECKPOINTNEW_LEN) {
rcm = memcpy_s(&checkPointNew, sizeof(checkPointNew), XLogRecGetData(xlogreader),
sizeof(checkPointNew));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTPLUS_LEN) {
rcm = memcpy_s(&checkPointPlus, sizeof(checkPointPlus), XLogRecGetData(xlogreader),
sizeof(checkPointPlus));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTUNDO_LEN) {
rcm = memcpy_s(&checkPointUndo, sizeof(checkPointUndo), XLogRecGetData(xlogreader),
sizeof(checkPointUndo));
securec_check(rcm, "", "");
}
wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
UpdateTermFromXLog(record->xl_term);
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X", (uint32)(checkPointLoc >> 32), (uint32)checkPointLoc)));
t_thrd.xlog_cxt.InRecovery = true;
* Make sure that REDO location exists. This may not be the case
* if there was a crash during an online backup, which left a
* backup_label around that references a WAL segment that's
* already been archived.
*/
if (XLByteLT(checkPoint.redo, checkPointLoc) && !ReadRecord(xlogreader, checkPoint.redo, LOG, false)) {
ereport(FATAL,
(errmsg("could not find redo location referenced by checkpoint record"),
errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".",
t_thrd.proc_cxt.DataDir)));
}
} else {
ereport(FATAL,
(errmsg("could not locate required checkpoint record"),
errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".",
t_thrd.proc_cxt.DataDir)));
wasShutdown = false;
}
if (read_tablespace_map(&tablespaces)) {
ListCell *lc;
errno_t rc = EOK;
foreach (lc, tablespaces) {
tablespaceinfo *ti = (tablespaceinfo *)lfirst(lc);
int length = PG_TBLSPCS + strlen(ti->oid) + 1;
char *linkloc = (char *)palloc0(length);
if (ENABLE_DSS) {
rc = snprintf_s(linkloc, length, length - 1, "%s/pg_tblspc/%s",
g_instance.attr.attr_storage.dss_attr.ss_dss_data_vg_name, ti->oid);
} else {
rc = snprintf_s(linkloc, length, length - 1, "pg_tblspc/%s", ti->oid);
}
securec_check_ss_c(rc, "", "");
* Remove the existing symlink if any and Create the symlink
* under PGDATA.
*/
remove_tablespace_symlink(linkloc);
if (symlink(ti->path, linkloc) < 0) {
pfree(linkloc);
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not create symbolic link \"%s\": %s", linkloc, TRANSLATE_ERRNO)));
}
pfree(linkloc);
pfree(ti->oid);
pfree(ti->path);
pfree(ti);
}
haveTblspcMap = true;
}
haveBackupLabel = true;
} else {
* If tablespace_map file is present without backup_label file, there
* is no use of such file. There is no harm in retaining it, but it
* is better to get rid of the map file so that we don't have any
* redundant file in data directory and it will avoid any sort of
* confusion. It seems prudent though to just rename the file out
* of the way rather than delete it completely, also we ignore any
* error that occurs in rename operation as even if map file is
* present without backup_label file, it is harmless.
*/
if (stat(TABLESPACE_MAP, &st) == 0) {
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP,
BACKUP_LABEL_FILE),
errdetail("File \"%s\" was renamed to \"%s\".", TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
else
ereport(LOG,
(errmsg("ignoring \"%s\" file because no \"%s\" file exists", TABLESPACE_MAP,
BACKUP_LABEL_FILE),
errdetail("Could not rename file \"%s\" to \"%s\": %m.", TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
}
* It's possible that archive recovery was requested, but we don't
* know how far we need to replay the WAL before we reach consistency.
* This can happen for example if a base backup is taken from a running
* server using an atomic filesystem snapshot, without calling
* pg_start/stop_backup. Or if you just kill a running master server
* and put it into archive recovery by creating a recovery.conf file.
*
* Our strategy in that case is to perform crash recovery first,
* replaying all the WAL present in pg_xlog, and only enter archive
* recovery after that.
*
* But usually we already know how far we need to replay the WAL (up to
* minRecoveryPoint, up to backupEndPoint, or until we see an
* end-of-backup record), and we can enter archive recovery directly.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested &&
(!XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, InvalidXLogRecPtr) ||
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired ||
!XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, InvalidXLogRecPtr) ||
t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED)) {
t_thrd.xlog_cxt.InArchiveRecovery = true;
ereport(LOG, (errmsg_internal("StartupXLOG: archive recovery set true")));
if (t_thrd.xlog_cxt.StandbyModeRequested) {
t_thrd.xlog_cxt.StandbyMode = true;
}
}
* Get the last valid checkpoint record. If the latest one according
* to pg_control is broken, try the next-to-last one.
*/
checkPointLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.xlog_cxt.RedoStartLSN = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
if (record != NULL) {
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X", (uint32)(checkPointLoc >> 32), (uint32)checkPointLoc)));
} else if (t_thrd.xlog_cxt.StandbyMode) {
* The last valid checkpoint record required for a streaming
* recovery exists in neither standby nor the primary.
*/
ereport(PANIC, (errmsg("could not locate a valid checkpoint record")));
} else {
checkPointLoc = t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
if (record != NULL) {
ereport(LOG, (errmsg("using previous checkpoint record at %X/%X", (uint32)(checkPointLoc >> 32),
(uint32)checkPointLoc)));
t_thrd.xlog_cxt.InRecovery = true;
} else {
ereport(PANIC, (errmsg("could not locate a valid checkpoint record")));
}
}
rcm = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(xlogreader), sizeof(CheckPoint));
securec_check(rcm, "", "");
recordLen = record->xl_tot_len;
if (record->xl_tot_len == CHECKPOINTNEW_LEN) {
rcm = memcpy_s(&checkPointNew, sizeof(checkPointNew), XLogRecGetData(xlogreader),
sizeof(checkPointNew));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTPLUS_LEN) {
rcm = memcpy_s(&checkPointPlus, sizeof(checkPointPlus), XLogRecGetData(xlogreader),
sizeof(checkPointPlus));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTUNDO_LEN) {
rcm = memcpy_s(&checkPointUndo, sizeof(checkPointUndo), XLogRecGetData(xlogreader),
sizeof(checkPointUndo));
securec_check(rcm, "", "");
}
UpdateTermFromXLog(record->xl_term);
wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
wasCheckpoint = wasShutdown || (record->xl_info == XLOG_CHECKPOINT_ONLINE);
}
* initialize double write, recover partial write
* in SS Switchover, skip dw init since we didn't do ShutdownXLOG
*/
if (!SS_STANDBY_MODE && !SS_PRIMARY_DEMOTING) {
dw_ext_init();
dw_init();
g_instance.dms_cxt.dw_init = true;
}
undo::RecoveryUndoSystemMeta();
g_instance.policy_cxt.account_table_lock = LWLockAssign(LWTRANCHE_ACCOUNT_TABLE);
t_thrd.xlog_cxt.LastRec = RecPtr = checkPointLoc;
ereport(LOG, (errmsg("redo record is at %X/%X; shutdown %s", (uint32)(checkPoint.redo >> 32),
(uint32)checkPoint.redo, wasShutdown ? "TRUE" : "FALSE")));
ereport(DEBUG1, (errmsg("next MultiXactId: " XID_FMT "; next MultiXactOffset: " XID_FMT, checkPoint.nextMulti,
checkPoint.nextMultiOffset)));
ereport(DEBUG1, (errmsg("oldest unfrozen transaction ID: " XID_FMT ", in database %u", checkPoint.oldestXid,
checkPoint.oldestXidDB)));
if (!TransactionIdIsNormal(checkPoint.nextXid)) {
ereport(PANIC, (errmsg("invalid next transaction ID")));
}
#ifdef ENABLE_MOT
* Recover MOT
*/
if (!IsInitdb) {
MOTRecover();
}
#endif
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid);
t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.oldestXid;
t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRedoDone = false;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
t_thrd.xact_cxt.ShmemVariableCache->xmin = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin = latestCompletedXid + 1;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = checkPoint.redo;
update_dirty_page_queue_rec_lsn(checkPoint.redo, true);
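* In GTM-free mode, start the local csn from the next_csn stored in the checkpoint record
* when the record carries one; otherwise start it from next xid so that it keeps increasing.
* For the gtm environment, first set the local csn to its initial value
* (COMMITSEQNO_FIRST_NORMAL + 1) and take next_csn from the checkpoint record when present;
* it is then advanced by redo of xact records or by transaction commit, which ensures the
* local csn will not be larger than the gtm csn.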
*/
if (GTM_FREE_MODE) {
if ((recordLen == CHECKPOINTNEW_LEN)) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointNew.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointNew.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else if ((recordLen == CHECKPOINTPLUS_LEN)) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointPlus.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointPlus.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointUndo.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointUndo.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
ereport(LOG, (errmsg("%s Mode: start local next csn from next_xid %lu", "GTM-Free",
t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
}
} else {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1;
if (recordLen == CHECKPOINTNEW_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointNew.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointNew.next_csn;
} else if (recordLen == CHECKPOINTPLUS_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointPlus.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointPlus.next_csn;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointUndo.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointUndo.next_csn;
}
}
if (!TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin)) {
if (recordLen == CHECKPOINTPLUS_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPointPlus.recent_global_xmin;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPointUndo.recent_global_xmin;
} else if (wasShutdown) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPoint.nextXid - 1;
}
}
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = InvalidTransactionId;
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, checkPointUndo.globalRecycleXid);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
ereport(LOG, (errmsg("Startup: write global globalRecycleXid %lu from checkpoint %lu",
g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid)));
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, InvalidTransactionId);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
set_hot_standby_recycle_xid();
* Initialize replication slots, before there's a chance to remove
* required resources.
*/
t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo - 1;
init_instance_slot();
init_instance_slot_thread();
if (!ENABLE_DMS || (SS_REFORM_REFORMER && !SS_PRIMARY_DEMOTED)) {
StartupReplicationSlots();
}
* Startup logical state, needs to be setup now so we have proper data
* during crash recovery.
*/
StartupReorderBuffer();
* Initialize start LSN point of CBM system by validating the last-finished CBM file.
* At this moment, no checkpoint can happen, so it is safe to make the decision
* without holding the control file lock.
*/
if (u_sess->attr.attr_storage.enable_cbm_tracking && g_instance.pid_cxt.CBMWriterPID == 0) {
CBMTrackInit(true, checkPoint.redo);
t_thrd.cbm_cxt.XlogCbmSys->needReset = false;
} else {
ereport(LOG, (errmsg("Mark CBM Writer reset is true.")));
t_thrd.cbm_cxt.XlogCbmSys->needReset = true;
}
startupInitDelayXlog();
startupInitDelayDDL();
* We must replay WAL entries using the same TimeLineID they were created
* under, so temporarily adopt the TLI indicated by the checkpoint (see
* also xlog_redo()).
*/
t_thrd.xlog_cxt.ThisTimeLineID = checkPoint.ThisTimeLineID;
* Before running in recovery, scan pg_twophase and fill in its status
* to be able to work on entries generated by redo. Doing a scan before
* taking any recovery action has the merit to discard any 2PC files that
* are newer than the first record to replay, saving from any conflicts at
* replay. This avoids as well any subsequent scans when doing recovery
* of the on-disk two-phase data.
*/
if (!ENABLE_DMS || SS_PRIMARY_MODE) {
restoreTwoPhaseData();
}
StartupCSNLOG();
t_thrd.xlog_cxt.lastFullPageWrites = checkPoint.fullPageWrites;
t_thrd.xlog_cxt.RedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->RedoRecPtr =
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.doPageWrites = false;
} else {
t_thrd.xlog_cxt.doPageWrites = t_thrd.xlog_cxt.lastFullPageWrites;
}
if (XLByteLT(RecPtr, checkPoint.redo)) {
ereport(PANIC, (errmsg("invalid redo in checkpoint record")));
}
* Check whether we need to force recovery from WAL. If it appears to
* have been a clean shutdown and we did not have a recovery.conf file,
* then assume no recovery needed.
*/
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
t_thrd.xlog_cxt.InRecovery = true;
if (SS_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS switchover] Standby promote: redo shutdown checkpoint now")));
}
}
if (XLByteLT(checkPoint.redo, RecPtr)) {
#ifdef ENABLE_MULTIPLE_NODES
if (wasShutdown) {
ereport(PANIC, (errmsg("invalid redo record in shutdown checkpoint")));
}
#endif
t_thrd.xlog_cxt.InRecovery = true;
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state != DB_SHUTDOWNED) {
t_thrd.xlog_cxt.InRecovery = true;
} else if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
t_thrd.xlog_cxt.InRecovery = true;
if (ArchiveRecoveryByPending) {
RecoveryByPending = true;
}
} else if (!SSOndemandRecoveryExitNormal) {
t_thrd.xlog_cxt.InRecovery = true;
}
if (SS_STANDBY_MODE && t_thrd.xlog_cxt.InRecovery == true && SS_ONDEMAND_REALTIME_BUILD_DISABLED) {
ereport(LOG, (errmsg("[SS reform] Skip redo replay in standby mode")));
t_thrd.xlog_cxt.InRecovery = false;
}
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
SetDefaultExtremeRtoMode();
if (SS_PRIMARY_MODE && ENABLE_ONDEMAND_RECOVERY && (SS_STANDBY_FAILOVER || SS_PRIMARY_NORMAL_REFORM) &&
t_thrd.xlog_cxt.InRecovery == true) {
if (SSOndemandRecoveryExitNormal) {
StartupOndemandRecovery();
ereport(LOG, (errmsg("[SS reform][On-demand] replayed in extreme rto ondemand recovery mode")));
} else {
ereport(LOG, (errmsg("[SS reform][On-demand] do not allow replay in ondemand recovery if "
"last ondemand recovery crash, replayed in extreme rto recovery mode")));
}
}
if (SS_PRIMARY_MODE || SS_DISASTER_MAIN_STANDBY_NODE) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
SSUpdateReformerCtrl();
LWLockRelease(ControlFileLock);
}
if (SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD) {
t_thrd.xlog_cxt.InRecovery = true;
SetOndemandExtremeRtoMode();
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = false;
g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = BUILD_NORMAL;
ereport(LOG, (errmsg("[SS reform][On-demand] realtime build start finish, set status to BUILD_NORMAL")));
}
ConfigRecoveryParallelism();
ReadRemainSegsFile();
checkHadrInSwitchover();
if (t_thrd.xlog_cxt.InRecovery) {
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
* Update pg_control to show that we are recovering and to show the
* selected checkpoint as the place we are starting from. We also mark
* pg_control with any minimum recovery stop point obtained from a
* backup history file.
*/
dbstate_at_startup = t_thrd.shemem_ptr_cxt.ControlFile->state;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
} else {
ereport(LOG, (errmsg("database system was not properly shut down; "
"automatic recovery in progress")));
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
}
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), checkPointLoc, &checkPoint,
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "StartupXLOG");
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = checkPointLoc;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, checkPoint.redo)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = checkPoint.redo;
}
}
* Set backupStartPoint if we're starting recovery from a backup label guide.
*
* NOTICE: there are several scenarios for this as follow, hope you like it.
* 1. recovery a backup-interrupted master, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(false).
* 2. restore a master exclusive backup, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(true), backupEndRequired(false) and
* backupFromStandby(false).
* 3. restore a standby exclusive backup, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(true), backupEndRequired(false) and
* backupFromStandby(true).
* 4. restore a gs_ctl full build standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(true) and
* backupFromStandby(false).
* 5. restore a gs_rewind inc build standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(true).
* 6. recovery as pending or standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(false) without backup label file.
*
* When we try to recover a backup-interrupted master, treat it as a normal
* starting even we choose to recover from the checkpoint loaded from backup
* label file.
*
* Set backupEndPoint and use minRecoveryPoint as the backup end location
* if we're starting recovery from a base backup which was taken from the
* standby. In this case, the database system status in pg_control must
* indicate DB_IN_ARCHIVE_RECOVERY. If not, which means that backup is
* corrupted, so we cancel recovery.
*/
if (haveBackupLabel) {
if (t_thrd.xlog_cxt.ArchiveRestoreRequested || backupEndRequired || backupFromStandby) {
t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint = checkPoint.redo;
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = backupEndRequired;
} else {
t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint = InvalidXLogRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
}
if (backupFromStandby) {
if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY) {
ereport(FATAL, (errmsg("backup_label contains data inconsistent with control file"),
errhint("This means that the backup is corrupted and you will "
"have to use another backup for recovery.")));
}
t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
ereport(LOG, (errmsg("backup_from_standby: set backup end point to %X/%X",
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint)));
} else if (backupFromRoach) {
t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
ereport(LOG, (errmsg("perform roach backup restore and set backup end point to %X/%X",
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint)));
g_instance.roach_cxt.isRoachRestore = true;
}
}
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
* Reset pgstat data if database system was abnormal shutdown (as
* crash), because it may be invalid after recovery.
*/
if (AbnormalShutdown) {
pgstat_reset_all();
}
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
* the latest recovery restartpoint instead of going all the way back
* to the backup start point. It seems prudent though to just rename
* the file out of the way rather than delete it completely.
*/
if (haveBackupLabel) {
unlink(BACKUP_LABEL_OLD);
durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
}
* If there was a tablespace_map file, it's done its job and the
* symlinks have been created. We must get rid of the map file so
* that if we crash during recovery, we don't create symlinks again.
* It seems prudent though to just rename the file out of the way
* rather than delete it completely.
*/
if (haveTblspcMap) {
unlink(TABLESPACE_MAP_OLD);
durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
}
CheckRequiredParameterValues(DBStateShutdown);
* We're in recovery, so unlogged relations may be trashed and must be
* reset. This should be done BEFORE allowing Hot Standby
* connections, so that read-only backends don't try to read whatever
* garbage is left over from before.
*/
if (!RecoveryByPending && (!SS_STANDBY_FAILOVER && SSModifySharedLunAllowed())) {
ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
}
* Likewise, delete any saved transaction snapshot files that got left
* behind by crashed backends.
*/
DeleteAllExportedSnapshotFiles();
* Initialize for Hot Standby, if enabled. We won't let backends in
* yet, not until we've reached the min recovery point specified in
* control file and we've established a recovery snapshot from a
* running-xacts WAL record.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested && g_instance.attr.attr_storage.EnableHotStandby) {
TransactionId *xids = NULL;
int nxids;
ereport(DEBUG1, (errmsg("initializing for hot standby")));
InitRecoveryTransactionEnvironment();
if (wasShutdown) {
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
} else {
oldestActiveXID = checkPoint.oldestActiveXid;
}
Assert(TransactionIdIsValid(oldestActiveXID));
ProcArrayInitRecovery(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
* maintained during recovery and need not be started yet. */
StartupCLOG();
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up an
* empty running-xacts record and use that here and now. Recover
* additional standby state for prepared transactions.
*/
if (wasShutdown) {
#ifdef ENABLE_MULTIPLE_NODES
TransactionId latestnextXid;
RunningTransactionsData running;
* Construct a RunningTransactions snapshot representing a
* shut down server, with only prepared transactions still
* alive. We're never overflowed at this point because all
* subxids are listed with their parent prepared transactions.
*/
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
latestnextXid = checkPoint.nextXid;
TransactionIdRetreat(latestnextXid);
Assert(TransactionIdIsNormal(latestnextXid));
running.latestCompletedXid = latestnextXid;
ProcArrayApplyRecoveryInfo(&running);
#endif
StandbyRecoverPreparedTransactions();
}
}
* Initialize shared variables for tracking progress of WAL replay, as
* if we had just replayed the record before the REDO location (or the
* checkpoint record itself, if it's a shutdown checkpoint).
*/
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(checkPoint.redo, RecPtr)) {
xlogctl->replayEndRecPtr = checkPoint.redo;
xlogctl->lastReplayedReadRecPtr = checkPoint.redo;
} else {
xlogctl->replayEndRecPtr = t_thrd.xlog_cxt.EndRecPtr;
xlogctl->lastReplayedReadRecPtr = t_thrd.xlog_cxt.ReadRecPtr;
}
g_instance.comm_cxt.predo_cxt.redoPf.last_replayed_end_ptr = xlogctl->lastReplayedEndRecPtr;
xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
xlogctl->recoveryLastXTime = 0;
xlogctl->currentChunkStartTime = 0;
xlogctl->recoveryPause = false;
SpinLockRelease(&xlogctl->info_lck);
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
* Let postmaster know we've started redo now, so that it can launch
* checkpointer to perform restartpoints. We don't bother during
* crash recovery as restartpoints can only be performed during
* archive recovery. And we'd like to keep crash recovery simple, to
* avoid introducing bugs that could affect you when recovering after
* crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the checkpointer, not locally.
*/
if ((t_thrd.xlog_cxt.ArchiveRecoveryRequested || IS_PGXC_COORDINATOR || IS_SINGLE_NODE || isRestoreMode) &&
IsUnderPostmaster) {
PublishStartupProcessInformation();
EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.pagewriter_launched = true;
} else {
t_thrd.xlog_cxt.bgwriterLaunched = true;
}
}
* Report the values of the relevant variables used by the following consistency check.
* In some scenarios, it takes a long time to recover up to the consistent point
* before starting up. We can estimate the remaining time based on the current
* redo LSN and the future consistent point.
*/
ereport(LOG, (errmsg("redo minRecoveryPoint at %X/%X; backupStartPoint at %X/%X; "
"backupEndPoint at %X/%X;backupEndRequired %s",
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32), (uint32)t_thrd.xlog_cxt.minRecoveryPoint,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint,
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired ? "TRUE" : "FALSE")));
init_extreme_rto_standby_read_first_snapshot(checkPoint.redo);
pg_atomic_write_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage, false);
CheckRecoveryConsistency();
ResetXLogStatics();
* Find the first record that logically follows the checkpoint --- it
* might physically precede it, though.
* When dms is enabled, inter-cluster failover and switchover can happen frequently, so
* checkPoint.redo may be equal to RecPtr while checkPoint.redo is the last record. In that case
* the next record to read must be the start pointer of checkPoint.redo rather than the end
* pointer of checkPoint.redo, which could be NULL. Otherwise, the process would end up in a
* loop inside this function (ReadRecord).
*/
if (XLByteLT(checkPoint.redo, RecPtr) || SS_STANDBY_PROMOTING ||
(XLByteLE(checkPoint.redo, RecPtr) && SS_ONDEMAND_REALTIME_BUILD_NORMAL)) {
record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
} else {
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
}
XLogReaderState *oldXlogReader = xlogreader;
if (record != NULL) {
bool recoveryContinue = true;
bool recoveryApply = true;
TimestampTz xtime;
instr_time rec_startTime;
instr_time rec_endTime;
XLogRecPtr redoStartPtr = t_thrd.xlog_cxt.ReadRecPtr;
uint64 redoTotalBytes;
UpdateTermFromXLog(record->xl_term);
t_thrd.xlog_cxt.InRedo = true;
g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = 0;
ResourceManagerStartup();
StartUpMultiRedo(xlogreader, sizeof(readprivate));
if (IsExtremeRedo()) {
xlogreader->isPRProcess = true;
record = ExtremeReadNextXLogRecord(&xlogreader, LOG);
if (record == NULL) {
ereport(PANIC, (errmsg("redo starts at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr)));
}
t_thrd.xlog_cxt.readSource = 0;
} else if (IsParallelRedo()) {
xlogreader->isPRProcess = true;
xlogreader = parallel_recovery::NewReaderState(xlogreader, true);
}
ereport(LOG, (errmsg("redo starts at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr)));
#ifndef ENABLE_LITE_MODE
#ifdef ENABLE_OBS
update_stop_barrier();
#endif
#endif
INSTR_TIME_SET_CURRENT(rec_startTime);
g_instance.dms_cxt.SSReformInfo.redo_start_time = GetCurrentTimestamp();
t_thrd.xlog_cxt.RedoStartLSN = t_thrd.xlog_cxt.ReadRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
knl_g_set_redo_finish_status(0);
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to false when starting redo")));
do {
TermFileData term_file;
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG ||
(rmid == RM_XACT_ID && u_sess->attr.attr_common.trace_recovery_messages <= DEBUG2) ||
(rmid != RM_XACT_ID && u_sess->attr.attr_common.trace_recovery_messages <= DEBUG3)) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr, (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.EndRecPtr);
xlog_outrec(&buf, xlogreader);
appendStringInfo(&buf, " - ");
RmgrTable[record->xl_rmid].rm_desc(&buf, xlogreader);
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
RedoInterruptCallBack();
* Pause WAL replay, if requested by a hot-standby session via SetRecoveryPause().
*
* Note that we intentionally don't take the info_lck spinlock
* here. We might therefore read a slightly stale value of
* the recoveryPause flag, but it can't be very stale (no
* worse than the last spinlock we did acquire). Since a
* pause request is a pretty asynchronous thing anyway,
* possibly responding to it one WAL record later than we
* otherwise would is a minor issue, so it doesn't seem worth
* adding another spinlock cycle to prevent that.
*/
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2]);
if (recoveryStopsHere(xlogreader, &recoveryApply)) {
reachedStopPoint = true;
recoveryContinue = false;
ExtremeExtremeRtoStopHere();
if (!recoveryApply &&
(t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE ||
(IS_OBS_DISASTER_RECOVER_MODE &&
(t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)))) {
ExtremeWaitAllRedoWorkerQueueEmpty();
break;
}
}
#ifndef ENABLE_MULTIPLE_NODES
CountAndGetRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2],
t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_3]);
* If we've been asked to lag the master, wait on latch until
* enough time has passed.
*/
if (RecoveryApplyDelay(xlogreader)) {
* We test for paused recovery again here. If user sets
* delayed apply, it may be because they expect to pause
* recovery in case of problems, so we must test again
* here otherwise pausing during the delay-wait wouldn't
* work.
*/
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
}
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_3]);
#else
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2]);
#endif
if (ENABLE_DMS && !SS_PERFORMING_SWITCHOVER && SSRecoveryApplyDelay()) {
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
}
* ShmemVariableCache->nextXid must be beyond record's xid.
*
* We don't expect anyone else to modify nextXid, hence we
* don't need to hold a lock while examining it. We still
* acquire the lock to modify it, though.
*/
if (TransactionIdFollowsOrEquals(record->xl_xid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdFollowsOrEquals(record->xl_xid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = record->xl_xid;
TransactionIdAdvance(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
}
LWLockRelease(XidGenLock);
}
* Update shared replayEndRecPtr before replaying this record,
* so that XLogFlush will update minRecoveryPoint correctly.
*/
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = t_thrd.xlog_cxt.EndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
TransactionId xl_xid = record->xl_xid;
if (t_thrd.xlog_cxt.standbyState >= STANDBY_INITIALIZED && TransactionIdIsValid(xl_xid)) {
CSNLogRecordAssignedTransactionId(xl_xid);
} else if (!SS_ONDEMAND_REALTIME_BUILD_NORMAL && TransactionIdIsValid(xl_xid)) {
ExtendCSNLOG(xl_xid);
}
xtime = GetLatestXTime();
XLogReaderState *newXlogReader = xlogreader;
if (xlogreader->isPRProcess && !IsExtremeRedo()) {
newXlogReader = parallel_recovery::NewReaderState(xlogreader);
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_4]);
DispatchRedoRecord(xlogreader, t_thrd.xlog_cxt.expectedTLIs, xtime);
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_4]);
t_thrd.xlog_cxt.LastRec = t_thrd.xlog_cxt.ReadRecPtr;
#ifdef ENABLE_HTAP
if (t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE) {
pg_atomic_write_u64(&IMCU_CACHE->m_xlog_latest_lsn, xlogreader->EndRecPtr);
}
#endif
if (!recoveryContinue &&
(t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE ||
(IS_OBS_DISASTER_RECOVER_MODE && (t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)) ||
IS_MULTI_DISASTER_RECOVER_MODE)) {
ExtremeWaitAllRedoWorkerQueueEmpty();
break;
}
if (!IsExtremeRedo() && CheckForForceFinishRedoTrigger(&term_file)) {
t_thrd.xlog_cxt.forceFinishHappened = true;
ereport(WARNING,
(errmsg("[ForceFinish]force finish redo triggered when StartupXLOG, ReadRecPtr:%08X/%08X,"
"EndRecPtr:%08X/%08X, StandbyMode:%u, startup_processing:%u, dummyStandbyMode:%u",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr,
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)t_thrd.xlog_cxt.EndRecPtr,
t_thrd.xlog_cxt.StandbyMode, t_thrd.xlog_cxt.startup_processing,
dummyStandbyMode)));
ArchiveXlogForForceFinishRedo(xlogreader, &term_file);
newXlogReader->readOff = INVALID_READ_OFF;
}
if (xlogctl->recoverySusPend) {
if (IsExtremeRedo()) {
ExtremeRedoWaitRecoverySusPendFinish(xlogreader->EndRecPtr);
} else {
handleRecoverySusPend(xlogreader->EndRecPtr);
}
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]);
if (xlogreader->isPRProcess && IsExtremeRedo()) {
record = ExtremeReadNextXLogRecord(&xlogreader, LOG);
} else {
xlogreader = newXlogReader;
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
}
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]);
} while (record != NULL);
if (SS_IN_ONDEMAND_RECOVERY) {
OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(0);
} else {
SendRecoveryEndMarkToWorkersAndWaitForFinish(0);
}
RecoveryXlogReader(oldXlogReader, xlogreader);
if (!(IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE)) {
if (ENABLE_DSS) {
t_thrd.xlog_cxt.recoveryPauseAtTarget = false;
}
if (t_thrd.xlog_cxt.recoveryPauseAtTarget && reachedStopPoint) {
SetRecoveryPause(true);
recoveryPausesHere();
}
}
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to true when redo done")));
ereport(LOG, (errmsg("redo done at %X/%X, end at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr, (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.EndRecPtr)));
INSTR_TIME_SET_CURRENT(rec_endTime);
g_instance.dms_cxt.SSReformInfo.redo_end_time = GetCurrentTimestamp();
INSTR_TIME_SUBTRACT(rec_endTime, rec_startTime);
redoTotalBytes = t_thrd.xlog_cxt.EndRecPtr - redoStartPtr;
g_instance.dms_cxt.SSReformInfo.redo_total_bytes = redoTotalBytes;
uint64 totalTime = INSTR_TIME_GET_MICROSEC(rec_endTime);
uint64 speed = 0;
if (totalTime > 0) {
speed = (redoTotalBytes / totalTime) * 1000000 / 1024 / 1024;
}
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[PR]: Recoverying elapsed: %lu us, redoTotalBytes:%lu,"
"EndRecPtr:%lu, redoStartPtr:%lu,speed:%lu MB/s, totalTime:%lu",
INSTR_TIME_GET_MICROSEC(rec_endTime), redoTotalBytes, t_thrd.xlog_cxt.EndRecPtr,
redoStartPtr, speed, totalTime)));
redo_unlink_stats_file();
parallel_recovery::redo_dump_all_stats();
if ((!IS_SHARED_STORAGE_STANDBY_CLUSTER_STANDBY_MODE) && CheckForSwitchoverTrigger()) {
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogRecPtr EndRecPtrTemp = t_thrd.xlog_cxt.EndRecPtr;
XLByteAdvance(EndRecPtrTemp, SizeOfXLogRecord);
if (XLByteLT(EndRecPtrTemp, receivedUpto) && !FORCE_FINISH_ENABLED) {
ereport(PANIC, (errmsg("there are some received xlog have not been redo "
"the tail of last redo lsn:%X/%X, received lsn:%X/%X",
(uint32)(EndRecPtrTemp >> 32), (uint32)EndRecPtrTemp,
(uint32)(receivedUpto >> 32), (uint32)receivedUpto)));
}
}
xtime = GetLatestXTime();
if (xtime) {
ereport(LOG, (errmsg("last completed transaction was at log time %s", timestamptz_to_str(xtime))));
}
t_thrd.xlog_cxt.InRedo = false;
} else {
ereport(LOG, (errmsg("redo is not required")));
if (SS_IN_ONDEMAND_RECOVERY) {
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
SSUpdateReformerCtrl();
LWLockRelease(ControlFileLock);
SSRequestAllStandbyReloadReformCtrlPage();
}
}
}
undo::InitUndoCountThreshold();
* There exists no XLOG_BACKUP_END(xlog) in backup of standby server.
* If we restore from backup of standby server, we check the backupEndPoint to
* determine whether the backup is complete.
* Since the backupEndPoint may be the last xlog in backup, we must check whether
* we reach the consistency point after redoing all the xlog.
*/
if (backupFromStandby) {
CheckRecoveryConsistency();
}
UpdateRemainSegsStartPoint(t_thrd.xlog_cxt.EndRecPtr);
XLogUpdateRemainCommitLsn();
* Kill WAL receiver, if it's still running, before we continue to write
* the startup checkpoint record. It will trump over the checkpoint and
* subsequent records if it's still alive when we start writing WAL.
*/
ShutdownWalRcv();
ShutdownDataRcv();
if (g_instance.attr.attr_storage.enable_uwal) {
if (t_thrd.postmaster_cxt.HaShmData->current_mode == PENDING_MODE ||
t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE) {
int ret = GsUwalPrimaryInitNotify();
if (ret != 0) {
ereport(PANIC, (errmsg("uwal primary init notify failed at pending, ret: %d", ret)));
}
}
}
*
* Check to see if the XLOG sequence contained any unresolved
* references to uninitialized pages.
*/
XLogCheckInvalidPages();
if (IS_MULTI_DISASTER_RECOVER_MODE) {
ereport(LOG, (errmsg("reach stop barrier wait startupxlog here")));
while (reachedStopPoint) {
pg_usleep(1000000L);
RedoInterruptCallBack();
}
}
XLogCheckRemainSegs();
* We don't need the latch anymore. It's not strictly necessary to disown
* it, but let's do it for the sake of tidiness.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
if (!IsExtremeRedo()) {
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
if (IsExtremeRedo()) {
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch);
}
}
* We are now done reading the xlog from stream. Turn off streaming
* recovery to force fetching the files (which would be required at end of
* recovery, e.g., timeline history file) from archive or pg_xlog.
*/
t_thrd.xlog_cxt.StandbyMode = false;
* Re-fetch the last valid or last applied record, so we can identify the
* exact endpoint of what we consider the valid portion of WAL.
*/
xlogreader->readSegNo = 0;
xlogreader->readOff = 0;
xlogreader->readLen = 0;
record = ReadRecord(xlogreader, t_thrd.xlog_cxt.LastRec, PANIC, false);
UpdateTermFromXLog(record->xl_term);
EndOfLog = t_thrd.xlog_cxt.EndRecPtr;
XLByteToPrevSeg(EndOfLog, endLogSegNo);
if (!SS_DISASTER_STANDBY_CLUSTER && (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING)) {
bool use_existent = true;
(void)XLogFileInit(endLogSegNo, &use_existent, true);
}
uint32 redoReadOff = t_thrd.xlog_cxt.readOff;
if (IS_SHARED_STORAGE_MODE) {
GetWritePermissionSharedStorage();
CheckShareStorageCtlInfo(EndOfLog);
}
* Complain if we did not roll forward far enough to render the backup
* dump consistent. Note: it is indeed okay to look at the local variable
* minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
* be further ahead --- ControlFile->minRecoveryPoint cannot have been
* advanced beyond the WAL we processed.
*/
if (t_thrd.xlog_cxt.InRecovery && (XLByteLT(EndOfLog, t_thrd.xlog_cxt.minRecoveryPoint) ||
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint))) {
if (reachedStopPoint) {
ereport(FATAL, (errmsg("requested recovery stop point is before consistent recovery point")));
}
* Ran off end of WAL before reaching end-of-backup WAL record, or
* minRecoveryPoint. That's usually a bad sign, indicating that you
* tried to recover from an online backup but never called
* pg_stop_backup(), or you didn't archive all the WAL up to that
* point. However, this also happens in crash recovery, if the system
* crashes while an online backup is in progress. We must not treat
* that as an error, or the database will refuse to start up.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested || t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired) {
if (t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired) {
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint("All WAL generated while online backup was taken must be available at recovery.")));
} else if (!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) {
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint(
"Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and "
"all WAL up to that point must be available at recovery.")));
} else {
ereport(FATAL, (errmsg("WAL ends before consistent recovery point")));
}
}
}
t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
* We are now done reading the old WAL. Turn off archive fetching if it
* was active, and make a writable copy of the last WAL segment. (Note
* that we also have a copy of the last block of the old WAL in readBuf;
* we will use that below.)
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
}
* Prepare to write WAL starting at EndOfLog position, and init xlog
* buffer cache using the block containing the last record from the
* previous incarnation.
*/
t_thrd.xlog_cxt.openLogSegNo = endLogSegNo;
if (!g_instance.attr.attr_storage.enable_uwal) {
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
}
t_thrd.xlog_cxt.openLogOff = 0;
Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
Insert->PrevByteSize = XLogRecPtrToBytePos(EndOfLog) - XLogRecPtrToBytePos(t_thrd.xlog_cxt.LastRec);
Insert->CurrLRC = 0;
if (ENABLE_ONDEMAND_REALTIME_BUILD && SS_STANDBY_FAILOVER) {
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers, 0,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers);
securec_check(errorno, "", "");
ereport(LOG, (errmsg("[SS failover]: Successfully reset xlblocks by role :%d when enable REALTIME BUILD.",
(int)t_thrd.role)));
}
* Tricky point here: readBuf contains the *last* block that the LastRec
* record spans, not the one it starts in. The last block is indeed the
* one we want to use.
*/
if (EndOfLog % XLOG_BLCKSZ != 0) {
char *page = NULL;
int len;
int firstIdx;
XLogRecPtr pageBeginPtr;
pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
if (redoReadOff != (pageBeginPtr % XLogSegSize)) {
ereport(FATAL, (errmsg("redo check error: EndOfLog: %lx, pageBeginPtr: %lx, redoReadOff: %x", EndOfLog,
pageBeginPtr, redoReadOff)));
}
firstIdx = XLogRecPtrToBufIdx(EndOfLog);
page = &t_thrd.shemem_ptr_cxt.XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
len = EndOfLog % XLOG_BLCKSZ;
errorno = memcpy_s(page, XLOG_BLCKSZ, xlogreader->readBuf, len);
securec_check(errorno, "", "");
errorno = memset_s(page + len, XLOG_BLCKSZ - len, 0, XLOG_BLCKSZ - len);
securec_check(errorno, "", "");
t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
} else {
* There is no partial block to copy. Just set InitializedUpTo,
* and let the first attempt to insert a log record to initialize
* the next buffer.
*/
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = EndOfLog;
}
t_thrd.xlog_cxt.LogwrtResult->Write = t_thrd.xlog_cxt.LogwrtResult->Flush = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
#ifndef ENABLE_MULTIPLE_NODES
t_thrd.xlog_cxt.LogwrtPaxos->Write = t_thrd.xlog_cxt.LogwrtPaxos->Consensus = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtPaxos = *t_thrd.xlog_cxt.LogwrtPaxos;
#endif
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Flush = EndOfLog;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = EndOfLog;
* Prepare WAL buffer and other variables for WAL writing before any
* call to LocalSetXLogInsertAllowed() calls that enables WAL writing
*/
g_instance.wal_cxt.flushResult = EndOfLog;
AdvanceXLInsertBuffer<false>(EndOfLog, false, NULL);
g_instance.wal_cxt.sentResult = EndOfLog;
if (ENABLE_DMS) {
ereport(LOG, (errmsg("[SS reform] EndOfLog:%lu.", EndOfLog)));
}
oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
* record before resource manager writes cleanup WAL records or checkpoint
* record is written.
*/
if (ENABLE_INCRE_CKPT) {
Insert->fullPageWrites = false;
} else {
Insert->fullPageWrites = t_thrd.xlog_cxt.lastFullPageWrites;
}
if (IS_EXRTO_READ) {
(void)LWLockAcquire(ExrtoRecycleResidualUndoLock, LW_EXCLUSIVE);
g_instance.undo_cxt.is_exrto_residual_undo_file_recycled = false;
LWLockRelease(ExrtoRecycleResidualUndoLock);
}
LocalSetXLogInsertAllowed();
UpdateFullPageWrites();
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1;
if (t_thrd.xlog_cxt.InRecovery) {
* Resource managers might need to write WAL records, eg, to record
* index cleanup actions. So temporarily enable XLogInsertAllowed in
* this process only.
*/
LocalSetXLogInsertAllowed();
EndRedoXlog();
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1;
CheckForRestartPoint();
* And finally, execute the recovery_end_command, if any.
*/
if (t_thrd.xlog_cxt.recoveryEndCommand) {
ExecuteRecoveryCommand(t_thrd.xlog_cxt.recoveryEndCommand, "recovery_end_command", true);
}
}
* Clean up any (possibly bogus) future WAL segments on the old timeline.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
RemoveNonParentXlogFiles(EndOfLog, t_thrd.xlog_cxt.ThisTimeLineID);
}
* Preallocate additional log files, if wanted.
*/
PreallocXlogFiles(EndOfLog);
* Reset initial contents of unlogged relations. This has to be done
* AFTER recovery is complete so that any unlogged relations created
* during recovery also get picked up.
*/
if (t_thrd.xlog_cxt.InRecovery && !RecoveryByPending && SSModifySharedLunAllowed()) {
ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
}
* Okay, we're officially UP.
*/
t_thrd.xlog_cxt.InRecovery = false;
g_instance.roach_cxt.isRoachRestore = false;
if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING && !SS_IN_ONDEMAND_RECOVERY && SSOndemandRecoveryExitNormal) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL);
latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
* Start up the commit log and subtrans, if not already done for hot
* standby.
*/
StartupCLOG();
* Perform end of recovery actions for any SLRUs that need it.
*/
StartupMultiXact();
* Recover knowledge about replay progress of known replication partners.
*/
#ifndef ENABLE_MULTIPLE_NODES
StartupReplicationOrigin();
#endif
TrimCLOG();
RecoverPreparedTransactions();
* If any of the critical GUCs have changed, log them before we allow
* backends to write WAL.
*/
LocalSetXLogInsertAllowed();
* Shutdown the recovery environment. This must occur after
* RecoverPreparedTransactions(), see notes for lock_twophase_recover()
*/
if (t_thrd.xlog_cxt.standbyState != STANDBY_DISABLED) {
ShutdownRecoveryTransactionEnvironment();
}
ShutdownReadFileFacility();
XLogReaderFree(xlogreader);
xlogreader = NULL;
XLogReportParameters();
* if switch gtm-lite mode to gtm-free mode, local next csn maybe larger than next xid,
* set next xid to the larger one.
*/
if (GTM_FREE_MODE &&
t_thrd.xact_cxt.ShmemVariableCache->nextXid < t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
}
t_thrd.xact_cxt.ShmemVariableCache->nextXid += g_instance.shmem_cxt.MaxBackends;
* nextxid may access the next clog page which has not extended yet,
* If we do checkpoint, nextxid record as a part of runningxacts, and
* we may try to access according clog page during recovery.
* Just extend it here to avoid this situation.
*
* Standby node of standby cluster can not extend page in SS_DORADO_CLUSTER,
* because a xlog record will be inserted for extending clog page.
*/
if (!SS_DORADO_STANDBY_CLUSTER_STANDBY_NODE) {
ExtendCLOG(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
ExtendCSNLOG(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
}
* All done. Allow backends to write WAL. (Although the bool flag is
* probably atomic in itself, we use the info_lck here to ensure that
* there are no race conditions concerning visibility of other recent
* updates to shared memory.)
*/
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->SharedRecoveryInProgress = false;
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
if (!SS_IN_ONDEMAND_RECOVERY) {
NotifyGscRecoveryFinished();
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
}
}
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || !SSOndemandRecoveryExitNormal) {
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = ALLOW_CKPT;
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] allow checkpoint execute.",
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
pg_memory_barrier();
}
if (!SS_IN_ONDEMAND_RECOVERY) {
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS reform][SS failover][SS switchover] start full checkpoint.")));
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
SSRecheckBufferPool();
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS reform][SS failover][SS switchover] finished full checkpoint"
" and update control file")));
}
}
NextXidAfterReovery = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
OldestXidAfterRecovery = t_thrd.xact_cxt.ShmemVariableCache->oldestXid;
PendingPreparedXactsCount = GetPendingXactCount();
IsPendingXactsRecoveryDone = false;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
if (GTM_LITE_MODE && !TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin)) {
if (recordLen == CHECKPOINTPLUS_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointPlus.recent_global_xmin;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
} else if (wasShutdown) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPoint.nextXid - 1;
}
}
* May exist gds with no xlog for heap page,
* so the xid in page may bigger than ShmemVariableCache->nextXid
*/
if (!SS_STANDBY_MODE) {
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
CalculateLocalLatestSnapshot(true);
LWLockRelease(ProcArrayLock);
}
if (SS_IN_REFORM && !SS_PRIMARY_DEMOTING && ENABLE_DMS && ENABLE_DSS && SS_PRIMARY_MODE) {
if (SS_STANDBY_PROMOTING || SS_STANDBY_FAILOVER) {
g_instance.dms_cxt.SSReformInfo.needSyncConfig = true;
}
if (gs_signal_send(PostmasterPid, SIGHUP) != 0) {
ereport(WARNING, (errmsg("[SS Reform]send SIGHUP to PM failed when reform is ending")));
}
}
if (SS_PERFORMING_SWITCHOVER && g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS switchover] Standby promote: StartupXLOG finished, promote success")));
Assert(g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING);
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
}
if (SS_IN_ONDEMAND_RECOVERY) {
ereport(LOG, (errmsg("[SS reform][On-demand] ondemand redo, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT
", recentLocalXmin: " XID_FMT ", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid,
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount,
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo,
t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min)));
OnDemandWaitRedoFinish();
XLogCheckInvalidPages();
XLogCheckRemainSegs();
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
SSRecheckBufferPool();
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS reform][On-demand] finished full checkpoint and update control file")));
NotifyGscRecoveryFinished();
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
SSOndemandClearRedoDoneState();
}
if (SS_PRIMARY_MODE) {
g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;
g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = DISABLED;
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
SSUpdateReformerCtrl();
LWLockRelease(ControlFileLock);
SSRequestAllStandbyReloadReformCtrlPage();
}
ereport(LOG, (errmsg("redo done, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT ", recentLocalXmin: " XID_FMT
", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid,
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount,
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo,
t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min)));
#ifdef ENABLE_MOT
* Cleanup MOT recovery
*/
if (!IsInitdb) {
MOTRecoveryDone();
}
#endif
if (ENABLE_DMS && !SS_PRIMARY_DEMOTED && !SS_DISASTER_STANDBY_CLUSTER) {
StartupWaitReform();
}
if (SS_DISASTER_MAIN_STANDBY_NODE) {
SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
}
}
/*
 * Preserve one WAL segment under a term-prefixed name during force-finish redo.
 *
 * The source path is built from xlogreader->readPageTLI and logSegNo; the
 * destination path carries termId as an extra leading field and uses
 * t_thrd.xlog_cxt.ThisTimeLineID as the timeline. Both live under SS_XLOGDIR.
 * After XLogFileCopy(src, dst), XLogFileTruncate() is invoked on the source
 * with lastRplEndLsn.
 */
void CopyXlogForForceFinishRedo(XLogSegNo logSegNo, uint32 termId, XLogReaderState *xlogreader,
                                XLogRecPtr lastRplEndLsn)
{
    /* The two name components derived from the segment number are shared by both paths. */
    const uint32 segHigh = (uint32)((logSegNo) / XLogSegmentsPerXLogId);
    const uint32 segLow = (uint32)((logSegNo) % XLogSegmentsPerXLogId);

    char srcPath[MAXPGPATH];
    errno_t rc = snprintf_s(srcPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR,
                            xlogreader->readPageTLI, segHigh, segLow);
    securec_check_ss(rc, "", "");

    char dstPath[MAXPGPATH];
    rc = snprintf_s(dstPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X%08X", SS_XLOGDIR, termId,
                    t_thrd.xlog_cxt.ThisTimeLineID, segHigh, segLow);
    securec_check_ss(rc, "", "");

    XLogFileCopy(srcPath, dstPath);
    XLogFileTruncate(srcPath, lastRplEndLsn);
}
/*
 * Rename every WAL segment after beginSegNo to its term-prefixed name.
 *
 * Starting at beginSegNo + 1, each segment file "<tli><hi><lo>" under
 * SS_XLOGDIR is renamed to "<termId><tli><hi><lo>". The loop has no upper
 * bound; it stops at the first durable_rename() failure, which is reported
 * at WARNING level.
 *
 * beginSegNo: segment number preceding the first one to rename.
 * tli:        timeline ID used in both the source and destination names.
 * termId:     term prepended to the destination name.
 */
void RenameXlogForForceFinishRedo(XLogSegNo beginSegNo, TimeLineID tli, uint32 termId)
{
    ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("RenameXlogForForceFinishRedo INFO beginSegNo:%lu, tli:%u, "
                                                   "termId:%u",
                                                   beginSegNo, tli, termId)));

    /*
     * Iterate with the full-width XLogSegNo type. A 32-bit counter would
     * silently truncate large segment numbers before the division below and
     * make the loop rename the wrong files.
     */
    for (XLogSegNo sn = beginSegNo + 1;; ++sn) {
        char srcPath[MAXPGPATH];
        char dstPath[MAXPGPATH];
        errno_t errorno = EOK;

        errorno = snprintf_s(srcPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, tli,
                             (uint32)((sn) / XLogSegmentsPerXLogId), (uint32)((sn) % XLogSegmentsPerXLogId));
        securec_check_ss(errorno, "", "");

        errorno = snprintf_s(dstPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X%08X", SS_XLOGDIR, termId, tli,
                             (uint32)((sn) / XLogSegmentsPerXLogId), (uint32)((sn) % XLogSegmentsPerXLogId));
        securec_check_ss(errorno, "", "");

        /* The first failed rename ends the scan. */
        if (durable_rename(srcPath, dstPath, WARNING) != 0) {
            ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("RenameXlogForForceFinishRedo failed, srcPath:%s, "
                                                           "dstPath:%s",
                                                           srcPath, dstPath)));
            break;
        }
    }
}
/*
 * Persist an XLogSwitchInfo record describing a force-finish switch.
 *
 * The record is written to "<SS_XLOGDIR>/<termId><XLOG_SWITCH_HISTORY_FILE>.temp"
 * and then moved into place with durable_rename(). If the temp file cannot be
 * opened, fully written, or closed, a WARNING is logged, the temp file is
 * removed and the final file is left untouched, so a truncated record is
 * never published under the final name.
 *
 * switchLsn: stored as info.switchLsn.
 * catchLsn:  stored as info.catchLsn.
 * termId:    stored as info.termId and used as the file name prefix.
 */
void SetSwitchHistoryFile(XLogRecPtr switchLsn, XLogRecPtr catchLsn, uint32 termId)
{
    char path[MAXPGPATH];
    char tmpPath[MAXPGPATH];
    errno_t errorno = EOK;
    XLogSwitchInfo info;
    FILE *hfile = NULL;
    bool writeOk = true;

    /* The struct is dumped as raw bytes, so clear it first to keep padding deterministic. */
    errorno = memset_s(&info, sizeof(XLogSwitchInfo), 0, sizeof(XLogSwitchInfo));
    securec_check(errorno, "", "");

    info.termId = termId;
    info.switchLsn = switchLsn;
    info.catchLsn = catchLsn;
    info.oldestRunningXid = PrescanPreparedTransactions(NULL, NULL);
    info.latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    info.invaildPageCnt = t_thrd.xlog_cxt.invaildPageCnt;
    info.imcompleteActionCnt = t_thrd.xlog_cxt.imcompleteActionCnt;

    errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%s", SS_XLOGDIR, termId, XLOG_SWITCH_HISTORY_FILE);
    securec_check_ss(errorno, "", "");
    errorno = snprintf_s(tmpPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%s.temp",
                         SS_XLOGDIR, termId, XLOG_SWITCH_HISTORY_FILE);
    securec_check_ss(errorno, "", "");

    ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                  errmsg("[REDO_STATS]SetSwitchHistoryFile: ready to set switch file, termId:%u, "
                         "switchLsn:%lu, catchLsn:%lu, oldestRunningXid:%lu, latestCompleteXid:%lu, "
                         "invaildPageCnt:%u, imcompleteActionCnt:%u",
                         termId, switchLsn, catchLsn, info.oldestRunningXid, info.latestCompletedXid,
                         info.invaildPageCnt, info.imcompleteActionCnt)));

    canonicalize_path(tmpPath);
    hfile = fopen(tmpPath, "w");
    if (hfile == NULL) {
        ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                          errmsg("[REDO_STATS]SetSwitchHistoryFile: fopen tmpPath:%s failed, %s", tmpPath,
                                 TRANSLATE_ERRNO)));
        return;
    }

    /* A short write is as bad as no write: require the complete record. */
    if (fwrite(&info, 1, sizeof(XLogSwitchInfo), hfile) != sizeof(XLogSwitchInfo)) {
        ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                          errmsg("[REDO_STATS]SetSwitchHistoryFile: fwrite tmpPath:%s failed, %s", tmpPath,
                                 TRANSLATE_ERRNO)));
        writeOk = false;
    }

    /* fclose() flushes buffered data, so its failure also means the record may be incomplete. */
    if (fclose(hfile) != 0) {
        ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                          errmsg("[REDO_STATS]SetSwitchHistoryFile: fclose tmpPath:%s failed, %s", tmpPath,
                                 TRANSLATE_ERRNO)));
        writeOk = false;
    }

    if (!writeOk) {
        /* Drop the unusable temp file instead of renaming it over the final path. */
        (void)unlink(tmpPath);
        return;
    }

    if (durable_rename(tmpPath, path, WARNING) != 0) {
        ereport(WARNING,
                (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("SetSwitchHistoryFile should not have failed, tmpPath:%s, "
                                                         "path:%s",
                                                         tmpPath, path)));
    }
}
/*
 * Rewind the xlog reader to the record spanning [readPtr, endPtr).
 *
 * Resets the reader's record pointers, marks its read offset as
 * INVALID_READ_OFF, zeroes readLen bytes of its page buffer and records
 * readPtr as this thread's LastRec. If this thread has a WAL file open
 * (readFile >= 0) it is closed.
 *
 * Returns true when an open file descriptor was closed, false otherwise.
 */
bool FallbackXlogreader(XLogReaderState *xlogreader, XLogRecPtr readPtr, XLogRecPtr endPtr)
{
    xlogreader->ReadRecPtr = readPtr;
    xlogreader->EndRecPtr = endPtr;
    xlogreader->readOff = INVALID_READ_OFF;

    errno_t ret = memset_s(xlogreader->readBuf, xlogreader->readLen, 0, xlogreader->readLen);
    securec_check(ret, "\0", "\0");

    t_thrd.xlog_cxt.LastRec = readPtr;

    const bool hadOpenFile = (t_thrd.xlog_cxt.readFile >= 0);
    if (hadOpenFile) {
        close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
    }
    return hadOpenFile;
}
/*
 * Reopen the WAL segment t_thrd.xlog_cxt.readSegNo for this thread.
 *
 * The segment is looked up through XLogFileReadAnyTLI() using the error level
 * stored in the reader's private data. pg_xlog is always a permitted source;
 * the archive is added while InArchiveRecovery is set. The resulting
 * descriptor is stored in t_thrd.xlog_cxt.readFile, and a WARNING is logged
 * if it is negative.
 */
void ReOpenXlog(XLogReaderState *xlogreader)
{
    int emode = ((XLogPageReadPrivate *)(xlogreader->private_data))->emode;

    uint32 sources = XLOG_FROM_PG_XLOG;
    if (t_thrd.xlog_cxt.InArchiveRecovery) {
        sources |= XLOG_FROM_ARCHIVE;
    }

    t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
    if (t_thrd.xlog_cxt.readFile >= 0) {
        return;
    }

    ereport(WARNING, (errmsg("FallbackXlogreader: XLogFileReadAnyTLI failed, "
                             "readseg:%lu, emode:%d, sources:%u",
                             t_thrd.xlog_cxt.readSegNo, emode, sources)));
}
/*
 * Set aside not-yet-replayed WAL when a force-finish of redo is requested.
 *
 * Does nothing unless term_file->finish_redo is set. Otherwise, in order:
 *   - with parallel recovery active, drains the redo workers;
 *   - captures the replay position and the WAL receiver write position,
 *     then shuts down the WAL and data receivers;
 *   - rewinds the reader to the last replayed record, copies the segment
 *     containing the replay end, and renames all later segments using the
 *     term from term_file;
 *   - reopens the WAL file if the rewind closed one;
 *   - resets the invalid-page / incomplete-action counters, runs the
 *     invalid page check, clears incomplete btree actions, writes the switch
 *     history file and resets the counters again;
 *   - pulls the received-up-to positions back to the replay end and clears
 *     the is_finish_redo flag.
 */
void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file)
{
    const bool finishRedo = term_file->finish_redo;
    const uint32 termId = term_file->term;

    if (!finishRedo) {
        return;
    }

    /* Let parallel redo workers finish outstanding work before touching WAL files. */
    if ((get_real_recovery_parallelism() > 1) && (GetRedoWorkerCount() > 0)) {
        ProcTxnWorkLoad(true);
        parallel_recovery::SendClearMarkToAllWorkers();
        parallel_recovery::WaitRedoWorkerIdle();
    }

    XLogRecPtr lastRplReadLsn;
    XLogRecPtr lastRplEndLsn = GetXLogReplayRecPtr(NULL, &lastRplReadLsn);
    XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);

    ShutdownWalRcv();
    ShutdownDataRcv();

    ereport(WARNING,
            (errcode(ERRCODE_LOG),
             errmsg("[ForceFinish] ArchiveXlogForForceFinishRedo INFO lastRplReadLsn:%lu, lastRplEndLsn:%lu, "
                    "receivedUpto:%lu",
                    lastRplReadLsn, lastRplEndLsn, receivedUpto)));

    XLogSegNo beginSegNo;
    XLByteToSeg(lastRplEndLsn, beginSegNo);

    bool closed = FallbackXlogreader(xlogreader, lastRplReadLsn, lastRplEndLsn);
    CopyXlogForForceFinishRedo(beginSegNo, termId, xlogreader, lastRplEndLsn);
    RenameXlogForForceFinishRedo(beginSegNo, xlogreader->readPageTLI, termId);
    if (closed) {
        ReOpenXlog(xlogreader);
    }

    /* Counters are cleared before the checks and again after the history file is written. */
    t_thrd.xlog_cxt.invaildPageCnt = 0;
    t_thrd.xlog_cxt.imcompleteActionCnt = 0;
    XLogCheckInvalidPages();
    btree_clear_imcompleteAction();
    SetSwitchHistoryFile(lastRplEndLsn, receivedUpto, termId);
    t_thrd.xlog_cxt.invaildPageCnt = 0;
    t_thrd.xlog_cxt.imcompleteActionCnt = 0;

    /* From here on, WAL is considered received only up to what has been replayed. */
    set_wal_rcv_write_rec_ptr(lastRplEndLsn);
    t_thrd.xlog_cxt.receivedUpto = lastRplEndLsn;
    pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);

    ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]ArchiveXlogForForceFinishRedo is over!")));
}
/*
 * Record in the control file that the end of a base backup has been reached.
 *
 * Under ControlFileLock: raises minRecoveryPoint to lastReplayedEndRecPtr if
 * it is currently behind, zeroes backupStartPoint and backupEndPoint, clears
 * backupEndRequired and writes the control file out.
 */
void backup_cut_xlog_file(XLogRecPtr lastReplayedEndRecPtr)
{
    ereport(LOG, (errmsg("end of backup reached")));

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

    /* Never move minRecoveryPoint backwards. */
    if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, lastReplayedEndRecPtr)) {
        t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
        SetMinRecoverPointForStats(lastReplayedEndRecPtr);
    }

    errno_t rc = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, sizeof(XLogRecPtr), 0,
                          sizeof(XLogRecPtr));
    securec_check(rc, "", "");
    rc = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, sizeof(XLogRecPtr), 0,
                  sizeof(XLogRecPtr));
    securec_check(rc, "", "");

    t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
    UpdateControlFile();

    LWLockRelease(ControlFileLock);
}
/*
 * Tell the postmaster that hot standby may begin, at most once per thread.
 *
 * Requires that hot standby is not already active locally, that consistency
 * has been reached, and that we run under the postmaster. In builds without
 * ENABLE_MULTIPLE_NODES it additionally requires a valid standbyXmin unless
 * EnableHotStandby is off. When all conditions hold, the shared and local
 * "hot standby active" flags are set and PMSIGNAL_BEGIN_HOT_STANDBY is sent.
 */
static void sendPMBeginHotStby()
{
    if (t_thrd.xlog_cxt.LocalHotStandbyActive || !t_thrd.xlog_cxt.reachedConsistency || !IsUnderPostmaster) {
        return;
    }

    /* Captured up front; used only for the log message below. */
    XLogRecPtr lastReplayedEndRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr;

#ifndef ENABLE_MULTIPLE_NODES
    /* With hot standby enabled we must wait until a standby xmin is known. */
    if (!TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin) &&
        g_instance.attr.attr_storage.EnableHotStandby) {
        return;
    }
#endif

    volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    SpinLockAcquire(&xlogctl->info_lck);
    xlogctl->SharedHotStandbyActive = true;
    SpinLockRelease(&xlogctl->info_lck);

    t_thrd.xlog_cxt.LocalHotStandbyActive = true;
    pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.hotStdby), ATOMIC_TRUE);

    ereport(LOG, (errmsg("send signal to be hot standby at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
                         (uint32)lastReplayedEndRecPtr)));
    SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
}
* Checks if recovery has reached a consistent state. When consistency is
* reached and we have a valid starting standby snapshot, tell postmaster
* that it can start accepting read-only connections.
*/
void CheckRecoveryConsistency(void)
{
#ifndef ENABLE_MULTIPLE_NODES
if (!IsDCFReadyOrDisabled()) {
return;
}
#endif
XLogRecPtr lastReplayedEndRecPtr;
XLogCtlData *XLogCtl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&XLogCtl->info_lck);
lastReplayedEndRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
if (pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo)) == ATOMIC_FALSE &&
XLByteLE(t_thrd.xlog_cxt.max_page_flush_lsn, lastReplayedEndRecPtr)) {
pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo), ATOMIC_TRUE);
ereport(WARNING, (errmsg("permit finish redo at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
(uint32)lastReplayedEndRecPtr)));
}
* During crash recovery, we don't reach a consistent state until we've
* replayed all the WAL.
*/
if (XLogRecPtrIsInvalid(t_thrd.xlog_cxt.minRecoveryPoint))
return;
* Have we reached the point where our base backup was completed?
* backupStartPoint may clear when XLOG_BACKUP_END redo
*/
if (!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint) &&
(!t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired ||
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) &&
XLByteLE(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, lastReplayedEndRecPtr)) {
* We have reached the end of base backup, as indicated by pg_control.
* The data on disk is now consistent. Reset backupStartPoint and
* backupEndPoint, and update minRecoveryPoint to make sure we don't
* allow starting up at an earlier point even if recovery is stopped
* and restarted soon after this.
*/
backup_cut_xlog_file(lastReplayedEndRecPtr);
}
* Have we passed our safe starting point? Note that minRecoveryPoint
* is known to be incorrectly set if ControlFile->backupEndRequired,
* until the XLOG_BACKUP_RECORD arrives to advise us of the correct
* minRecoveryPoint. All we know prior to that is that we're not
* consistent yet.
*/
if (!t_thrd.xlog_cxt.reachedConsistency && !t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired &&
XLByteLE(t_thrd.xlog_cxt.minRecoveryPoint, lastReplayedEndRecPtr) &&
XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) {
* We found the maintence of `minRecoveryPoint' is not correct,
* in some scenario, `minRecoveryPoint' set too early, actually it does not
* reach the Consitency.
* For MppDB, standby is a warm standby not hot standby, standby does not
* supoort read only query any more, we do `XLogCheckInvalidPages` when xlog
* redo done not in the process of redo to work around this problem.
*/
t_thrd.xlog_cxt.reachedConsistency = true;
ereport(LOG, (errmsg("consistent recovery state reached at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
(uint32)lastReplayedEndRecPtr)));
parallel_recovery::redo_dump_all_stats();
redo_unlink_stats_file();
}
CheckIsStopRecovery();
* Have we got a valid starting snapshot that will allow queries to be
* run? If so, we can tell postmaster that the database is consistent now,
* enabling connections.
*/
sendPMBeginHotStby();
}
* Is the system still in recovery?
*
* Unlike testing InRecovery, this works in any process that's connected to
* shared memory.
*
* As a side-effect, we initialize the local TimeLineID and RedoRecPtr
* variables the first time we see that recovery is finished.
*/
bool RecoveryInProgress(void)
{
* We check shared state each time only until we leave recovery mode. We
* can't re-enter recovery, so there's no need to keep checking after the
* shared variable has once been seen false.
*/
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
if (!ENABLE_DMS || (ENABLE_DMS && !SS_STANDBY_PROMOTING &&
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ACTIVE)) {
return false;
}
}
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
t_thrd.xlog_cxt.LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
* Initialize TimeLineID and RedoRecPtr when we discover that recovery
* is finished. InitPostgres() relies upon this behaviour to ensure
* that InitXLOGAccess() is called at backend startup. (If you change
* this, see also LocalSetXLogInsertAllowed.)
*/
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
InitXLOGAccess();
}
return t_thrd.xlog_cxt.LocalRecoveryInProgress;
}
* Is HotStandby active yet? This is only important in special backends
* since normal backends won't ever be able to connect until this returns
* true. Postmaster knows this by way of signal, not via shared memory.
*
* Unlike testing standbyState, this works in any process that's connected to
* shared memory. (And note that standbyState alone doesn't tell the truth
* anyway.)
*/
bool HotStandbyActive(void)
{
* We check shared state each time only until Hot Standby is active. We
* can't de-activate Hot Standby, so there's no need to keep checking
* after the shared variable has once been seen true.
*/
if (t_thrd.xlog_cxt.LocalHotStandbyActive) {
return true;
}
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
t_thrd.xlog_cxt.LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
SpinLockRelease(&xlogctl->info_lck);
return t_thrd.xlog_cxt.LocalHotStandbyActive;
}
* Like HotStandbyActive(), but to be used only in WAL replay code,
* where we don't need to ask any other process what the state is.
*/
bool HotStandbyActiveInReplay(void)
{
Assert(AmStartupProcess() || !IsPostmasterEnvironment || AmPageRedoProcess());
return t_thrd.xlog_cxt.LocalHotStandbyActive;
}
* Is this process allowed to insert new WAL records?
*
* Ordinarily this is essentially equivalent to !RecoveryInProgress().
* But we also have provisions for forcing the result "true" or "false"
* within specific processes regardless of the global state.
*/
bool XLogInsertAllowed(void)
{
if (SS_CLUSTER_ONDEMAND_RECOVERY && SS_STANDBY_MODE) {
ereport(ERROR, (errmsg("[SS] SS Standby can't write xlog during recovery!")));
}
if (t_thrd.xlog_cxt.LocalXLogInsertAllowed >= 0) {
return (bool)t_thrd.xlog_cxt.LocalXLogInsertAllowed;
}
if (RecoveryInProgress()) {
return false;
}
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 1;
return true;
}
* Make XLogInsertAllowed() return true in the current process only.
*
* Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
* and even call LocalSetXLogInsertAllowed() again after that.
*/
static void LocalSetXLogInsertAllowed(void)
{
Assert(t_thrd.xlog_cxt.LocalXLogInsertAllowed == -1);
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 1;
InitXLOGAccess();
}
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
* 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
*/
XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt)
{
XLogRecord *record = NULL;
if (!XRecOffIsValid(RecPtr)) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid primary checkpoint link in control file")));
break;
case 2:
ereport(LOG, (errmsg("invalid secondary checkpoint link in control file")));
break;
default:
ereport(LOG, (errmsg("invalid checkpoint link in backup_label file")));
break;
}
return NULL;
}
record = ReadRecord(xlogreader, RecPtr, LOG, true);
if (record == NULL) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid checkpoint record")));
break;
}
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid resource manager ID in primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid resource manager ID in secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid resource manager ID in checkpoint record")));
break;
}
return NULL;
}
if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN && record->xl_info != XLOG_CHECKPOINT_ONLINE) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid xl_info in primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid xl_info in secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid xl_info in checkpoint record")));
break;
}
return NULL;
}
if (record->xl_tot_len != CHECKPOINT_LEN && record->xl_tot_len != CHECKPOINTNEW_LEN &&
record->xl_tot_len != CHECKPOINTPLUS_LEN && record->xl_tot_len != CHECKPOINTUNDO_LEN) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid length of primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid length of secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid length of checkpoint record")));
break;
}
return NULL;
}
return record;
}
* This must be called during startup of a backend process, except that
* it need not be called in a standalone backend (which does StartupXLOG
* instead). We need to initialize the local copies of ThisTimeLineID and
* RedoRecPtr.
*
* Note: before Postgres 8.0, we went to some effort to keep the postmaster
* process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
* unnecessary however, since the postmaster itself never touches XLOG anyway.
*/
void InitXLOGAccess(void)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
t_thrd.xlog_cxt.ThisTimeLineID = t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID;
if (!ENABLE_DMS) {
Assert(t_thrd.xlog_cxt.ThisTimeLineID != 0 || IsBootstrapProcessingMode());
}
(void)GetRedoRecPtr();
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.doPageWrites = false;
} else {
t_thrd.xlog_cxt.doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
}
InitXLogInsert();
}
* Return the current Redo pointer from shared memory.
*
* As a side-effect, the local RedoRecPtr copy is updated.
*/
XLogRecPtr GetRedoRecPtr(void)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr ptr;
* The possibly not up-to-date copy in XlogCtl is enough. Even if we
* grabbed a WAL insertion slot to read the master copy, someone might
* update it just after we've released the lock.
*/
SpinLockAcquire(&xlogctl->info_lck);
ptr = xlogctl->RedoRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (t_thrd.xlog_cxt.RedoRecPtr < ptr) {
t_thrd.xlog_cxt.RedoRecPtr = ptr;
}
return t_thrd.xlog_cxt.RedoRecPtr;
}
* Return information needed to decide whether a modified block needs a
* full-page image to be included in the WAL record.
*
* The returned values are cached copies from backend-private memory, and
* possibly out-of-date. XLogInsertRecord will re-check them against
* up-to-date values, while holding the WAL insert lock.
*/
void GetFullPageWriteInfo(XLogFPWInfo *fpwInfo_p)
{
fpwInfo_p->redoRecPtr = t_thrd.xlog_cxt.RedoRecPtr;
fpwInfo_p->doPageWrites = t_thrd.xlog_cxt.doPageWrites && !ENABLE_INCRE_CKPT;
fpwInfo_p->forcePageWrites = t_thrd.shemem_ptr_cxt.XLogCtl->FpwBeforeFirstCkpt && !IsInitdb && !ENABLE_INCRE_CKPT;
}
* GetInsertRecPtr -- Returns the current insert position.
*
* NOTE: The value *actually* returned is the position of the last full
* xlog page. It lags behind the real insert position by at most 1 page.
* For that, we don't need to acquire WALInsertLock which can be quite
* heavily contended, and an approximation is enough for the current
* usage of this function.
*/
XLogRecPtr GetInsertRecPtr(void)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->LogwrtRqst.Write;
SpinLockRelease(&xlogctl->info_lck);
return recptr;
}
* GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
* position known to be fsync'd to disk.
*/
XLogRecPtr GetFlushRecPtr(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->LogwrtResult.Flush;
SpinLockRelease(&xlogctl->info_lck);
return recptr;
}
* Get the time of the last xlog segment switch
*/
pg_time_t GetLastSegSwitchTime(void)
{
pg_time_t result;
LWLockAcquire(WALWriteLock, LW_SHARED);
result = t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime;
LWLockRelease(WALWriteLock);
return result;
}
* GetRecoveryTargetTLI - get the current recovery target timeline ID
*/
TimeLineID GetRecoveryTargetTLI(void)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
TimeLineID result;
SpinLockAcquire(&xlogctl->info_lck);
result = xlogctl->RecoveryTargetTLI;
SpinLockRelease(&xlogctl->info_lck);
return result;
}
/*
 * GetThisTargetTLI -- get the shared ThisTimeLineID value.
 *
 * The value is read from shared memory under info_lck.
 */
TimeLineID GetThisTargetTLI(void)
{
    XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    TimeLineID currentTLI = ctl->ThisTimeLineID;
    SpinLockRelease(&ctl->info_lck);

    return currentTLI;
}
* DummyStandbySetRecoveryTargetTLI - set the current recovery target timeline ID
*/
void DummyStandbySetRecoveryTargetTLI(TimeLineID timeLineID)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RecoveryTargetTLI = timeLineID;
xlogctl->ThisTimeLineID = timeLineID;
SpinLockRelease(&xlogctl->info_lck);
return;
}
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void ShutdownXLOG(int code, Datum arg)
{
if (SS_PRIMARY_DEMOTING) {
ereport(LOG, (errmsg("[SS switchover] primary demote: doing shutdown checkpoint")));
if (SS_DISASTER_STANDBY_CLUSTER) {
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
ckpt_shutdown_pagewriter();
if (g_instance.ckpt_cxt_ctl->dirty_page_queue != NULL) {
pfree(g_instance.ckpt_cxt_ctl->dirty_page_queue);
g_instance.ckpt_cxt_ctl->dirty_page_queue = NULL;
}
g_instance.dms_cxt.dw_init = false;
dw_exit(true);
dw_exit(false);
return;
}
ereport(LOG, (errmsg("shutting down")));
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS failover/SS switchover] Standby promote: skipping shutdown checkpoint")));
} else if (SS_PRIMARY_NORMAL_REFORM && SS_CLUSTER_ONDEMAND_NOT_NORAML) {
ereport(LOG, (errmsg("[SS normal reform] Normal reform but ondemand recovery not finish in the last time: skipping shutdown checkpoint")));
} else {
if (RecoveryInProgress()) {
(void)CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
* If archiving is enabled, rotate the last XLOG file so that all the
* remaining records are archived (postmaster wakes up the archiver
* process one more time at the end of shutdown). The checkpoint
* record will go to the next XLOG file and won't be archived (yet).
*/
if (!IsInitdb && XLogArchivingActive() && XLogArchiveCommandSet()) {
(void)RequestXLogSwitch();
}
if (g_instance.wal_cxt.upgradeSwitchMode != ExtremelyFast) {
if (SS_DISASTER_STANDBY_CLUSTER) {
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
}
}
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StopPaxosModule();
}
#endif
ckpt_shutdown_pagewriter();
if (g_instance.ckpt_cxt_ctl->dirty_page_queue != NULL) {
pfree(g_instance.ckpt_cxt_ctl->dirty_page_queue);
g_instance.ckpt_cxt_ctl->dirty_page_queue = NULL;
}
g_instance.bgwriter_cxt.unlink_rel_hashtbl = NULL;
g_instance.bgwriter_cxt.unlink_rel_fork_hashtbl = NULL;
g_instance.ckpt_cxt_ctl->prune_queue_lock = NULL;
g_instance.ckpt_cxt_ctl->ckpt_redo_state.recovery_queue_lock = NULL;
g_instance.ckpt_cxt_ctl->snapshotBlockLock = NULL;
g_instance.bgwriter_cxt.rel_hashtbl_lock = NULL;
g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock = NULL;
undo::ResetUndoZoneLock();
if (ENABLE_DSS && IsInitdb &&
g_instance.dms_cxt.SSReformerControl.primaryInstId == INVALID_INSTANCEID) {
SSReadReformerCtrl();
}
if (!ENABLE_DMS || (!SS_STANDBY_MODE && !SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING)) {
ShutdownCLOG();
ShutdownCSNLOG();
ShutdownMultiXact();
}
g_instance.dms_cxt.dw_init = false;
dw_exit(true);
dw_exit(false);
if (!ENABLE_DMS || g_instance.dms_cxt.SSClusterState == NODESTATE_NORMAL) {
ClearPageRepairTheadMem();
g_instance.repair_cxt.page_repair_hashtbl_lock = NULL;
g_instance.repair_cxt.file_repair_hashtbl_lock = NULL;
}
if (IsInitdb) {
ShutdownShareStorageXLogCopy();
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
}
ereport(LOG, (errmsg("database system is shut down")));
}
* Log start of a checkpoint.
*/
static void LogCheckpointStart(unsigned int flags, bool restartpoint)
{
const char *msg = NULL;
* XXX: This is hopelessly untranslatable. We could call gettext_noop for
* the main message, but what about all the flags?
*/
if (restartpoint) {
msg = "restartpoint starting:%s%s%s%s%s%s%s";
} else {
msg = "checkpoint starting:%s%s%s%s%s%s%s";
}
ereport(LOG, (errmsg(msg, (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "",
(flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "")));
}
* Log end of a checkpoint.
*/
static void LogCheckpointEnd(bool restartpoint)
{
long write_secs, sync_secs, total_secs, longest_secs, average_secs;
int write_usecs, sync_usecs, total_usecs, longest_usecs, average_usecs;
uint64 average_sync_time;
t_thrd.xlog_cxt.CheckpointStats->ckpt_end_t = GetCurrentTimestamp();
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_write_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_t,
&write_secs, &write_usecs);
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_end_t,
&sync_secs, &sync_usecs);
u_sess->stat_cxt.BgWriterStats->m_checkpoint_write_time += write_secs * 1000 + write_usecs / 1000;
u_sess->stat_cxt.BgWriterStats->m_checkpoint_sync_time += sync_secs * 1000 + sync_usecs / 1000;
* All of the published timing statistics are accounted for. Only
* continue if a log message is to be written.
*/
if (!u_sess->attr.attr_common.log_checkpoints) {
return;
}
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_end_t,
&total_secs, &total_usecs);
* Timing values returned from CheckpointStats are in microseconds.
* Convert to the second plus microsecond form that TimestampDifference
* returns for homogeneous printing.
*/
longest_secs = (long)(t_thrd.xlog_cxt.CheckpointStats->ckpt_longest_sync / 1000000);
longest_usecs = t_thrd.xlog_cxt.CheckpointStats->ckpt_longest_sync - (uint64)longest_secs * 1000000;
average_sync_time = 0;
if (t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels > 0) {
average_sync_time = t_thrd.xlog_cxt.CheckpointStats->ckpt_agg_sync_time /
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels;
}
average_secs = (long)(average_sync_time / 1000000);
average_usecs = average_sync_time - (uint64)average_secs * 1000000;
if (restartpoint) {
ereport(LOG, (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
(double)t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written * 100 /
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled, write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000, total_secs, total_usecs / 1000,
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels, longest_secs, longest_usecs / 1000,
average_secs, average_usecs / 1000)));
} else {
ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
(double)t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written * 100 /
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled, write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000, total_secs, total_usecs / 1000,
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels, longest_secs, longest_usecs / 1000,
average_secs, average_usecs / 1000)));
}
}
* Perform a checkpoint --- either during shutdown, or on-the-fly
*
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
* CHECKPOINT_END_OF_RECOVERY).
*
* Note: flags contains other bits, of interest here only for logging purposes.
* In particular note that this routine is synchronous and does not pay
* attention to CHECKPOINT_WAIT.
*/
void CreateCheckPoint(int flags)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool shutdown = false;
bool removexlog = false;
CheckPoint checkPoint;
CheckPointNew checkPointNew;
CheckPointPlus checkPointPlus;
CheckPointUndo checkPointUndo;
XLogRecPtr recptr = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint32 freespace;
XLogSegNo _logSegNo;
XLogRecPtr curInsert;
VirtualTransactionId *vxids = NULL;
int nvxids = 0;
int32 lastlrc = 0;
errno_t errorno = EOK;
XLogRecPtr curMinRecLSN = InvalidXLogRecPtr;
bool doFullCheckpoint = !ENABLE_INCRE_CKPT;
TransactionId oldest_active_xid = InvalidTransactionId;
TransactionId globalXmin = InvalidTransactionId;
* An end-of-recovery checkpoint is really a shutdown checkpoint, just
* issued at a different time.
*/
gstrace_entry(GS_TRC_ID_CreateCheckPoint);
if ((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) {
shutdown = true;
}
if (RecoveryInProgress() && ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) == 0) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("can't create a checkpoint during recovery")));
}
if (SS_DORADO_STANDBY_CLUSTER) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("can't create a checkpoint on SS dorado standby cluster")));
}
if (SS_STANDBY_MODE) {
return;
} else if (g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ALLOW_CKPT) {
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] do not do CreateCheckpoint during %s.",
SS_STANDBY_FAILOVER ? "failover" : "switchover",
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
return;
} else if (SS_IN_ONDEMAND_RECOVERY && !SS_ONDEMAND_REDO_DONE) {
return;
} else if (SS_PRIMARY_DEMOTED &&
!(dms_reform_failed() || dms_reform_last_failed() || g_instance.dms_cxt.SSReformInfo.in_reform == false)) {
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS reform][SS switchover] do not do CreateCheckpoint during primary demoted.")));
return;
}
if (shutdown || ((unsigned int)flags & (CHECKPOINT_FORCE))) {
doFullCheckpoint = true;
}
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
* (This is just pro forma, since in the present system structure there is
* only one process that is allowed to issue checkpoints at any given
* time.)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
errorno = memset_s(t_thrd.xlog_cxt.CheckpointStats, sizeof(CheckpointStatsData), 0, sizeof(CheckpointStatsData));
securec_check(errorno, "", "");
t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t = GetCurrentTimestamp();
* Use a critical section to force system panic if we have trouble.
*/
START_CRIT_SECTION();
if (shutdown) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNING;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
* Let smgr prepare for checkpoint; this has to happen before we determine
* the REDO pointer. Note that smgr must not do anything that'd have to
* be undone if we decide no checkpoint is needed.
*/
SyncPreCheckpoint();
errorno = memset_s(&checkPoint, sizeof(checkPoint), 0, sizeof(checkPoint));
securec_check(errorno, "", "");
checkPoint.time = (pg_time_t)time(NULL);
* For Hot Standby, derive the oldestActiveXid before we fix the redo
* pointer. This allows us to begin accumulating changes to assemble our
* starting snapshot of locks and transactions.
*/
if (!shutdown) {
oldest_active_xid = GetOldestActiveTransactionId(&globalXmin);
}
if (!shutdown && XLogStandbyInfoActive()) {
checkPoint.oldestActiveXid = oldest_active_xid;
} else {
checkPoint.oldestActiveXid = InvalidTransactionId;
}
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_CREATE_SNAPSHOT, 0);
}
#endif
StartSuspendWalInsert(&lastlrc);
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
if ((ENABLE_INCRE_CKPT && (flags & CHECKPOINT_CAUSE_TIME)) || doFullCheckpoint) {
update_dirty_page_queue_rec_lsn(curInsert, true);
}
* If this isn't a shutdown or forced checkpoint, and we have not inserted
* any XLOG records since the start of the last checkpoint, skip the
* checkpoint. The idea here is to avoid inserting duplicate checkpoints
* when the system is idle. That wastes log space, and more importantly it
* exposes us to possible loss of both current and previous checkpoint
* records if the machine crashes just as we're writing the update.
* (Perhaps it'd make even more sense to checkpoint only when the previous
* checkpoint record is in a different xlog page?)
*
* We have to make two tests to determine that nothing has happened since
* the start of the last checkpoint: current insertion point must match
* the end of the last checkpoint record, and its redo pointer must point
* to itself.
*/
if (((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) {
if (curInsert ==
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint + MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint == t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo) {
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_ABORT, 0);
}
#endif
StopSuspendWalInsert(lastlrc);
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
return;
}
} else if (ENABLE_INCRE_CKPT && doFullCheckpoint) {
* The enableIncrementalCheckpoint GUC is on, but under some conditions a
* full checkpoint should still be done.
*/
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = curInsert;
pg_memory_barrier();
if (get_dirty_page_num() > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("will do full checkpoint, need flush %ld pages.", get_dirty_page_num())));
}
if (!doFullCheckpoint) {
curMinRecLSN = ckpt_get_min_rec_lsn();
if (XLogRecPtrIsInvalid(curMinRecLSN)) {
* If dirty page list is empty, use current queue rec lsn.
* Check queue head buffer rec lsn again (if some page dirtied after getting
* get_dirty_page_queue_rec_lsn), get min rec lsn.
*/
XLogRecPtr cur_queue_rec_lsn = get_dirty_page_queue_rec_lsn();
curMinRecLSN = ckpt_get_min_rec_lsn();
if (XLogRecPtrIsInvalid(curMinRecLSN) || XLByteLT(cur_queue_rec_lsn, curMinRecLSN)) {
curMinRecLSN = cur_queue_rec_lsn;
}
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(DEBUG1,
(errmodule(MOD_INCRE_CKPT),
errmsg("get min_recLSN is invalid, dirty page num is %ld, get queue reclsn is %08X/%08X",
get_dirty_page_num(), (uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
}
XLogRecPtr remaincommit = GetRemainCommitLsn();
if (XLByteEQ(curMinRecLSN, g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point) ||
XLByteLT(curMinRecLSN, remaincommit)) {
StopSuspendWalInsert(lastlrc);
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(LOG, (errmodule(MOD_INCRE_CKPT),
errmsg("Checkpoint meets prev checkpoint lsn is %08X/%08X, now min rec lsn is %08X/%08X, "
"checkpoint flag is %d, remaincommit lsn is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP),
(uint32)(curMinRecLSN), flags,
(uint32)(remaincommit >> XLOG_LSN_SWAP), (uint32)(remaincommit))));
}
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
return;
} else if (XLByteLT(curMinRecLSN,
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point)) {
ereport(PANIC,
(errmodule(MOD_INCRE_CKPT),
errmsg("curMinRecLSN little prev checkpoint lsn is %08X/%08X,now lsn is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(LOG,
(errmodule(MOD_INCRE_CKPT),
errmsg("Checkpoint prev checkpoint lsn is %08X/%08X, now will create new checkpoint is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
}
* An end-of-recovery checkpoint is created before anyone is allowed to
* write WAL. To allow us to write the checkpoint record, temporarily
* enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
* initialized, which we need here and in AdvanceXLInsertBuffer.)
*/
if ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) {
LocalSetXLogInsertAllowed();
}
checkPoint.ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
checkPoint.fullPageWrites = Insert->fullPageWrites;
* Compute new REDO record ptr = location of next XLOG record.
*
* NB: this is NOT necessarily where the checkpoint record itself will be,
* since other backends may insert more XLOG records while we're off doing
* the buffer flush work. Those XLOG records are logically after the
* checkpoint, even though physically before it. Got that?
*/
if (!doFullCheckpoint) {
* Incremental Checkpoint use queue first page recLSN, when the dirty page queue is empty,
* choose the dirty page queue recLSN. Dirty page queue lsn has computed redo ptr when
* update_dirty_page_queue_rec_lsn.
*/
checkPoint.redo = curMinRecLSN;
} else {
freespace = INSERT_FREESPACE(curInsert);
if (freespace == 0) {
if (curInsert % XLogSegSize == 0) {
curInsert += SizeOfXLogLongPHD;
} else {
curInsert += SizeOfXLogShortPHD;
}
}
checkPoint.redo = curInsert;
}
* Here we update the shared RedoRecPtr for future XLogInsert calls; this
* must be done while holding the insert lock AND the info_lck.
*
* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
* pointing past where it really needs to point. This is okay; the only
* consequence is that XLogInsert might back up whole buffers that it
* didn't really need to. We can't postpone advancing RedoRecPtr because
* XLogInserts that happen while we are dumping buffers must assume that
* their buffer changes are not included in the checkpoint.
*/
t_thrd.xlog_cxt.RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_SNAPSHOT_READY, checkPoint.redo);
}
#endif
* Now we can release WAL insert lock, allowing other xacts to proceed
* while we are flushing disk buffers.
*/
StopSuspendWalInsert(lastlrc);
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RedoRecPtr = checkPoint.redo;
SpinLockRelease(&xlogctl->info_lck);
* If enabled, log checkpoint start. We postpone this until now so as not
* to log anything if we decided to skip the checkpoint.
*/
if (u_sess->attr.attr_common.log_checkpoints) {
LogCheckpointStart((unsigned int)flags, false);
}
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
* Get the other info we need for the checkpoint record.
*/
LWLockAcquire(XidGenLock, LW_SHARED);
checkPoint.nextXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
checkPoint.oldestXid = t_thrd.xact_cxt.ShmemVariableCache->oldestXid;
checkPoint.oldestXidDB = t_thrd.xact_cxt.ShmemVariableCache->oldestXidDB;
LWLockRelease(XidGenLock);
MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset);
checkPointUndo.globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
* Having constructed the checkpoint record, ensure all shmem disk buffers
* and commit-log buffers are flushed to disk.
*
* This I/O could fail for various reasons. If so, we will fail to
* complete the checkpoint, but there is no reason to force a system
* panic. Accordingly, exit critical section while doing it.
*/
END_CRIT_SECTION();
* In some cases there are groups of actions that must all occur on
* one side or the other of a checkpoint record. Before flushing the
* checkpoint record we must explicitly wait for any backend currently
* performing those groups of actions.
*
* One example is end of transaction, so we must wait for any transactions
* that are currently in commit critical sections. If an xact inserted
* its commit record into XLOG just before the REDO point, then a crash
* restart from the REDO point would not replay that record, which means
* that our flushing had better include the xact's update of pg_clog. So
* we wait till he's out of his commit critical section before proceeding.
* See notes in RecordTransactionCommit().
*
* Because we've already released WALInsertLock, this test is a bit fuzzy:
* it is possible that we will wait for xacts we didn't really need to
* wait for. But the delay should be short and it seems better to make
* checkpoint take a bit longer than to hold locks longer than necessary.
* (In fact, the whole reason we have this issue is that xact.c does
* commit record XLOG insertion and clog update as two separate steps
* protected by different locks, but again that seems best on grounds of
* minimizing lock contention.)
*
* A transaction that has not yet set delayChkpt when we look cannot be at
* risk, since he's not inserted his commit record yet; and one that's
* already cleared it is not at risk either, since he's done fixing clog
* and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for.
*/
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
if (nvxids > 0) {
do {
pg_usleep(10000L);
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
}
pfree_ext(vxids);
* Truncate pg_csnlog if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_csnlog entry older than that (see Asserts
* in csnlog.c). During recovery, though, we mustn't do this because
* StartupCSNLOG hasn't been called yet.
*/
pg_time_t now = (pg_time_t)time(NULL);
int elapsed_secs = now - t_thrd.checkpoint_cxt.last_truncate_log_time;
if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY &&
(GTM_FREE_MODE || TransactionIdIsNormal(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin))) {
* Reduce the frequency of CSN log truncation to lower the probability of lock contention.
* Incremental checkpoints do not require frequent truncation of csnlog.
*/
if (doFullCheckpoint || elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
TransactionId cutoff_xid = GetOldestXmin(NULL);
* Since vacuum xid and xmin are not included in oldestxmin,
* the local oldest active xid may be lower than oldestxmin;
* to be safe, do not truncate past it.
*/
if (TransactionIdIsNormal(globalXmin) && TransactionIdPrecedes(globalXmin, cutoff_xid)) {
cutoff_xid = globalXmin;
}
if (g_instance.attr.attr_storage.enable_ustore) {
TransactionId globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
if (TransactionIdIsNormal(globalRecycleXid) && TransactionIdPrecedes(globalRecycleXid, cutoff_xid)) {
cutoff_xid = globalRecycleXid;
}
}
TruncateCSNLOG(cutoff_xid);
t_thrd.checkpoint_cxt.last_truncate_log_time = now;
}
}
CheckPointGuts(checkPoint.redo, flags, doFullCheckpoint);
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_BEGIN_CHECKPOINT, 0);
}
#endif
* Take a snapshot of running transactions and write this to WAL. This
* allows us to reconstruct the state of running transactions during
* archive recovery, if required. Skip, if this info disabled.
*
* If we are shutting down, or Startup process is completing crash
* recovery we don't need to write running xact data.
*/
if (!SS_SINGLE_CLUSTER && !shutdown && XLogStandbyInfoActive()) {
LogStandbySnapshot();
}
START_CRIT_SECTION();
LWLockAcquire(OidGenLock, LW_SHARED);
checkPoint.nextOid = t_thrd.xact_cxt.ShmemVariableCache->nextOid;
if (!shutdown) {
checkPoint.nextOid += t_thrd.xact_cxt.ShmemVariableCache->oidCount;
}
LWLockRelease(OidGenLock);
* Select point at which we can truncate the log, which we base on the
* prior checkpoint's earliest info.
*/
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, _logSegNo);
checkPoint.remove_seg = 1;
if (_logSegNo && !GetDelayXlogRecycle()) {
KeepLogSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, &_logSegNo, curInsert, false);
checkPoint.remove_seg = _logSegNo;
_logSegNo--;
* Save one more segment to handle a corner case. For example, the standby has replayed to
* the end of one segment file (000000010000002400000078) and the LSN is exactly 24/79000000.
* The corresponding slot on the primary is 24/79000000. As a result, if the standby restarts and
* requests the xlog stream from the latest valid record (23/xxxxxxx), it cannot be found on the primary.
*/
if (_logSegNo > 0) {
_logSegNo--;
}
removexlog = true;
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_PRIMARY_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit create check point time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
}
* Now insert the checkpoint record into XLOG.
*/
XLogBeginInsert();
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
errno_t rcm = memcpy_s(&checkPointUndo, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointUndo.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointUndo.length = (uint64)sizeof(CheckPointUndo);
checkPointUndo.recent_global_xmin = InvalidTransactionId;
checkPointUndo.globalRecycleXid = 0;
} else {
checkPointUndo.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
checkPointUndo.length = (uint64)sizeof(CheckPointUndo);
checkPointUndo.recent_global_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin;
checkPointUndo.globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
}
XLogRegisterData((char *)(&checkPointUndo), sizeof(checkPointUndo));
} else if (t_thrd.proc->workingVersionNum >= GTMLITE_VERSION_NUM && u_sess->attr.attr_common.upgrade_mode != 1) {
errno_t rcm = memcpy_s(&checkPointPlus, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointPlus.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.length = (uint64)sizeof(CheckPointPlus);
checkPointPlus.recent_global_xmin = InvalidTransactionId;
} else {
checkPointPlus.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
checkPointPlus.length = (uint64)sizeof(CheckPointPlus);
checkPointPlus.recent_global_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin;
}
XLogRegisterData((char *)(&checkPointPlus), sizeof(checkPointPlus));
} else if (t_thrd.proc->workingVersionNum > GTM_OLD_VERSION_NUM && u_sess->attr.attr_common.upgrade_mode != 1) {
errno_t rcm = memcpy_s(&checkPointNew, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointNew.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
} else {
checkPointNew.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
}
XLogRegisterData((char *)(&checkPointNew), sizeof(checkPointNew));
} else {
XLogRegisterData((char *)(&checkPoint), sizeof(checkPoint));
}
recptr = XLogInsert(RM_XLOG_ID, shutdown ? XLOG_CHECKPOINT_SHUTDOWN : XLOG_CHECKPOINT_ONLINE);
XLogWaitFlush(recptr);
* We mustn't write any new WAL after a shutdown checkpoint, or it will be
* overwritten at next startup. No-one should even try, this just allows
* sanity-checking. In the case of an end-of-recovery checkpoint, we want
* to just temporarily disable writing until the system has exited
* recovery.
*/
if (shutdown) {
if ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) {
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1;
} else {
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 0;
}
}
if (doFullCheckpoint && ENABLE_INCRE_CKPT) {
XLogRecPtr MinRecLSN = ckpt_get_min_rec_lsn();
if (!XLogRecPtrIsInvalid(MinRecLSN) && XLByteLT(MinRecLSN, t_thrd.xlog_cxt.RedoRecPtr)) {
ereport(PANIC, (errmsg("current dirty page list head recLSN %08X/%08X smaller than redo lsn %08X/%08X",
(uint32)(MinRecLSN >> XLOG_LSN_SWAP), (uint32)MinRecLSN,
(uint32)(t_thrd.xlog_cxt.RedoRecPtr >> XLOG_LSN_SWAP),
(uint32)t_thrd.xlog_cxt.RedoRecPtr)));
}
}
* We now have ProcLastRecPtr = start of actual checkpoint record, recptr
* = end of actual checkpoint record.
*/
#ifdef ENABLE_MULTIPLE_NODES
if (shutdown && !XLByteEQ(checkPoint.redo, t_thrd.xlog_cxt.ProcLastRecPtr)) {
ereport(PANIC, (errmsg("concurrent transaction log activity while database system is shutting down")));
}
#else
if (shutdown && !XLByteEQ(checkPoint.redo, t_thrd.xlog_cxt.ProcLastRecPtr) && !XLogStandbyInfoActive()) {
ereport(PANIC, (errmsg("concurrent transaction log activity while database system is shutting down")));
}
#endif
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), t_thrd.xlog_cxt.ProcLastRecPtr,
&checkPoint, t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "CreateCheckPoint");
* Update the control file.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
}
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = t_thrd.xlog_cxt.ProcLastRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = checkPoint.redo;
errorno = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, sizeof(XLogRecPtr), 0, sizeof(XLogRecPtr));
securec_check(errorno, "", "");
ereport(LOG, (errmsg("will update control file (create checkpoint), shutdown:%d", shutdown)));
UpdateControlFile();
LWLockRelease(ControlFileLock);
{
xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&xlogctl->info_lck);
}
* We are now done with critical updates; no need for system panic if we
* have trouble while fooling with old log segments.
*/
END_CRIT_SECTION();
SyncPostCheckpoint();
if (!t_thrd.cbm_cxt.XlogCbmSys->firstCPCreated) {
t_thrd.cbm_cxt.XlogCbmSys->firstCPCreated = true;
}
if (doFullCheckpoint || ((unsigned int)flags & CHECKPOINT_IMMEDIATE) ||
elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
if (g_instance.proc_base->cbmwriterLatch) {
SetLatch(g_instance.proc_base->cbmwriterLatch);
}
}
* Delete old log files (those no longer needed even for previous
* checkpoint or the standbys in XLOG streaming).
*/
if (removexlog) {
RemoveOldXlogFiles(_logSegNo, recptr);
}
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
*/
if (!shutdown) {
PreallocXlogFiles(recptr);
}
LogCheckpointEnd(false);
TRACE_POSTGRESQL_CHECKPOINT_DONE(t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled);
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
}
* Flush all data in shared memory to disk, and fsync
*
* This is the common code shared between regular checkpoints and
* recovery restartpoints.
*/
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags, bool doFullCheckpoint)
{
if (ENABLE_DSS && IsInitdb &&
g_instance.dms_cxt.SSReformerControl.primaryInstId == INVALID_INSTANCEID) {
SSReadReformerCtrl();
}
gstrace_entry(GS_TRC_ID_CheckPointGuts);
CheckPointCLOG();
CheckPointCSNLOG();
CheckPointMultiXact();
CheckPointPredicate();
CheckPointRelationMap();
CheckPointReplicationSlots();
CheckPointSnapBuild();
* If enable_incremental_checkpoint is on, there are two scenarios:
* Incremental checkpoint, don't need flush any dirty page, and don't wait
* pagewriter thread flush dirty page.
* Do full checkpoint, checkpoint thread also don't flush any dirty page, but
* need wait pagewriter thread flush dirty page.
*/
CheckPointBuffers(flags, doFullCheckpoint);
#ifndef ENABLE_MULTIPLE_NODES
CheckPointReplicationOrigin();
#endif
CheckPointTwoPhase(checkPointRedo);
undo::CheckPointUndoSystemMeta(checkPointRedo);
gstrace_exit(GS_TRC_ID_CheckPointGuts);
}
/*
 * Record a replayed checkpoint in the recovery restart-point queue.
 *
 * The queue lives in g_instance.ckpt_cxt_ctl->ckpt_redo_state and is indexed
 * by "end % RESTART_POINT_QUEUE_LEN". When adding one more entry would reach
 * RESTART_POINT_QUEUE_LEN, "start" is advanced first, which gives up the
 * oldest entry. Does nothing unless incremental checkpoint is enabled.
 *
 * recordReadRecPtr - LSN stored as the entry's CkptLSN.
 * checkPoint       - checkpoint contents stored alongside it.
 */
void PushRestartPointToQueue(XLogRecPtr recordReadRecPtr, const CheckPoint checkPoint)
{
    if (!ENABLE_INCRE_CKPT) {
        return;
    }

    RecoveryQueueState *queue = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;

    (void)LWLockAcquire(queue->recovery_queue_lock, LW_EXCLUSIVE);

    /* Make room before writing the new entry. */
    if (queue->end - queue->start + 1 >= RESTART_POINT_QUEUE_LEN) {
        queue->start++;
    }

    uint slot = queue->end % RESTART_POINT_QUEUE_LEN;
    queue->ckpt_rec_queue[slot].CkptLSN = recordReadRecPtr;
    queue->ckpt_rec_queue[slot].checkpoint = checkPoint;
    queue->end++;

    LWLockRelease(queue->recovery_queue_lock);
}
* Save a checkpoint for recovery restart if appropriate
*
* This function is called each time a checkpoint record is read from XLOG.
* It must determine whether the checkpoint represents a safe restartpoint or
* not. If so, the checkpoint record is stashed in shared memory so that
* CreateRestartPoint can consult it. (Note that the latter function is
* executed by the checkpointer, while this one will be executed by the
* startup process.)
*/
static void RecoveryRestartPoint(const CheckPoint checkPoint, XLogRecPtr recordReadRecPtr)
{
const uint32 shitRightLength = 32;
g_instance.comm_cxt.predo_cxt.newestCheckpointLoc = recordReadRecPtr;
const_cast<CheckPoint &>(g_instance.comm_cxt.predo_cxt.newestCheckpoint) = const_cast<CheckPoint &>(checkPoint);
if (!IsRestartPointSafe(checkPoint.redo)) {
return;
}
if (IsExtremeRedo()) {
XLogRecPtr safeCheckPoint = ExtremeGetSafeMinCheckPoint();
if (XLByteEQ(safeCheckPoint, MAX_XLOG_REC_PTR) || XLByteLT(safeCheckPoint, recordReadRecPtr)) {
ereport(WARNING, (errmsg("RecoveryRestartPoint is false at %X/%X,last safe point is %X/%X",
(uint32)(recordReadRecPtr >> 32), (uint32)(recordReadRecPtr),
(uint32)(safeCheckPoint >> 32), (uint32)(safeCheckPoint))));
return;
}
} else if (!parallel_recovery::IsRecoveryRestartPointSafeForWorkers(recordReadRecPtr)) {
ereport(WARNING, (errmsg("RecoveryRestartPointSafe is false at %X/%X",
static_cast<uint32>(recordReadRecPtr >> shitRightLength),
static_cast<uint32>(recordReadRecPtr))));
return;
}
update_dirty_page_queue_rec_lsn(recordReadRecPtr, true);
pg_write_barrier();
PushRestartPointToQueue(recordReadRecPtr, checkPoint);
* Copy the checkpoint record to shared memory, so that checkpointer can
* work out the next time it wants to perform a restartpoint.
*/
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->lastCheckPointRecPtr = recordReadRecPtr;
xlogctl->lastCheckPoint = checkPoint;
SpinLockRelease(&xlogctl->info_lck);
if (t_thrd.xlog_cxt.needImmediateCkp) {
ereport(LOG, (errmsg("RecoveryRestartPoint do ImmediateCkp at %X/%X",
static_cast<uint32>(recordReadRecPtr >> shitRightLength),
static_cast<uint32>(recordReadRecPtr))));
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
t_thrd.xlog_cxt.needImmediateCkp = false;
}
}
bool IsRestartPointSafe(const XLogRecPtr checkPoint)
{
* Is it safe to restartpoint? We must ask each of the resource managers
* whether they have any partial state information that might prevent a
* correct restart from this point. If so, we skip this opportunity, but
* return at the next checkpoint record for another try.
*/
for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
if (!(RmgrTable[rmid].rm_safe_restartpoint())) {
ereport(WARNING, (errmsg("RM %d not safe to record restart point at %X/%X", rmid,
(uint32)(checkPoint >> 32), (uint32)checkPoint)));
return false;
}
}
* Also refrain from creating a restartpoint if we have seen any
* references to non-existent pages. Restarting recovery from the
* restartpoint would not see the references, so we would lose the
* cross-check that the pages belonged to a relation that was dropped
* later.
*/
if (XLogHaveInvalidPages()) {
ereport(WARNING, (errmsg("could not record restart point at %X/%X because there "
"are unresolved references to invalid pages",
(uint32)(checkPoint >> 32), (uint32)checkPoint)));
return false;
}
return true;
}
/*
 * Ask for every currently queued dirty page to be flushed.
 *
 * Only acts when incremental checkpoint is enabled. It publishes the redo
 * pointer and the current dirty-page-queue tail as the full-checkpoint
 * target and, if any dirty page exists, raises flush_all_dirty_page and runs
 * CheckPointBuffers() in full-checkpoint mode.
 *
 * flags - checkpoint flags forwarded to CheckPointBuffers().
 * redo  - redo pointer stored in full_ckpt_redo_ptr.
 */
void wait_all_dirty_page_flush(int flags, XLogRecPtr redo)
{
    if (!ENABLE_INCRE_CKPT) {
        return;
    }

    g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = redo;
    g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
    /* The target must be published before the flush flag is raised. */
    pg_memory_barrier();

    if (get_dirty_page_num() > 0) {
        g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
        ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages.", get_dirty_page_num())));
        CheckPointBuffers(flags, true);
    }
}
bool RecoveryQueueIsEmpty()
{
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
int num;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
num = state->end - state->start;
(void)LWLockRelease(state->recovery_queue_lock);
if (num == 0) {
return true;
} else {
return false;
}
}
/* Number of bytes in one kilobyte; max_redo_log_size is multiplied by this. */
const int BYTE_PER_KB = 1024;

/*
 * In the recovery phase, don't push the redo point to the latest position, to
 * avoid an I/O peak. Calculate the redo point based on the page flushing
 * speed and max_redo_log_size.
 *
 * restartCheckPoint - out: checkpoint contents belonging to the returned LSN.
 *                     It is left untouched when InvalidXLogRecPtr is returned
 *                     because the queue is empty.
 *
 * Returns the LSN of the checkpoint record chosen as restart point, or
 * InvalidXLogRecPtr if none is available. Entries in the restart-point queue
 * that precede the chosen one are given up by advancing the queue's "start".
 */
XLogRecPtr GetRestartPointInRecovery(CheckPoint *restartCheckPoint)
{
    volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
    XLogRecPtr restartRecPtr = InvalidXLogRecPtr;
    const XLogRecPtr curMinRecLSN = ckpt_get_min_rec_lsn();

    if (XLogRecPtrIsInvalid(curMinRecLSN)) {
        /*
         * No valid minimum dirty-page record LSN: keep at most the newest
         * queue entry and use the last replayed checkpoint directly.
         */
        (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
        state->start = state->end > 0 ? state->end - 1 : state->end;
        (void)LWLockRelease(state->recovery_queue_lock);

        SpinLockAcquire(&xlogctl->info_lck);
        restartRecPtr = xlogctl->lastCheckPointRecPtr;
        *restartCheckPoint = const_cast<CheckPoint &>(xlogctl->lastCheckPoint);
        SpinLockRelease(&xlogctl->info_lck);

        Assert(XLByteLE(restartRecPtr, get_dirty_page_queue_rec_lsn()));
        return restartRecPtr;
    }

    if (RecoveryQueueIsEmpty()) {
        return restartRecPtr;
    }

    SpinLockAcquire(&xlogctl->info_lck);
    const XLogRecPtr replayLastLSN = xlogctl->lastCheckPointRecPtr;
    SpinLockRelease(&xlogctl->info_lck);

    /* Aim max_redo_log_size behind the last replayed checkpoint, when that does not underflow. */
    const uint64 redoBudget = (uint64)u_sess->attr.attr_storage.max_redo_log_size * BYTE_PER_KB;
    const XLogRecPtr targetLSN = (replayLastLSN > redoBudget) ? (replayLastLSN - redoBudget) : curMinRecLSN;

    /*
     * If the pagewriter flushes dirty pages to disk quickly (the target is not
     * beyond curMinRecLSN), push the checkpoint location to the position
     * closest to curMinRecLSN; otherwise look around targetLSN.
     */
    const bool flushKeepsUp = XLByteLE(targetLSN, curMinRecLSN);
    const XLogRecPtr threshold = flushKeepsUp ? curMinRecLSN : targetLSN;

    (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
    const int num = state->end - state->start;

    /* Find the first queued checkpoint located above the threshold. */
    int firstAbove = -1;
    for (int i = 0; i < num; i++) {
        int probe = (state->start + i) % RESTART_POINT_QUEUE_LEN;
        if (state->ckpt_rec_queue[probe].CkptLSN > threshold) {
            firstAbove = i;
            break;
        }
    }

    if (firstAbove >= 0) {
        int chosen = firstAbove;

        if (firstAbove > 0) {
            if (flushKeepsUp) {
                /* Take the last entry that is not above curMinRecLSN. */
                chosen = firstAbove - 1;
            } else {
                /* Take whichever neighbour of targetLSN is nearer; a tie goes to the later entry. */
                int curLoc = (state->start + firstAbove) % RESTART_POINT_QUEUE_LEN;
                int prevLoc = (state->start + firstAbove - 1) % RESTART_POINT_QUEUE_LEN;
                uint64 gap = state->ckpt_rec_queue[curLoc].CkptLSN - targetLSN;
                if (targetLSN - state->ckpt_rec_queue[prevLoc].CkptLSN < gap) {
                    chosen = firstAbove - 1;
                }
            }
        }

        int loc = (state->start + chosen) % RESTART_POINT_QUEUE_LEN;
        restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
        *restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;

        /* "start" is only moved when the match was not already the head entry. */
        if (firstAbove > 0) {
            state->start = state->start + chosen;
        }
    }

    /* Nothing usable was picked: fall back to the newest queued checkpoint. */
    if (XLogRecPtrIsInvalid(restartRecPtr) && num > 0) {
        int loc = (state->end - 1) % RESTART_POINT_QUEUE_LEN;
        restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
        *restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;
        state->start = state->end - 1;
    }
    (void)LWLockRelease(state->recovery_queue_lock);

    return restartRecPtr;
}
* Establish a restartpoint if possible.
*
* This is similar to CreateCheckPoint, but is used during WAL recovery
* to establish a point from which recovery can roll forward without
* replaying the entire recovery log.
*
* Returns true if a new restartpoint was established. We can only establish
* a restartpoint if we have replayed a safe checkpoint record since last
* restartpoint.
*/
bool CreateRestartPoint(int flags)
{
XLogRecPtr lastCheckPointRecPtr;
CheckPoint lastCheckPoint;
XLogSegNo _logSegNo;
TimestampTz xtime;
int32 lastlrc = 0;
errno_t errorno = EOK;
bool recoveryInProgress = true;
bool doFullCkpt = !ENABLE_INCRE_CKPT;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
* Acquire CheckpointLock to ensure only one restartpoint or checkpoint
* happens at a time.
*/
gstrace_entry(GS_TRC_ID_CreateRestartPoint);
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
recoveryInProgress = RecoveryInProgress();
* Check that we're still in recovery mode. It's ok if we exit recovery
* mode after this check, the restart point is valid anyway.
*/
if (!recoveryInProgress && !SS_DISASTER_STANDBY_CLUSTER) {
ereport(LOG, (errmsg("skipping restartpoint, recovery has already ended")));
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
if (doFullCkpt ||
((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE))) {
doFullCkpt = true;
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end > 0 ? state->end - 1 : state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
SpinLockAcquire(&xlogctl->info_lck);
lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
lastCheckPoint = xlogctl->lastCheckPoint;
SpinLockRelease(&xlogctl->info_lck);
} else {
lastCheckPointRecPtr = GetRestartPointInRecovery(&lastCheckPoint);
}
* If the last checkpoint record we've replayed is already our last
* restartpoint, we can't perform a new restart point. We still update
* minRecoveryPoint in that case, so that if this is a shutdown restart
* point, we won't start up earlier than before. That's not strictly
* necessary, but when hot standby is enabled, it would be rather weird if
* the database opened up for read-only connections at a point-in-time
* before the last shutdown. Such time travel is still possible in case of
* immediate shutdown, though.
*
* We don't explicitly advance minRecoveryPoint when we do create a
* restartpoint. It's assumed that flushing the buffers will do that as a
* side-effect.
*/
if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
XLByteLE(lastCheckPoint.redo, t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo)) {
#ifdef USE_ASSERT_CHECKING
int mode = LOG;
#else
int mode = DEBUG2;
#endif
ereport(mode, (errmsg("skipping restartpoint, already performed at %X/%X", (uint32)(lastCheckPoint.redo >> 32),
(uint32)lastCheckPoint.redo)));
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
if ((unsigned int)flags & CHECKPOINT_IS_SHUTDOWN) {
wait_all_dirty_page_flush(flags, lastCheckPoint.redo);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), lastCheckPointRecPtr, &lastCheckPoint,
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "CreateRestartPoint");
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_CREATE_SNAPSHOT, 0);
#endif
* Update the shared RedoRecPtr so that the startup process can calculate
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds checkpoint_segments.
*
* Like in CreateCheckPoint(), hold off insertions to update it, although
* during recovery this is just pro forma, because no WAL insertions are
* happening.
*/
StartSuspendWalInsert(&lastlrc);
xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_SNAPSHOT_READY, lastCheckPointRecPtr);
#endif
StopSuspendWalInsert(lastlrc);
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RedoRecPtr = lastCheckPoint.redo;
SpinLockRelease(&xlogctl->info_lck);
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
errorno = memset_s(t_thrd.xlog_cxt.CheckpointStats, sizeof(CheckpointStatsData), 0, sizeof(CheckpointStatsData));
securec_check(errorno, "", "");
t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t = GetCurrentTimestamp();
if (u_sess->attr.attr_common.log_checkpoints) {
LogCheckpointStart((unsigned int)flags, true);
}
if (ENABLE_INCRE_CKPT && doFullCkpt) {
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = lastCheckPoint.redo;
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
pg_memory_barrier();
if (get_dirty_page_num() > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages.", get_dirty_page_num())));
} else if (ENABLE_INCRE_CKPT) {
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = lastCheckPoint.redo;
(void)LWLockAcquire(g_instance.ckpt_cxt_ctl->prune_queue_lock, LW_EXCLUSIVE);
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_loc_for_lsn(lastCheckPointRecPtr);
pg_memory_barrier();
LWLockRelease(g_instance.ckpt_cxt_ctl->prune_queue_lock);
uint64 head = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
int64 need_flush_num = g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc > head
? g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc - head
: 0;
if (need_flush_num > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages", need_flush_num)));
}
CheckPointGuts(lastCheckPoint.redo, flags, true);
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_BEGIN_CHECKPOINT, 0);
#endif
* Select point at which we can truncate the xlog, which we base on the
* prior checkpoint's earliest info.
*/
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, _logSegNo);
if (ENABLE_INCRE_CKPT) {
XLogRecPtr MinRecLSN = ckpt_get_min_rec_lsn();
if (!XLogRecPtrIsInvalid(MinRecLSN) && XLByteLT(MinRecLSN, lastCheckPointRecPtr)) {
ereport(WARNING, (errmsg("current dirty page list head recLSN %08X/%08X smaller than lsn %08X/%08X",
(uint32)(MinRecLSN >> XLOG_LSN_SWAP), (uint32)MinRecLSN,
(uint32)(lastCheckPointRecPtr >> XLOG_LSN_SWAP), (uint32)lastCheckPointRecPtr)));
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
}
(void)LWLockAcquire(RedoTruncateLock, LW_SHARED);
LWLockRelease(RedoTruncateLock);
* Update pg_control, using current time. Check that it still shows
* IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
* this is a quick hack to make sure nothing really bad happens if somehow
* we get here after the end-of-recovery checkpoint.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, lastCheckPoint.redo)) {
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = lastCheckPointRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = lastCheckPoint;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = lastCheckPoint.redo;
if ((unsigned int)flags & CHECKPOINT_IS_SHUTDOWN) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
}
UpdateControlFile();
}
LWLockRelease(ControlFileLock);
if (g_instance.proc_base->cbmwriterLatch) {
SetLatch(g_instance.proc_base->cbmwriterLatch);
}
* Delete old log files (those no longer needed even for previous
* checkpoint/restartpoint) to prevent the disk holding the xlog from
* growing full.
*/
if (_logSegNo) {
XLogRecPtr endptr;
endptr = GetStandbyFlushRecPtr(NULL);
KeepLogSeg(endptr, &_logSegNo, InvalidXLogRecPtr, false);
_logSegNo--;
* Save one more segment to handle the corner case. For example, standby has replayed to
* the end to one segment file (000000010000002400000078) and the LSN is exactly 24/79000000.
* The corresponding slot on primary is 24/79000000. As a result, if standby restart to
* request xlog stream from the lastest valid record (23/xxxxxxx), cannot find on primary.
*/
if (_logSegNo > 0) {
_logSegNo--;
}
* Update ThisTimeLineID to the timeline we're currently replaying,
* so that we install any recycled segments on that timeline.
*
* There is no guarantee that the WAL segments will be useful on the
* current timeline; if recovery proceeds to a new timeline right
* after this, the pre-allocated WAL segments on this timeline will
* not be used, and will go wasted until recycled on the next
* restartpoint. We'll live with that.
*/
SpinLockAcquire(&xlogctl->info_lck);
t_thrd.xlog_cxt.ThisTimeLineID = t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPoint.ThisTimeLineID;
SpinLockRelease(&xlogctl->info_lck);
RemoveOldXlogFiles(_logSegNo, endptr);
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
*/
PreallocXlogFiles(endptr);
}
* Truncate pg_csnlog if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_csnlog entry older than that (see Asserts
* in csnlog.c). When hot standby is disabled, though, we mustn't do
* this because StartupCSNLOG hasn't been called yet.
*/
if (g_instance.attr.attr_storage.EnableHotStandby && !IS_MULTI_DISASTER_RECOVER_MODE) {
pg_time_t now;
int elapsed_secs;
now = (pg_time_t)time(NULL);
elapsed_secs = now - t_thrd.checkpoint_cxt.last_truncate_log_time;
* Reduce the frequency of trucate CSN log to avoid the probability of lock contention.
* Incremental chekpoint does not require frequent truncate of csnlog.
*/
if (!ENABLE_INCRE_CKPT || elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
TransactionId globalXmin = InvalidTransactionId;
(void)GetOldestActiveTransactionId(&globalXmin);
TransactionId cutoffXid = GetOldestXmin(NULL);
if (TransactionIdIsNormal(globalXmin) && TransactionIdPrecedes(globalXmin, cutoffXid)) {
cutoffXid = globalXmin;
}
if (g_instance.attr.attr_storage.enable_ustore) {
TransactionId globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
if (TransactionIdIsNormal(globalRecycleXid) && TransactionIdPrecedes(globalRecycleXid, cutoffXid)) {
cutoffXid = globalRecycleXid;
}
}
TruncateCSNLOG(cutoffXid);
t_thrd.checkpoint_cxt.last_truncate_log_time = now;
}
}
LogCheckpointEnd(true);
xtime = GetLatestXTime();
ereport((u_sess->attr.attr_common.log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X", (uint32)(lastCheckPoint.redo >> 32),
(uint32)lastCheckPoint.redo),
xtime ? errdetail("last completed transaction was at log time %s", timestamptz_to_str(xtime)) : 0));
LWLockRelease(CheckpointLock);
* Finally, execute archive_cleanup_command, if any.
*/
if (t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand[0]) {
ExecuteRecoveryCommand(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand, "archive_cleanup_command", false);
}
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return true;
}
/*
 * Compute the oldest xlog segment that must be kept when pruning by
 * max_size_for_xlog_prune.
 *
 * curInsert         - current insert position.
 * quorumMinRequired - position needed by the sync-standby quorum, or
 *                     InvalidXLogRecPtr.
 * minToolsRequired  - position needed by tools, or InvalidXLogRecPtr.
 *
 * Returns the smallest of the size-based segment number, the quorum segment
 * (only when synchronous_commit is above local flush) and the tools segment
 * (only when that segment number is non-zero). The result is also logged.
 */
static XLogSegNo CalcRecycleSegNo(XLogRecPtr curInsert, XLogRecPtr quorumMinRequired, XLogRecPtr minToolsRequired)
{
    /* max_size_for_xlog_prune is shifted left by 10 bits (multiplied by 1024) to obtain bytes. */
    uint64 keepBytes = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
    XLogSegNo recycleSegNo = 1;

    if (XLByteLT(keepBytes + XLogSegSize, curInsert)) {
        XLByteToSeg(curInsert - keepBytes, recycleSegNo);
    }

    /*
     * For asynchronous replication, we do not care about connected sync standbys,
     * since standby build won't block xact commit on master.
     */
    if (u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH &&
        !XLByteEQ(quorumMinRequired, InvalidXLogRecPtr)) {
        XLogSegNo quorumSegNo;
        XLByteToSeg(quorumMinRequired, quorumSegNo);
        if (quorumSegNo < recycleSegNo) {
            recycleSegNo = quorumSegNo;
        }
    }

    if (!XLByteEQ(minToolsRequired, InvalidXLogRecPtr)) {
        XLogSegNo toolsSegNo;
        XLByteToSeg(minToolsRequired, toolsSegNo);
        if (toolsSegNo > 0 && toolsSegNo < recycleSegNo) {
            recycleSegNo = toolsSegNo;
        }
    }

    ereport(LOG, (errmsg("keep xlog segments after %lu, current wal size %lu(Bytes), curInsert %lu, "
                         "quorumMinRequired %lu, minToolsRequired %lu", recycleSegNo, keepBytes, curInsert,
                         quorumMinRequired, minToolsRequired)));
    return recycleSegNo;
}
/*
 * Compute the oldest xlog segment a main standby has to keep.
 *
 * curFlush    - current flush position, used for the size-based limit.
 * segno       - starting candidate segment number.
 * minRequired - minimal position required by replication slots, or
 *               InvalidXLogRecPtr.
 *
 * Returns 1 (keep everything) while a full standby build is running, or when
 * not all standby senders are in progress and the log is still smaller than
 * max_size_for_xlog_prune; otherwise the slot-limited or size-limited
 * segment number.
 */
static XLogSegNo CalcRecycleSegNoForHadrMainStandby(XLogRecPtr curFlush, XLogSegNo segno, XLogRecPtr minRequired)
{
    uint64 keepBytes = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
    XLogSegNo recycleSegNo = segno;

    /* First honour the replication slots, if there are any. */
    if (g_instance.attr.attr_storage.max_replication_slots > 0 && !XLByteEQ(minRequired, InvalidXLogRecPtr)) {
        XLogSegNo slotSegNo;
        XLByteToSeg(minRequired, slotSegNo);

        if (slotSegNo == 0) {
            recycleSegNo = 1;
            ereport(LOG, (errmsg("main standby keep all the xlog segments, because the minimal replication slot segno "
                                 "is less than or equal to zero")));
        } else if (slotSegNo < recycleSegNo) {
            recycleSegNo = slotSegNo;
        }
    }

    /* A running full build pins every segment. */
    if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY)) {
        ereport(LOG, (errmsg("keep all the xlog segments in Main standby, "
                             "because there is a full standby building processing.")));
        return 1;
    }

    /* Some cascade standby is offline: fall back to the size-based limit. */
    if (!WalSndAllInProgressForMainStandby(SNDROLE_PRIMARY_STANDBY)) {
        if (XLByteLT(keepBytes + XLogSegSize, curFlush)) {
            XLByteToSeg(curFlush - keepBytes, recycleSegNo);
        } else {
            recycleSegNo = 1;
            ereport(LOG, (errmsg("keep all the xlog segments in Main standby, "
                                 "because there is a Cascade standby offline.")));
        }
    }

    return recycleSegNo;
}
/*
 * Find the segment holding the smallest valid restart_lsn among the in-use
 * replication slots whose extra_content is NULL.
 *
 * Each slot is inspected under its own spinlock. If no slot qualifies, the
 * result is the segment number computed from InvalidXLogRecPtr.
 */
static XLogSegNo CalcRecycleArchiveSegNo()
{
    XLogRecPtr oldestRestart = InvalidXLogRecPtr;
    XLogSegNo oldestSegNo;

    for (int idx = 0; idx < g_instance.attr.attr_storage.max_replication_slots; idx++) {
        ReplicationSlot *slot = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[idx];
        volatile ReplicationSlot *vslot = slot;

        SpinLockAcquire(&slot->mutex);
        if (slot->in_use) {
            XLogRecPtr slotRestart = vslot->data.restart_lsn;

            /* Only slots without extra content take part in the minimum. */
            if (slot->extra_content == NULL && !XLByteEQ(slotRestart, InvalidXLogRecPtr) &&
                (XLByteEQ(oldestRestart, InvalidXLogRecPtr) || XLByteLT(slotRestart, oldestRestart))) {
                oldestRestart = slotRestart;
            }
        }
        SpinLockRelease(&slot->mutex);
    }

    XLByteToSeg(oldestRestart, oldestSegNo);
    return oldestSegNo;
}
/*
 * Compute the oldest xlog segment the main coordinator has to keep in a
 * streaming disaster-recovery setup.
 *
 * curInsert        - current insert position, used for the size-based limit.
 * logSegNo         - upper bound for the result.
 * minToolsRequired - position needed by tools, or InvalidXLogRecPtr.
 *
 * Returns the smaller of logSegNo and the segment number derived from the
 * build/standby state and the tools requirement.
 */
static XLogSegNo CalculateCNRecycleSegNoForStreamingHadr(XLogRecPtr curInsert, XLogSegNo logSegNo,
                                                         XLogRecPtr minToolsRequired)
{
    uint64 keepBytes = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
    XLogSegNo candidate = logSegNo;

    if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY)) {
        /* A full build needs every segment. */
        candidate = 1;
        ereport(LOG, (errmsg("keep all the xlog segments in Main Coordinator, "
                             "because there is a full building processing.")));
    } else if (!WalSndAllInProgressForMainStandby(SNDROLE_PRIMARY_STANDBY)) {
        /* A standby coordinator is offline: keep at most max_size_for_xlog_prune worth of xlog. */
        if (XLByteLT(keepBytes + XLogSegSize, curInsert)) {
            XLByteToSeg(curInsert - keepBytes, candidate);
        } else {
            candidate = 1;
            ereport(LOG, (errmsg("keep all the xlog segments in Main Coordinator, "
                                 "because there is a standby Coordinator offline.")));
        }
    }

    if (!XLByteEQ(minToolsRequired, InvalidXLogRecPtr)) {
        XLogSegNo toolsSegNo;
        XLByteToSeg(minToolsRequired, toolsSegNo);
        if (toolsSegNo < candidate) {
            candidate = toolsSegNo;
        }
    }

    /* Never return more than the caller's segment number. */
    return (candidate < logSegNo) ? candidate : logSegNo;
}
/*
 * Compute the segment number below which xlog may be force-recycled.
 *
 * The size limit is max_size_xlog_force_prune shifted left by 10.  When
 * curInsert has not yet advanced past that limit plus one segment, the
 * result starts at 1.
 *
 * curInsert    - position from which the size limit is measured
 * quorumMinReq - LSN still required by the quorum, or InvalidXLogRecPtr
 */
static XLogSegNo CalcForceRecycleSegNo(XLogRecPtr curInsert, XLogRecPtr quorumMinReq)
{
    uint64 forceKeepBytes = ((uint64)u_sess->attr.attr_storage.max_size_xlog_force_prune << 10);
    XLogSegNo forcedSegNo = 1;
    XLogSegNo quorumSegNo;

    if (XLByteLT(forceKeepBytes + XLogSegSize, curInsert)) {
        XLByteToSeg(curInsert - forceKeepBytes, forcedSegNo);
    }

    /*
     * For asynchronous replication, we do not care about connected sync
     * standbys, since a standby build won't block xact commit on the master.
     */
    if (u_sess->attr.attr_storage.guc_synchronous_commit <= SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
        return forcedSegNo;
    }
    if (XLByteEQ(quorumMinReq, InvalidXLogRecPtr)) {
        return forcedSegNo;
    }

    /* Do not recycle past what the quorum still needs. */
    XLByteToSeg(quorumMinReq, quorumSegNo);
    return (quorumSegNo < forcedSegNo) ? quorumSegNo : forcedSegNo;
}
/*
 * Record one xlog keeper entry.
 *
 * segno      - segment number to store; values that are not positive are stored as 1
 * index      - position in the xlogkeeper array (not range-checked here)
 * xlogkeeper - keeper array, or NULL, in which case nothing is done
 */
static void fill_keeper(XLogSegNo segno, int index, XlogKeeper *xlogkeeper)
{
    if (xlogkeeper == NULL) {
        return;
    }

    XlogKeeper *entry = &xlogkeeper[index];

    entry->segno = (segno > 0) ? segno : 1;
    entry->valid = true;
}
/*
 * Scan the archive status directory and record, in the WALKEEPER_ARCHIVE
 * entry of xlogkeeper, the segment that archiving still holds on to.
 *
 * Does nothing when archiving is not active, or when the directory holds
 * neither .ready nor .done files.  Raises ERROR if the directory cannot be
 * opened.
 */
static void search_archive_keeper(XlogKeeper *xlogkeeper)
{
    /* Status file names start with a 24 hex-digit segment name. */
    const size_t SEG_NAME_LEN = 24;
    const char *READY_SUFFIX = ".ready";
    const char *DONE_SUFFIX = ".done";
    DIR *xldir = NULL;
    struct dirent *xlde = NULL;
    XLogSegNo min_ready_xlog = 0;
    XLogSegNo max_done_xlog = 0;

    if (!XLogArchivingActive()) {
        return;
    }

    xldir = AllocateDir(ARCHIVEDIR);
    if (xldir == NULL) {
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open transaction log directory \"%s\": %m", ARCHIVEDIR)));
    }

    /*
     * .ready marks wal files that still need to be archived and .done marks
     * the ones whose archiving has finished.  So we regard the min of .ready,
     * or else the segment after the max of .done, as the one kept by archive;
     * if there are no .done or .ready files the archive keeps nothing.
     *
     * ReadDir is given the same path that was opened above, so that any read
     * error names the directory actually in use.
     */
    while ((xlde = ReadDir(xldir, ARCHIVEDIR)) != NULL) {
        const char *fname = xlde->d_name;
        size_t fname_len = strlen(fname);
        XLogSegNo file_segno = 0;
        TimeLineID tld = 0;

        if (fname_len <= SEG_NAME_LEN || strspn(fname, "0123456789ABCDEF") != SEG_NAME_LEN) {
            continue;
        }

        /* fname_len > 24, so subtracting either suffix length cannot underflow. */
        if (strcmp(fname + fname_len - strlen(READY_SUFFIX), READY_SUFFIX) == 0) {
            XLogFromFileName(fname, &tld, &file_segno);
            if (0 == min_ready_xlog || min_ready_xlog > file_segno) {
                min_ready_xlog = file_segno;
            }
        } else if (strcmp(fname + fname_len - strlen(DONE_SUFFIX), DONE_SUFFIX) == 0) {
            XLogFromFileName(fname, &tld, &file_segno);
            if (0 == max_done_xlog || max_done_xlog < file_segno) {
                max_done_xlog = file_segno;
            }
        }
    }
    FreeDir(xldir);

    if (0 == min_ready_xlog && 0 == max_done_xlog) {
        return;
    }
    if (0 == min_ready_xlog) {
        /* Only .done files exist: the next segment is the first not yet archived. */
        min_ready_xlog = max_done_xlog + 1;
    }
    fill_keeper(min_ready_xlog, WALKEEPER_ARCHIVE, xlogkeeper);
}
/*
 * Build the array of xlog keeper entries describing what currently holds
 * xlog segments back from recycling.
 *
 * During recovery the standby flush position is used as the base location
 * and no insert position is supplied; otherwise the redo pointer of the
 * checkpoint copy in the control file is the base and the current write
 * position is passed as the insert position.  The archive keeper entry is
 * filled in afterwards.
 *
 * Returns the array produced by KeepLogSeg().
 */
XlogKeeper* generate_xlog_keepers(void)
{
    XLogSegNo scratchSegNo = 0; /* KeepLogSeg's in/out segno; its value is not used here */
    XLogRecPtr basePtr;
    XLogRecPtr insertPtr;
    XlogKeeper *keepers = NULL;

    if (RecoveryInProgress()) {
        basePtr = GetStandbyFlushRecPtr(NULL);
        insertPtr = InvalidXLogRecPtr;
    } else {
        insertPtr = GetXLogWriteRecPtr();
        basePtr = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
    }

    keepers = KeepLogSeg(basePtr, &scratchSegNo, insertPtr, true);
    search_archive_keeper(keepers);
    return keepers;
}
* Retreat *logSegNo to the last segment that we need to retain because of
* either wal_keep_segments or replication slots.
*
* This is calculated by subtracting wal_keep_segments from the given xlog
* location, recptr and by making sure that that result is below the
* requirement of replication slots.
*
* DCF feature: Replication slot is no longer needed as consensus of XLog
* is achieved in DCF. Rather than WalSnd, XLog data are replicated via DCF.
*/
static XlogKeeper* KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo, XLogRecPtr curInsert, bool getkeeper)
{
XLogSegNo segno;
XLogSegNo wal_keep_segno;
XLogRecPtr keep;
XLogRecPtr xlogcopystartptr;
ReplicationSlotState repl_slot_state;
XlogKeeper *xlogkeeper = NULL;
const XLogSegNo KEEP_ALL = 1;
if(getkeeper)
xlogkeeper = (XlogKeeper*)palloc0(WALKEEPER_MAX * sizeof(XlogKeeper));
gstrace_entry(GS_TRC_ID_KeepLogSeg);
XLByteToSeg(recptr, segno);
fill_keeper(segno, WALKEEPER_BASECHECK, xlogkeeper);
if (segno <= (uint32)(u_sess->attr.attr_storage.wal_keep_segments)) {
ereport(LOG, (errmsg("keep all the xlog segments, because current segno = %lu, "
"less than wal_keep_segments = %d", segno,
(int)(u_sess->attr.attr_storage.wal_keep_segments))));
segno = KEEP_ALL;
} else {
segno = segno - (uint32)(u_sess->attr.attr_storage.wal_keep_segments);
}
wal_keep_segno = segno;
fill_keeper(wal_keep_segno, WALKEEPER_SEGMENT_KEEP, xlogkeeper);
ReplicationSlotsComputeRequiredLSN(&repl_slot_state);
keep = repl_slot_state.min_required;
ReplicationSlotReportRestartLSN();
if (g_instance.attr.attr_storage.max_replication_slots > 0 && !XLByteEQ(keep, InvalidXLogRecPtr)) {
XLogSegNo slotSegNo;
XLByteToSeg(keep, slotSegNo);
if (slotSegNo == 0) {
segno = KEEP_ALL;
ereport(LOG, (errmsg("keep all the xlog segments, because the minimal replication slot segno "
"is less than or equal to zero")));
} else if (slotSegNo < segno) {
segno = slotSegNo;
}
fill_keeper(segno, WALKEEPER_SLOTS, xlogkeeper);
}
LWLockAcquire(FullBuildXlogCopyStartPtrLock, LW_SHARED);
xlogcopystartptr = XlogCopyStartPtr;
LWLockRelease(FullBuildXlogCopyStartPtrLock);
if (!XLByteEQ(xlogcopystartptr, InvalidXLogRecPtr)) {
XLogSegNo slotSegNo;
XLByteToSeg(xlogcopystartptr, slotSegNo);
if (slotSegNo == 0) {
segno = KEEP_ALL;
ereport(LOG, (errmsg("keep all the xlog segments, because there is a full-build task in the backend, "
"and start segno is less than or equal to zero")));
} else if (slotSegNo < segno) {
segno = slotSegNo;
}
fill_keeper(segno, WALKEEPER_BASEBACKUP, xlogkeeper);
}
load_server_mode();
if (t_thrd.xlog_cxt.server_mode == NORMAL_MODE && IS_PGXC_COORDINATOR) {
XLogSegNo mainCNSegNo = CalculateCNRecycleSegNoForStreamingHadr(curInsert, segno,
repl_slot_state.min_tools_required);
if (mainCNSegNo < segno && mainCNSegNo > 0) {
segno = mainCNSegNo;
fill_keeper(segno, WALKEEPER_COODRECYCLE, xlogkeeper);
}
}
* In primary mode, we should do additional check.
*
* 1.When dn is in build(full or incremental), just keep log.
* 2.When not all standbys are alive in multi standby mode, further check enable_xlog_prune.
* If false, just keep log; otherwise, keep log only if walSize less than config.
* 3.If max_size_xlog_force_prune is set > 0, calculate segno with its config, and we fetch
* the larger one between this segno and the segno calculated with max_size_for_xlog_prune.
* 4.When standby is not alive in dummy standby mode, just keep log.
* 5.Notice the users if slot is invalid
*
* while dms and dss enable, t_thrd.xlog_cxt.server_mode only is normal_mode, we do additional
* check for dms and dss enabling.
*/
if ((!ENABLE_DMS && t_thrd.xlog_cxt.server_mode == PRIMARY_MODE) || SS_DISASTER_PRIMARY_NODE) {
if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY) ||
pg_atomic_read_u32(&g_instance.comm_cxt.current_gsrewind_count) > 0) {
segno = KEEP_ALL;
fill_keeper(segno, WALKEEPER_BUILD, xlogkeeper);
ereport(LOG, (errmsg("keep all the xlog segments, because there is a build task in the backend, "
"and gs_rewind count is more than zero")));
} else if (IS_DN_MULTI_STANDYS_MODE() && !g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
XLogSegNo prune_segno = 0;
XLogSegNo force_prune_segno = 0;
if (!WalSndAllInProgress(SNDROLE_PRIMARY_STANDBY)) {
if (!u_sess->attr.attr_storage.enable_xlog_prune) {
prune_segno = KEEP_ALL;
} else {
prune_segno = CalcRecycleSegNo(curInsert, repl_slot_state.quorum_min_required,
repl_slot_state.min_tools_required);
}
fill_keeper(prune_segno, WALKEEPER_INVALIDSEND, xlogkeeper);
}
if (u_sess->attr.attr_storage.max_size_xlog_force_prune > 0) {
force_prune_segno = CalcForceRecycleSegNo(curInsert, repl_slot_state.quorum_min_required);
fill_keeper(force_prune_segno, WALKEEPER_QUORUM_MIN, xlogkeeper);
}
if (prune_segno > 0 || force_prune_segno > 0) {
segno = prune_segno > force_prune_segno ? prune_segno : force_prune_segno;
ereport(LOG, (errmsg("enable_xlog_prune %s, prune_segno %lu; max_size_xlog_force_prune %dkB, "
"force_prune_segno %lu; so keep xlog segments after %lu, quorumMinReq %lu, minToolsReq %lu",
u_sess->attr.attr_storage.enable_xlog_prune ? "on" : "off", prune_segno,
u_sess->attr.attr_storage.max_size_xlog_force_prune, force_prune_segno, segno,
repl_slot_state.quorum_min_required, repl_slot_state.min_tools_required)));
}
} else if (IS_DN_DUMMY_STANDYS_MODE() && !WalSndInProgress(SNDROLE_PRIMARY_STANDBY)) {
segno = KEEP_ALL;
fill_keeper(segno, WALKEEPER_DUMMYSTANDBY, xlogkeeper);
ereport(LOG, (errmsg("keep all the xlog segments, because not all the wal senders are "
"in the role PRIMARY_STANDBY")));
} else {
uint64 maxKeepSize = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
XLogSegNo archiveSlotSegNo;
XLByteToSeg(repl_slot_state.min_archive_slot_required, archiveSlotSegNo);
if (!XLByteEQ(repl_slot_state.min_archive_slot_required, InvalidXLogRecPtr) &&
(XLByteLT(maxKeepSize + XLogSegSize, (curInsert - repl_slot_state.min_archive_slot_required)) &&
archiveSlotSegNo == segno)) {
segno = CalcRecycleArchiveSegNo();
g_instance.roach_cxt.isXLogForceRecycled = true;
fill_keeper(segno, WALKEEPER_RESISTARCHIVE, xlogkeeper);
ereport(LOG, (errmsg("force recycle archiving xlog, because archive required xlog keep segement size "
"is bigger than max keep size: %lu", maxKeepSize)));
}
}
if (XLByteEQ(keep, InvalidXLogRecPtr) && repl_slot_state.exist_in_use &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
segno = KEEP_ALL;
fill_keeper(segno, WALKEEPER_INVALIDSLOT, xlogkeeper);
ereport(WARNING, (errmsg("invalid replication slot recptr"),
errhint("Check slot configuration or setup standby/secondary")));
}
if (wal_keep_segno < segno) {
segno = wal_keep_segno;
}
}
if (u_sess->attr.attr_storage.enable_cbm_tracking) {
XLogSegNo cbm_tracked_segno;
XLByteToSeg(GetCBMTrackedLSN(), cbm_tracked_segno);
if (cbm_tracked_segno < segno && cbm_tracked_segno > 0) {
segno = cbm_tracked_segno;
}
if(cbm_tracked_segno > 0)
fill_keeper(cbm_tracked_segno, WALKEEPER_CBM, xlogkeeper);
}
if (t_thrd.xlog_cxt.server_mode != PRIMARY_MODE && t_thrd.xlog_cxt.server_mode != NORMAL_MODE) {
LWLockAcquire(XlogRemoveSegLock, LW_SHARED);
if (XlogRemoveSegPrimary < segno && XlogRemoveSegPrimary > 0) {
segno = XlogRemoveSegPrimary;
}
if(XlogRemoveSegPrimary > 0)
fill_keeper(XlogRemoveSegPrimary, WALKEEPER_CHECKPOINT, xlogkeeper);
LWLockRelease(XlogRemoveSegLock);
}
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE && t_thrd.xlog_cxt.is_hadr_main_standby &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
XLogSegNo mainStandbySegNo = CalcRecycleSegNoForHadrMainStandby(recptr, segno, repl_slot_state.min_required);
if (mainStandbySegNo < segno && mainStandbySegNo > 0) {
segno = mainStandbySegNo;
}
}
if (RecoveryInProgress() && IS_EXRTO_READ && !dummyStandbyMode && IS_EXRTO_READ_OPT) {
XLogRecPtr recycle_recptr = pg_atomic_read_u64(&g_instance.comm_cxt.predo_cxt.global_recycle_lsn);
XLogSegNo recyle_segno;
XLByteToSeg(recycle_recptr, recyle_segno);
if (recyle_segno == 0) {
segno = KEEP_ALL;
} else if (recyle_segno < segno) {
segno = recyle_segno;
}
fill_keeper(segno, WALKEEPER_EXRTO_STANDBY_READ, xlogkeeper);
}
if (segno < *logSegNo && segno > 0) {
*logSegNo = segno;
}
gstrace_exit(GS_TRC_ID_KeepLogSeg);
return xlogkeeper;
}
* Write a NEXTOID log record
*/
void XLogPutNextOid(Oid nextOid)
{
XLogBeginInsert();
XLogRegisterData((char *)(&nextOid), sizeof(Oid));
(void)XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
* We need not flush the NEXTOID record immediately, because any of the
* just-allocated OIDs could only reach disk as part of a tuple insert or
* update that would have its own XLOG record that must follow the NEXTOID
* record. Therefore, the standard buffer LSN interlock applied to those
* records will ensure no such OID reaches disk before the NEXTOID record
* does.
*
* Note, however, that the above statement only covers state "within" the
* database. When we use a generated OID as a file or directory name, we
* are in a sense violating the basic WAL rule, because that filesystem
* change may reach disk before the NEXTOID WAL record does. The impact
* of this is that if a database crash occurs immediately afterward, we
* might after restart re-generate the same OID and find that it conflicts
* with the leftover file or directory. But since for safety's sake we
* always loop until finding a nonconflicting filename, this poses no real
* problem in practice. See pgsql-hackers discussion 27-Sep-2006.
*/
}
* Write an XLOG SWITCH record.
*
* Here we just blindly issue an XLogInsert request for the record.
* All the magic happens inside XLogInsert.
*
* The return value is either the end+1 address of the switch record,
* or the end+1 address of the prior segment if we did not need to
* write a switch record because we are already at segment start.
*/
XLogRecPtr RequestXLogSwitch(void)
{
XLogBeginInsert();
return XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
}
* Write a RESTORE POINT record
*/
XLogRecPtr XLogRestorePoint(const char *rpName)
{
XLogRecPtr RecPtr;
xl_restore_point xlrec;
errno_t retcode;
xlrec.rp_time = GetCurrentTimestamp();
retcode = strncpy_s(xlrec.rp_name, MAXFNAMELEN, rpName, strlen(rpName));
securec_check(retcode, "\0", "\0");
XLogBeginInsert();
XLogRegisterData((char *)&xlrec, sizeof(xl_restore_point));
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
ereport(LOG, (errmsg("restore point \"%s\" created at %X/%X", rpName, (uint32)(RecPtr >> 32), (uint32)RecPtr)));
return RecPtr;
}
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
*/
static void XLogReportParameters(void)
{
if (g_instance.attr.attr_storage.wal_level != t_thrd.shemem_ptr_cxt.ControlFile->wal_level ||
g_instance.shmem_cxt.MaxConnections != t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections ||
g_instance.attr.attr_storage.max_prepared_xacts != t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts ||
g_instance.attr.attr_storage.max_locks_per_xact != t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact) {
* The change in number of backend slots doesn't need to be WAL-logged
* if archiving is not enabled, as you can't start archive recovery
* with wal_level=minimal anyway. We don't really care about the
* values in pg_control either if wal_level=minimal, but seems better
* to keep them up-to-date to avoid confusion.
*/
if ((g_instance.attr.attr_storage.wal_level != t_thrd.shemem_ptr_cxt.ControlFile->wal_level || XLogIsNeeded())
&& !SS_DISASTER_STANDBY_CLUSTER && !SS_STANDBY_MODE) {
xl_parameter_change xlrec;
XLogRecPtr recptr;
xlrec.MaxConnections = g_instance.shmem_cxt.MaxConnections;
xlrec.max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
xlrec.max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
xlrec.wal_level = g_instance.attr.attr_storage.wal_level;
XLogBeginInsert();
XLogRegisterData((char *)&xlrec, sizeof(xlrec));
recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
XLogWaitFlush(recptr);
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = g_instance.shmem_cxt.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = g_instance.attr.attr_storage.wal_level;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
}
* Update full_page_writes in shared memory, and write an
* XLOG_FPW_CHANGE record if necessary.
*
* Note: this function assumes there is no other process running
* concurrently that could update it.
*/
void UpdateFullPageWrites(void)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
int32 lastlrc = 0;
* Do nothing if full_page_writes has not been changed.
*
* It's safe to check the shared full_page_writes without the lock,
* because we assume that there is no concurrently running process which
* can update it.
*/
if (ENABLE_INCRE_CKPT) {
u_sess->attr.attr_storage.fullPageWrites = false;
}
if (u_sess->attr.attr_storage.fullPageWrites == Insert->fullPageWrites) {
return;
}
START_CRIT_SECTION();
* It's always safe to take full page images, even when not strictly
* required, but not the other round. So if we're setting full_page_writes
* to true, first set it true and then write the WAL record. If we're
* setting it to false, first write the WAL record and then set the global
* flag.
*/
if (u_sess->attr.attr_storage.fullPageWrites) {
StartSuspendWalInsert(&lastlrc);
Insert->fullPageWrites = true;
StopSuspendWalInsert(lastlrc);
}
* Write an XLOG_FPW_CHANGE record. This allows us to keep track of
* full_page_writes during archive recovery, if required.
*/
if (XLogStandbyInfoActive() && !RecoveryInProgress() && !SS_STANDBY_MODE) {
XLogBeginInsert();
XLogRegisterData((char *)(&u_sess->attr.attr_storage.fullPageWrites), sizeof(bool));
(void)XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
}
if (!u_sess->attr.attr_storage.fullPageWrites) {
StartSuspendWalInsert(&lastlrc);
Insert->fullPageWrites = false;
StopSuspendWalInsert(lastlrc);
}
END_CRIT_SECTION();
}
/*
 * Report whether the record is an XLOG-rmgr checkpoint record, either
 * shutdown or online.
 */
bool IsCheckPoint(XLogReaderState *record)
{
    if (XLogRecGetRmid(record) != RM_XLOG_ID) {
        return false;
    }

    /* Strip the bits covered by XLR_INFO_MASK; what is left is the opcode. */
    uint8 opcode = XLogRecGetInfo(record) & (~XLR_INFO_MASK);

    return opcode == XLOG_CHECKPOINT_SHUTDOWN || opcode == XLOG_CHECKPOINT_ONLINE;
}
bool IsCheckPoint(const XLogRecParseState *parseState)
{
uint8 info = parseState->blockparse.blockhead.xl_info & (~XLR_INFO_MASK);
RmgrId rmid = parseState->blockparse.blockhead.xl_rmid;
return rmid == RM_XLOG_ID && (info == XLOG_CHECKPOINT_SHUTDOWN || info == XLOG_CHECKPOINT_ONLINE);
}
bool is_backup_end(const XLogRecParseState *parse_state)
{
uint8 info = parse_state->blockparse.blockhead.xl_info & (~XLR_INFO_MASK);
RmgrId rmid = parse_state->blockparse.blockhead.xl_rmid;
return rmid == RM_XLOG_ID && (info == XLOG_BACKUP_END);
}
/*
 * Report whether the record is a shutdown checkpoint of the XLOG rmgr.
 * The name suggests this is the record type on which the timeline may be
 * updated -- confirm with the callers.
 */
bool HasTimelineUpdate(XLogReaderState *record)
{
    int rmgr = XLogRecGetRmid(record);
    uint8 opcode = XLogRecGetInfo(record) & (~XLR_INFO_MASK);

    if (rmgr != RM_XLOG_ID) {
        return false;
    }
    return opcode == XLOG_CHECKPOINT_SHUTDOWN;
}
/*
 * Adopt the timeline ID carried by a checkpoint record.
 *
 * Nothing happens when it matches the current timeline.  Otherwise the new
 * ID must not be lower than the current one and must be a member of
 * expectedTLIs; if not, PANIC.
 */
void UpdateTimeline(CheckPoint *checkPoint)
{
    if (checkPoint->ThisTimeLineID == t_thrd.xlog_cxt.ThisTimeLineID) {
        return;
    }

    bool wentBackwards = (checkPoint->ThisTimeLineID < t_thrd.xlog_cxt.ThisTimeLineID);

    if (wentBackwards || !list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)checkPoint->ThisTimeLineID)) {
        ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
                               checkPoint->ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
    }

    t_thrd.xlog_cxt.ThisTimeLineID = checkPoint->ThisTimeLineID;
}
* XLOG resource manager's routines
*
* Definitions of info values are in include/catalog/pg_control.h, though
* not all record types are related to control file updates.
*/
void xlog_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
XLogRecPtr lsn = record->EndRecPtr;
Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || !XLogRecHasAnyBlockRefs(record));
if (info == XLOG_NEXTOID) {
Oid nextOid;
errno_t rc;
* We used to try to take the maximum of ShmemVariableCache->nextOid
* and the recorded nextOid, but that fails if the OID counter wraps
* around. Since no OID allocation should be happening during replay
* anyway, better to just believe the record exactly. We still take
* OidGenLock while setting the variable, just in case.
*/
rc = memcpy_s(&nextOid, sizeof(Oid), XLogRecGetData(record), sizeof(Oid));
securec_check(rc, "", "");
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
} else if (info == XLOG_CHECKPOINT_SHUTDOWN) {
CheckPoint checkPoint;
CheckPointUndo checkPointUndo;
errno_t rc;
if (XLogRecGetDataLen(record) >= sizeof(checkPoint) && XLogRecGetDataLen(record) < sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(record), sizeof(CheckPoint));
securec_check(rc, "", "");
} else if (XLogRecGetDataLen(record) >= sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPointUndo, sizeof(CheckPointUndo), XLogRecGetData(record), sizeof(CheckPointUndo));
securec_check(rc, "", "");
checkPoint = checkPointUndo.ori_checkpoint;
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, checkPointUndo.globalRecycleXid);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, InvalidTransactionId);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin,
checkPointUndo.recent_global_xmin)) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
}
}
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, checkPoint.nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
}
ExtendCSNLOG(checkPoint.nextXid);
LWLockRelease(XidGenLock);
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_SLAVE_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit xlog_redo time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
LWLockAcquire(XlogRemoveSegLock, LW_EXCLUSIVE);
ereport(DEBUG1, (errmsg("redo checkpoint xlog change remove seg from %X/%X to %X/%X",
(uint32)(XlogRemoveSegPrimary >> 32), (uint32)XlogRemoveSegPrimary,
(uint32)(checkPoint.remove_seg >> 32), (uint32)checkPoint.remove_seg)));
XlogRemoveSegPrimary = checkPoint.remove_seg;
LWLockRelease(XlogRemoveSegLock);
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
* If we see a shutdown checkpoint while waiting for an end-of-backup
* record, the backup was canceled and the end-of-backup record will
* never arrive.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested &&
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint) &&
XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint)) {
if (IsRoachRestore()) {
ereport(WARNING,
(errmsg("exist a shutdown log during xlog redo due to resume backup appear switchover.")));
} else {
ereport(PANIC, (errmsg("online backup was canceled, recovery cannot continue")));
}
}
* If we see a shutdown checkpoint, we know that nothing was running
* on the master at this point. So fake-up an empty running-xacts
* record and use that here and now. Recover additional standby state
* for prepared transactions.
*/
if (t_thrd.xlog_cxt.standbyState >= STANDBY_INITIALIZED) {
TransactionId *xids = NULL;
int nxids = 0;
TransactionId oldestActiveXID = InvalidTransactionId;
TransactionId latestCompletedXid = InvalidTransactionId;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive. We're
* never overflowed at this point because all subxids are listed
* with their parent prepared transactions.
*/
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
Assert(TransactionIdIsNormal(latestCompletedXid));
running.latestCompletedXid = latestCompletedXid;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions();
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
{
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
* TLI may change in a shutdown checkpoint, but it shouldn't decrease
*/
if (checkPoint.ThisTimeLineID != t_thrd.xlog_cxt.ThisTimeLineID) {
if (checkPoint.ThisTimeLineID < t_thrd.xlog_cxt.ThisTimeLineID ||
!list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)checkPoint.ThisTimeLineID)) {
ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
checkPoint.ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
}
t_thrd.xlog_cxt.ThisTimeLineID = checkPoint.ThisTimeLineID;
}
RecoveryRestartPoint(checkPoint, record->ReadRecPtr);
} else if (info == XLOG_CHECKPOINT_ONLINE) {
CheckPoint checkPoint;
CheckPointUndo checkPointUndo;
errno_t rc;
if (XLogRecGetDataLen(record) >= sizeof(checkPoint) && XLogRecGetDataLen(record) < sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(record), sizeof(CheckPoint));
securec_check(rc, "", "");
} else if (XLogRecGetDataLen(record) >= sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPointUndo, sizeof(CheckPointUndo), XLogRecGetData(record), sizeof(CheckPointUndo));
securec_check(rc, "", "");
checkPoint = checkPointUndo.ori_checkpoint;
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, checkPointUndo.globalRecycleXid);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalFrozenXid, InvalidTransactionId);
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin,
checkPointUndo.recent_global_xmin)) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
}
}
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, checkPoint.nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
}
ExtendCSNLOG(checkPoint.nextXid);
LWLockRelease(XidGenLock);
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
MultiXactAdvanceNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->oldestXid, checkPoint.oldestXid)) {
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_SLAVE_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit xlog_redo time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
LWLockAcquire(XlogRemoveSegLock, LW_EXCLUSIVE);
ereport(DEBUG1, (errmsg("redo checkpoint xlog change remove seg from %X/%X to %X/%X",
(uint32)(XlogRemoveSegPrimary >> 32), (uint32)XlogRemoveSegPrimary,
(uint32)(checkPoint.remove_seg >> 32), (uint32)checkPoint.remove_seg)));
XlogRemoveSegPrimary = checkPoint.remove_seg;
LWLockRelease(XlogRemoveSegLock);
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&xlogctl->info_lck);
}
if (checkPoint.ThisTimeLineID != t_thrd.xlog_cxt.ThisTimeLineID) {
ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
checkPoint.ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
}
RecoveryRestartPoint(checkPoint, record->ReadRecPtr);
} else if (info == XLOG_NOOP) {
} else if (info == XLOG_SWITCH) {
} else if (info == XLOG_RESTORE_POINT) {
} else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) {
RedoBufferInfo buffer;
* Full-page image (FPI) records contain nothing else but a backup
* block. The block reference must include a full-page image -
* otherwise there would be no point in this record.
*
* No recovery conflicts are generated by these generic records - if a
* resource manager needs to generate conflicts, it has to define a
* separate WAL record type and redo routine.
*
* XLOG_FPI_FOR_HINT records are generated when a page needs to be
* WAL- logged because of a hint bit update. They are only generated
* when checksums are enabled. There is no difference in handling
* XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
* code just to distinguish them for statistics purposes.
*/
if ((XLogRecGetInfo(record) & XLR_INFO_MASK) == XLOG_MERGE_RECORD){
for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) {
if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("unexpected XLogReadBufferForRedo result when restoring backup block, block id: %d", block_id)));
}
UnlockReleaseBuffer(buffer.buf);
}
} else {
if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("unexpected XLogReadBufferForRedo result when restoring backup block")));
}
UnlockReleaseBuffer(buffer.buf);
}
} else if (info == XLOG_BACKUP_END) {
XLogRecPtr startpoint;
errno_t rc = EOK;
rc = memcpy_s(&startpoint, sizeof(startpoint), XLogRecGetData(record), sizeof(startpoint));
securec_check(rc, "", "");
if (XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, startpoint)) {
* We have reached the end of base backup, the point where
* pg_stop_backup() was done. The data on disk is now consistent.
* Reset backupStartPoint, and update minRecoveryPoint to make
* sure we don't allow starting up at an earlier point even if
* recovery is stopped and restarted soon after this.
*/
ereport(LOG, (errmsg("end of backup reached")));
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, lsn)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lsn;
}
rc = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, sizeof(XLogRecPtr), 0,
sizeof(XLogRecPtr));
securec_check(rc, "\0", "\0");
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
} else if (info == XLOG_PARAMETER_CHANGE) {
xl_parameter_change xlrec;
errno_t rc = EOK;
rc = memcpy_s(&xlrec, sizeof(xl_parameter_change), XLogRecGetData(record), sizeof(xl_parameter_change));
securec_check(rc, "", "");
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = xlrec.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = xlrec.wal_level;
* Update minRecoveryPoint to ensure that if recovery is aborted, we
* recover back up to this point before allowing hot standby again.
* This is particularly important if wal_level was set to 'archive'
* before, and is now 'hot_standby', to ensure you don't run queries
* against the WAL preceding the wal_level change. Same applies to
* decreasing max_* settings.
*/
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
if ((t_thrd.xlog_cxt.minRecoveryPoint != 0) && XLByteLT(t_thrd.xlog_cxt.minRecoveryPoint, lsn)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lsn;
}
UpdateControlFile();
LWLockRelease(ControlFileLock);
ereport(LOG, (errmsg("update ControlFile->minRecoveryPoint to %X/%X for xlog parameter change",
(uint32)((t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint) >> 32),
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint))));
CheckRequiredParameterValues(false);
} else if (info == XLOG_FPW_CHANGE) {
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool fpw = false;
errno_t rc = EOK;
rc = memcpy_s(&fpw, sizeof(bool), XLogRecGetData(record), sizeof(bool));
securec_check(rc, "", "");
* Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
* do_pg_start_backup() and do_pg_stop_backup() can check whether
* full_page_writes has been disabled during online backup.
*/
if (!fpw) {
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(xlogctl->lastFpwDisableRecPtr, record->ReadRecPtr)) {
xlogctl->lastFpwDisableRecPtr = record->ReadRecPtr;
}
SpinLockRelease(&xlogctl->info_lck);
}
t_thrd.xlog_cxt.lastFullPageWrites = fpw;
} else if (info == XLOG_DELAY_XLOG_RECYCLE) {
bool delay = false;
errno_t rc = EOK;
rc = memcpy_s(&delay, sizeof(bool), XLogRecGetData(record), sizeof(bool));
securec_check(rc, "", "");
if (delay == true) {
enable_delay_xlog_recycle(true);
} else {
disable_delay_xlog_recycle(true);
}
}
}
#ifdef WAL_DEBUG
/*
 * Append a one-line debugging summary of a decoded WAL record to 'buf':
 * the previous-record pointer, the xid, the data length and one entry per
 * block reference that is in use. A block reference for which
 * XLogRecHasBlockImage() is true is tagged with " FPW".
 */
static void xlog_outrec(StringInfo buf, XLogReaderState *record)
{
    XLogRecPtr prevRecPtr = XLogRecGetPrev(record);
    int idx;

    appendStringInfo(buf, "prev %X/%X; xid %u", (uint32)(prevRecPtr >> 32), (uint32)prevRecPtr,
                     XLogRecGetXid(record));
    appendStringInfo(buf, "; len %u", XLogRecGetDataLen(record));

    for (idx = 0; idx <= record->max_block_id; idx++) {
        RelFileNode rel;
        ForkNumber forkNo;
        BlockNumber blockNo;

        if (XLogRecHasBlockRef(record, idx)) {
            XLogRecGetBlockTag(record, idx, &rel, &forkNo, &blockNo);

            /* The fork number is only printed when it is not the main fork. */
            if (forkNo == MAIN_FORKNUM) {
                appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u", idx, rel.spcNode, rel.dbNode,
                                 rel.relNode, blockNo);
            } else {
                appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u", idx, rel.spcNode, rel.dbNode,
                                 rel.relNode, forkNo, blockNo);
            }

            if (XLogRecHasBlockImage(record, idx)) {
                appendStringInfo(buf, " FPW");
            }
        }
    }
}
#endif
* Return the (possible) sync flag used for opening a file, depending on the
* value of the GUC wal_sync_method.
*/
static int get_sync_bit(int method)
{
uint32 o_direct_flag = 0;
if (!u_sess->attr.attr_storage.enableFsync) {
return 0;
}
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will read
* the WAL soon after writing it, which is guaranteed to cause a physical
* read if we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
* reason.
*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* written by walreceiver is normally read by the startup process soon
* after its written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
if (!XLogIsNeeded() && !AmWalReceiverProcess()) {
o_direct_flag = PG_O_DIRECT;
}
switch (method) {
* enum values for all sync options are defined even if they are
* not supported on the current platform. But if not, they are
* not included in the enum option array, and therefore will never
* be seen here.
*/
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
return 0;
#ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN:
return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
case SYNC_METHOD_OPEN_DSYNC:
return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
default:
ereport(ERROR,
(errcode(ERRCODE_MOST_SPECIFIC_TYPE_MISMATCH), errmsg("unrecognized wal_sync_method: %d", method)));
return 0;
}
}
* GUC support
*/
void assign_xlog_sync_method(int new_sync_method, void *extra)
{
if (u_sess->attr.attr_storage.sync_method != new_sync_method) {
* To ensure that no blocks escape unsynced, force an fsync on the
* currently open log segment (if any). Also, if the open flag is
* changing, close the log file so it will be reopened (with new flag
* bit) at next use.
*/
if (t_thrd.xlog_cxt.openLogFile >= 0) {
pgstat_report_waitevent(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
if (pg_fsync(t_thrd.xlog_cxt.openLogFile) != 0) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not fsync log segment %s: %s",
XLogFileNameP(t_thrd.xlog_cxt.curFileTLI, t_thrd.xlog_cxt.readSegNo),
TRANSLATE_ERRNO)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (get_sync_bit(u_sess->attr.attr_storage.sync_method) != get_sync_bit(new_sync_method)) {
XLogFileClose();
}
}
}
}
* Issue appropriate kind of fsync (if any) for an XLOG output file.
*
* 'fd' is a file descriptor for the XLOG file to be fsync'd.
* 'log' and 'seg' are for error reporting purposes.
*/
void issue_xlog_fsync(int fd, XLogSegNo segno)
{
if (ENABLE_DSS) {
return;
}
switch (u_sess->attr.attr_storage.sync_method) {
case SYNC_METHOD_FSYNC:
if (pg_fsync_no_writethrough(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fsync log file %s: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno),
TRANSLATE_ERRNO)));
}
break;
#ifdef HAVE_FSYNC_WRITETHROUGH
case SYNC_METHOD_FSYNC_WRITETHROUGH:
if (pg_fsync_writethrough(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fsync write-through log file %s: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno),
TRANSLATE_ERRNO)));
}
break;
#endif
#ifdef HAVE_FDATASYNC
case SYNC_METHOD_FDATASYNC:
if (pg_fdatasync(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fdatasync log file %s: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno),
TRANSLATE_ERRNO)));
}
break;
#endif
case SYNC_METHOD_OPEN:
case SYNC_METHOD_OPEN_DSYNC:
break;
default:
ereport(PANIC, (errmsg("unrecognized wal_sync_method: %d", u_sess->attr.attr_storage.sync_method)));
break;
}
}
* Return the filename of given log segment, as a palloc'd string.
*/
char *XLogFileNameP(TimeLineID tli, XLogSegNo segno)
{
errno_t errorno = EOK;
char *result = (char *)palloc(MAXFNAMELEN);
errorno = snprintf_s(result, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
return result;
}
bool check_roach_start_backup(const char *slotName)
{
char backup_label[MAXPGPATH];
errno_t rc = EOK;
struct stat statbuf;
rc = snprintf_s(backup_label, sizeof(backup_label), sizeof(backup_label) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH,
slotName);
securec_check_ss(rc, "", "");
if (stat(backup_label, &statbuf)) {
if (errno != ENOENT) {
ereport(WARNING, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", backup_label, TRANSLATE_ERRNO)));
}
ereport(DEBUG1, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("roach backup %s is not in progress", slotName)));
return false;
}
return true;
}
static void CollectTableSpace(DIR *tblspcdir, List **tablespaces, StringInfo tblspc_mapfbuf, bool infotbssize)
{
struct dirent *de = NULL;
tablespaceinfo *ti = NULL;
const char *dataDir = NULL;
int datadirpathlen = -1;
dataDir = is_dss_file(TBLSPCDIR) ? g_instance.attr.attr_storage.dss_attr.ss_dss_data_vg_name : t_thrd.proc_cxt.DataDir;
datadirpathlen = strlen(dataDir);
while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) {
char fullpath[MAXPGPATH + PG_TBLSPCS];
char linkpath[MAXPGPATH];
char *relpath = NULL;
int rllen;
errno_t errorno = EOK;
errorno = memset_s(fullpath, MAXPGPATH + PG_TBLSPCS, '\0', MAXPGPATH + PG_TBLSPCS);
securec_check(errorno, "", "");
errorno = memset_s(linkpath, MAXPGPATH, '\0', MAXPGPATH);
securec_check(errorno, "", "");
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
errorno = snprintf_s(fullpath, MAXPGPATH + PG_TBLSPCS, MAXPGPATH + PG_TBLSPCS - 1,
"%s/%s", TBLSPCDIR, de->d_name);
securec_check_ss(errorno, "\0", "\0");
#if defined(HAVE_READLINK) || defined(WIN32)
rllen = readlink(fullpath, linkpath, sizeof(linkpath));
if (rllen < 0) {
ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %s", fullpath, TRANSLATE_ERRNO)));
continue;
} else if (rllen >= (int)sizeof(linkpath)) {
ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullpath)));
continue;
}
linkpath[rllen] = '\0';
* Relpath holds the relative path of the tablespace directory
* when it's located within PGDATA, or NULL if it's located
* elsewhere.
*/
if (rllen > datadirpathlen && strncmp(linkpath, dataDir, datadirpathlen) == 0 &&
IS_DIR_SEP(linkpath[datadirpathlen]))
relpath = linkpath + datadirpathlen + 1;
ti = (tablespaceinfo *)palloc(sizeof(tablespaceinfo));
ti->oid = pstrdup(de->d_name);
ti->path = pstrdup(linkpath);
ti->relativePath = relpath ? pstrdup(relpath) : NULL;
ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
if (tablespaces)
*tablespaces = lappend(*tablespaces, ti);
appendStringInfo(tblspc_mapfbuf, "%s %s\n", ti->oid, ti->path);
#else
* If the platform does not have symbolic links, it should not be
* possible to have tablespaces - clearly somebody else created
* them. Warn about it and ignore.
*/
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform")));
#endif
}
}
/*
 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
 * function. It creates the necessary starting checkpoint and constructs the
 * backup label file.
 *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
 * backup is started with pg_start_backup(), and there can be only one active
 * at a time. The backup label file of an exclusive backup is written to
 * $PGDATA/backup_label, and it is removed by pg_stop_backup().
 *
 * A non-exclusive backup is used for the streaming base backups (see
 * src/backend/replication/basebackup.c). The difference to exclusive backups
 * is that the backup label file is not written to disk. Instead, its would-be
 * contents are returned in *labelfile, and the caller is responsible for
 * including it in the backup archive as 'backup_label'. There can be many
 * non-exclusive backups active at the same time, and they don't conflict
 * with an exclusive backup either.
 *
 * Every successfully started non-exclusive backup must
 * be stopped by calling do_pg_stop_backup() or do_pg_abort_backup().
 */
/*
 * Parameters, as used by the code below:
 *   backupidstr       - text written on the "LABEL:" line; an ERROR is raised
 *                       when it is longer than MAXPGPATH. The exact value
 *                       "gs_ctl full build" selects the branch that reuses the
 *                       checkpoint already recorded in the control file.
 *   fast              - when true, CHECKPOINT_IMMEDIATE is added to the
 *                       checkpoint request.
 *   labelfile         - NULL selects an exclusive backup (the label is written
 *                       to a file); otherwise *labelfile receives the label text.
 *   tblspcdir, tablespaces, infotbssize
 *                     - passed through to CollectTableSpace().
 *   tblspcmapfile     - non-exclusive only: *tblspcmapfile receives the
 *                       tablespace map text when it is not empty.
 *   needtblspcmapfile - not referenced in this function.
 *
 * Returns startpoint, i.e. ControlFile->checkPointCopy.redo as read under
 * ControlFileLock.
 */
XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile, DIR *tblspcdir,
char **tblspcmapfile, List **tablespaces, bool infotbssize, bool needtblspcmapfile)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
pg_time_t stamp_time;
char strfbuf[128];
char xlogfilename[MAXFNAMELEN];
XLogSegNo _logSegNo;
struct stat stat_buf;
FILE *fp = NULL;
StringInfoData labelfbuf;
StringInfoData tblspc_mapfbuf;
int32 lastlrc = 0;
errno_t errorno = EOK;
gstrace_entry(GS_TRC_ID_do_pg_start_backup);
backup_started_in_recovery = RecoveryInProgress();
/* Permission check: superuser, replication role, or operator admin while operation_mode is set. */
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be system admin or replication role to run a backup."
"If in operation mode, you can also be operator admin to run a backup."))));
}
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
}
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
/* Reject an over-long label before any shared state is modified. */
if (strlen(backupidstr) > MAXPGPATH) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("backup label too long (max %d bytes)", MAXPGPATH)));
}
* Mark backup active in shared memory. We must do full-page WAL writes
* during an on-line backup even if not doing so at other times, because
* it's quite possible for the backup dump to obtain a "torn" (partially
* written) copy of a database page if it reads the page concurrently with
* our write to the same page. This can be fixed as long as the first
* write to the page in the WAL sequence is a full-page write. Hence, we
* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
* are no dirty pages in shared memory that might get dumped while the
* backup is in progress without having a corresponding WAL record. (Once
* the backup is complete, we need not force full-page writes anymore,
* since we expect that any pages not modified during the backup interval
* must have been correctly captured by the backup.)
*
* Note that forcePageWrites has no effect during an online backup from
* the standby.
*
* We must hold WALInsertLock to change the value of forcePageWrites, to
* ensure adequate interlocking against XLogInsertRecord().
*/
StartSuspendWalInsert(&lastlrc);
/* Register the backup: an exclusive backup sets a flag, a non-exclusive one bumps a counter. */
if (exclusive) {
if (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup) {
/* Resume WAL insertion before raising the error. */
StopSuspendWalInsert(lastlrc);
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"), errhint("Run pg_stop_backup() and try again.")));
}
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = true;
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups++;
}
/* forcePageWrites is turned off when ENABLE_INCRE_CKPT is set and turned on otherwise. */
if (ENABLE_INCRE_CKPT) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = true;
}
StopSuspendWalInsert(lastlrc);
/*
 * NOTE(review): presumably an ERROR raised inside the bracketed block below
 * runs pg_start_backup_callback(), which undoes the registration made above.
 */
PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive));
{
/* "gs_ctl full build": no checkpoint is requested; the control file's current values are used. */
if (strcmp(backupidstr, "gs_ctl full build") == 0) {
if (!backup_started_in_recovery) {
(void)RequestXLogSwitch();
}
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
LWLockRelease(ControlFileLock);
} else {
bool gotUniqueStartpoint = false;
* Force an XLOG file switch before the checkpoint, to ensure that the
* WAL segment the checkpoint is written to doesn't contain pages with
* old timeline IDs. That would otherwise happen if you called
* pg_start_backup() right after restoring from a PITR archive: the
* first WAL segment containing the startup checkpoint has pages in
* the beginning with the old timeline ID. That can cause trouble at
* recovery: we won't have a history file covering the old timeline if
* pg_xlog directory was not included in the base backup and the WAL
* archive was cleared too before starting the backup.
*
* This also ensures that we have emitted a WAL page header that has
* XLP_BKP_REMOVABLE off before we emit the checkpoint record.
* Therefore, if a WAL archiver (such as pglesslog) is trying to
* compress out removable backup blocks, it won't remove any that
* occur after this point.
*
* During recovery, we skip forcing XLOG file switch, which means that
* the backup taken during recovery is not available for the special
* recovery case described above.
*/
if (!backup_started_in_recovery) {
(void)RequestXLogSwitch();
}
/* Repeat the checkpoint until this backup's startpoint is newer than lastBackupStart. */
do {
bool checkpointfpw = false;
* Force a CHECKPOINT. Aside from being necessary to prevent torn
* page problems, this guarantees that two successive backup runs
* will have different checkpoint positions and hence different
* history file names, even if nothing happened in between.
*
* During recovery, establish a restartpoint if possible. We use
* the last restartpoint as the backup starting checkpoint. This
* means that two successive backup runs can have same checkpoint
* positions.
*
* Since the fact that we are executing do_pg_start_backup()
* during recovery means that checkpointer is running, we can use
* RequestCheckpoint() to establish a restartpoint.
*
* We use CHECKPOINT_IMMEDIATE only if requested by user (via
* passing fast = true). Otherwise this can take awhile.
*/
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | (fast ? CHECKPOINT_IMMEDIATE : 0));
* Now we need to fetch the checkpoint record location, and also
* its REDO pointer. The oldest point in WAL that would be needed
* to restore starting from the checkpoint is precisely the REDO
* pointer.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
checkpointfpw = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.fullPageWrites;
LWLockRelease(ControlFileLock);
if (backup_started_in_recovery) {
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
* Check to see if all WAL replayed during online backup
* (i.e., since last restartpoint used as backup starting
* checkpoint) contain full-page writes.
*/
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastFpwDisableRecPtr;
SpinLockRelease(&xlogctl->info_lck);
/* The full_page_writes check below is compiled only under ENABLE_MULTIPLE_NODES. */
#ifdef ENABLE_MULTIPLE_NODES
if (!checkpointfpw || XLByteLE(startpoint, recptr)) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"since last restartpoint"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the master, "
"and then try an online backup again.")));
}
#endif
* During recovery, since we don't use the end-of-backup WAL
* record and don't write the backup history file, the
* starting WAL location doesn't need to be unique. This means
* that two base backups started at the same time might use
* the same checkpoint as starting locations.
*/
gotUniqueStartpoint = true;
}
* If two base backups are started at the same time (in WAL sender
* processes), we need to make sure that they use different
* checkpoints as starting locations, because we use the starting
* WAL location as a unique identifier for the base backup in the
* end-of-backup WAL record and when we write the backup history
* file. Perhaps it would be better generate a separate unique ID
* for each backup instead of forcing another checkpoint, but
* taking a checkpoint right after another is not that expensive
* either because only few buffers have been dirtied yet.
*/
StartSuspendWalInsert(&lastlrc);
if (XLByteLT(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.lastBackupStart, startpoint)) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
StopSuspendWalInsert(lastlrc);
} while (!gotUniqueStartpoint);
}
/* Build the name of the WAL segment file that contains startpoint. */
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(xlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
* Construct tablespace_map file
*/
initStringInfo(&tblspc_mapfbuf);
CollectTableSpace(tblspcdir, tablespaces, &tblspc_mapfbuf, infotbssize);
* Construct backup label file
*/
initStringInfo(&labelfbuf);
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(startpoint >> 32),
(uint32)startpoint, xlogfilename);
appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(checkpointloc >> 32),
(uint32)checkpointloc);
appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n", exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n", backup_started_in_recovery ? "standby" : "master");
appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
* Okay, write the file, or return its contents to caller.
*/
if (exclusive) {
* Check for existing backup label --- implies a backup is already
* running. (XXX given that we checked exclusiveBackup above,
* maybe it would be OK to just unlink any such label file?)
*/
char *fileName = BACKUP_LABEL_FILE;
/* A session whose application_name is "gs_roach" uses a separate label file name. */
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
fileName = BACKUP_LABEL_FILE_ROACH;
}
if (stat(fileName, &stat_buf) != 0) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
} else {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
fileName)));
}
fp = AllocateFile(fileName, "w");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
/* Write, flush and fsync the label, then close it; any failing step raises the same error. */
if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 || fflush(fp) != 0 || pg_fsync(fileno(fp)) != 0 ||
ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
pfree(labelfbuf.data);
/* The tablespace map file is written only when at least one tablespace line was collected. */
if (tblspc_mapfbuf.len > 0) {
if (stat(TABLESPACE_MAP, &stat_buf) != 0) {
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
} else
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
TABLESPACE_MAP)));
fp = AllocateFile(TABLESPACE_MAP, "w");
if (!fp)
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not create file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
if (fwrite(tblspc_mapfbuf.data, tblspc_mapfbuf.len, 1, fp) != 1 || fflush(fp) != 0 ||
pg_fsync(fileno(fp)) != 0 || ferror(fp) || FreeFile(fp))
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not write file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
}
pfree(tblspc_mapfbuf.data);
} else {
/* Non-exclusive: hand the buffers to the caller instead of writing files. */
*labelfile = labelfbuf.data;
if (tblspc_mapfbuf.len > 0)
*tblspcmapfile = tblspc_mapfbuf.data;
}
}
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive));
/* Record in the session which kind of backup it has started. */
if (exclusive) {
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
} else {
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
}
gstrace_exit(GS_TRC_ID_do_pg_start_backup);
* We're done. As a convenience, return the starting WAL location.
*/
return startpoint;
}
/*
 * Build the backup label and tablespace map for a streamed backup taken from
 * a standby, using the checkpoint currently recorded in the control file.
 *
 * No checkpoint is requested and no shared backup state is modified here.
 * *labelFile always receives the label text; *tblSpcMapFile receives the
 * tablespace map text only when at least one tablespace line was collected.
 * 'tableSpaces', 'tblSpcDir' and 'infoTbsSize' are passed to
 * CollectTableSpace(). Returns the redo pointer of that checkpoint.
 */
XLogRecPtr StandbyDoStartBackup(const char *backupidstr, char **labelFile, char **tblSpcMapFile, List **tableSpaces,
                                DIR *tblSpcDir, bool infoTbsSize)
{
    XLogRecPtr ckptLoc;
    XLogRecPtr redoPtr;
    XLogSegNo segNo;
    char segFileName[MAXFNAMELEN];
    char timeText[128];
    pg_time_t nowStamp;
    StringInfoData mapBuf;
    StringInfoData labelBuf;
    errno_t rc = EOK;

    /* Take a consistent snapshot of the checkpoint location and its redo pointer. */
    LWLockAcquire(ControlFileLock, LW_SHARED);
    ckptLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
    redoPtr = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
    LWLockRelease(ControlFileLock);

    /* Name of the WAL segment file that contains the redo pointer. */
    XLByteToSeg(redoPtr, segNo);
    rc = snprintf_s(segFileName, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
                    (uint32)((segNo) / XLogSegmentsPerXLogId), (uint32)((segNo) % XLogSegmentsPerXLogId));
    securec_check_ss(rc, "", "");

    /* Construct tablespace_map contents. */
    initStringInfo(&mapBuf);
    CollectTableSpace(tblSpcDir, tableSpaces, &mapBuf, infoTbsSize);

    /* Construct backup label contents. */
    initStringInfo(&labelBuf);
    nowStamp = (pg_time_t)time(NULL);
    pg_strftime(timeText, sizeof(timeText), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&nowStamp, log_timezone));
    appendStringInfo(&labelBuf, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(redoPtr >> 32), (uint32)redoPtr,
                     segFileName);
    appendStringInfo(&labelBuf, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(ckptLoc >> 32), (uint32)ckptLoc);
    appendStringInfo(&labelBuf, "BACKUP METHOD: streamed\n");
    appendStringInfo(&labelBuf, "BACKUP FROM: standby\n");
    appendStringInfo(&labelBuf, "START TIME: %s\n", timeText);
    appendStringInfo(&labelBuf, "LABEL: %s\n", backupidstr);

    /* Nothing is written to disk; the buffers are handed to the caller. */
    *labelFile = labelBuf.data;
    if (mapBuf.len > 0) {
        *tblSpcMapFile = mapBuf.data;
    }

    return redoPtr;
}
static void pg_start_backup_callback(int code, Datum arg)
{
bool exclusive = DatumGetBool(arg);
int32 lastlrc = 0;
StartSuspendWalInsert(&lastlrc);
if (exclusive) {
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
} else {
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;
}
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
StopSuspendWalInsert(lastlrc);
}
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 */
XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive, unsigned long long *consensusPaxosIdx)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr startpoint;
XLogRecPtr stoppoint;
pg_time_t stamp_time;
char strfbuf[128];
char backupbuf[MAX_BACKUP_LEN] = {0};
char histfilepath[MAXPGPATH];
char startxlogfilename[MAXFNAMELEN];
char stopxlogfilename[MAXFNAMELEN];
char lastxlogfilename[MAXFNAMELEN];
char histfilename[MAXFNAMELEN];
char backupfrom[20];
XLogSegNo _logSegNo;
FILE *lfp = NULL;
FILE *fp = NULL;
char ch;
int seconds_before_warning;
int waits = 0;
bool reported_waiting = false;
char *remaining = NULL;
char *ptr = NULL;
uint32 hi, lo;
int32 lastlrc = 0;
errno_t errorno = EOK;
gstrace_entry(GS_TRC_ID_do_pg_stop_backup);
backup_started_in_recovery = RecoveryInProgress();
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be system admin or replication role to run a backup."
"If in operation mode, you can also be operator admin to run a backup."))));
}
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
}
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
* OK to update backup counters and forcePageWrites
*/
StartSuspendWalInsert(&lastlrc);
if (exclusive) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
} else {
* The user-visible pg_start/stop_backup() functions that operate on
* exclusive backups can be called at any time, but for non-exclusive
* backups, it is expected that each do_pg_start_backup() call is
* matched by exactly one do_pg_stop_backup() call.
*/
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;
}
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
StopSuspendWalInsert(lastlrc);
* Clean up session-level lock.
*
* You might think that WALInsertLockRelease() can be called before
* cleaning up session-level lock because session-level lock doesn't need
* to be protected with WAL insertion lock. But since
* CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
* cleaned up before it.
*/
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
if (exclusive) {
* Read the existing label file into memory.
*/
struct stat statbuf;
int r;
char *fileName = BACKUP_LABEL_FILE;
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
fileName = BACKUP_LABEL_FILE_ROACH;
}
if (stat(fileName, &statbuf)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is not in progress")));
}
lfp = AllocateFile(fileName, "r");
if (lfp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
if (statbuf.st_size > BLCKSZ) {
ereport(ERROR, (errcode_for_file_access(), errmsg("file size is wrong, \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
labelfile = (char *)palloc(statbuf.st_size + 1);
r = fread(labelfile, statbuf.st_size, 1, lfp);
labelfile[statbuf.st_size] = '\0';
* Close and remove the backup label file
*/
if (r != 1 || ferror(lfp) || FreeFile(lfp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
if (unlink(fileName) != 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
if (unlink(TABLESPACE_MAP) != 0) {
ereport(DEBUG1,
(errcode_for_file_access(), errmsg("could not remove file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
}
}
}
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
if (sscanf_s(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &hi, &lo, startxlogfilename,
sizeof(startxlogfilename), &ch, 1) != 4 ||
ch != '\n') {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
startpoint = (((uint64)hi) << 32) | lo;
remaining = strchr(labelfile, '\n') + 1;
* Parse the BACKUP FROM line. If we are taking an online backup from the
* standby, we confirm that the standby has not been promoted during the
* backup.
*/
ptr = strstr(remaining, "BACKUP FROM:");
if ((ptr == NULL) || sscanf_s(ptr, "BACKUP FROM: %19s\n", backupfrom, sizeof(backupfrom)) != 1) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("the standby was promoted during online backup"),
errhint("This means that the backup being taken is corrupt "
"and should not be used. "
"Try taking another online backup.")));
}
* During recovery, we don't write an end-of-backup record. We assume that
* pg_control was backed up last and its minimum recovery point can be
* available as the backup end location. Since we don't have an
* end-of-backup record, we use the pg_control value to check whether
* we've reached the end of backup when starting recovery from this
* backup. We have no way of checking if pg_control wasn't backed up last
* however.
*
* We don't force a switch to new WAL file and wait for all the required
* files to be archived. This is okay if we use the backup to start the
* standby. But, if it's for an archive recovery, to ensure all the
* required files are available, a user should wait for them to be
* archived, or include them into the backup.
*
* We return the current minimum recovery point as the backup end
* location. Note that it can be greater than the exact backup end
* location if the minimum recovery point is updated after the backup of
* pg_control. This is harmless for current uses.
*
* XXX currently a backup history file is for informational and debug
* purposes only. It's not essential for an online backup. Furthermore,
* even if it's created, it will not be archived during recovery because
* an archiver is not invoked. So it doesn't seem worthwhile to write a
* backup history file during recovery.
*/
if (backup_started_in_recovery) {
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
* Check to see if all WAL replayed during online backup contain
* full-page writes.
*/
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastFpwDisableRecPtr;
SpinLockRelease(&xlogctl->info_lck);
#ifdef ENABLE_MULTIPLE_NODES
if (XLByteLE(startpoint, recptr)) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"during online backup"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the master, "
"and then try an online backup again.")));
}
#endif
LWLockAcquire(ControlFileLock, LW_SHARED);
stoppoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
LWLockRelease(ControlFileLock);
if (XLByteLE(stoppoint, startpoint)) {
stoppoint = GetXLogReplayRecPtr(NULL);
}
gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
return stoppoint;
}
* Write the backup-end xlog record
*/
XLogBeginInsert();
XLogRegisterData((char *)(&startpoint), sizeof(startpoint));
stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
char *fileName = BACKUP_LABEL_FILE_ROACH;
struct stat statbuf;
XLogWaitFlush(stoppoint);
if (stat(fileName, &statbuf)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("the file is not exist \"%s\": %s", fileName, TRANSLATE_ERRNO)));
}
if (durable_rename(BACKUP_LABEL_FILE_ROACH, BACKUP_LABEL_FILE_ROACH_DONE, ERROR)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %s",
BACKUP_LABEL_FILE_ROACH, BACKUP_LABEL_FILE_ROACH_DONE, TRANSLATE_ERRNO)));
}
}
* Force a switch to a new xlog segment file, so that the backup is valid
* as soon as archiver moves out the current segment file.
*/
(void)RequestXLogSwitch();
XLogWaitFlush(stoppoint);
XLByteToPrevSeg(stoppoint, _logSegNo);
errorno = snprintf_s(stopxlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId), (uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
* Write the backup history file
*/
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(histfilepath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X.%08X.backup", SS_XLOGDIR,
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId), (uint32)(startpoint % XLogSegSize));
securec_check_ss(errorno, "", "");
fp = AllocateFile(histfilepath, "w");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %s", histfilepath, TRANSLATE_ERRNO)));
}
errorno = snprintf_s(backupbuf, MAX_BACKUP_LEN, MAX_BACKUP_LEN - 1,
"START WAL LOCATION: %X/%X (file %s)\n"
"STOP WAL LOCATION: %X/%X (file %s)\n"
"%s"
"STOP TIME: %s\n",
(uint32)(startpoint >> 32), (uint32)startpoint, startxlogfilename,
(uint32)(stoppoint >> 32), (uint32)stoppoint, stopxlogfilename,
remaining,
strfbuf);
securec_check_ss(errorno, "", "");
fwrite(backupbuf, MAX_BACKUP_LEN, 1, fp);
if (fflush(fp) || ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %s", histfilepath, TRANSLATE_ERRNO)));
}
* Clean out any no-longer-needed history files. As a side effect, this
* will post a .ready file for the newly created history file, notifying
* the archiver that history file may be archived immediately.
*/
CleanupBackupHistory();
* If archiving is enabled, wait for all the required WAL files to be
* archived before returning. If archiving isn't enabled, the required WAL
* needs to be transported via streaming replication (hopefully with
* wal_keep_segments set high enough), or some more exotic mechanism like
* polling and copying files from pg_xlog with script. We have no
* knowledge of those mechanisms, so it's up to the user to ensure that he
* gets all the required WAL.
*
* We wait until both the last WAL file filled during backup and the
* history file have been archived, and assume that the alphabetic sorting
* property of the WAL files ensures any earlier WAL files are safely
* archived as well.
*
* We wait forever, since archive_command is supposed to work and we
* assume the admin wanted his backup to work completely. If you don't
* wish to wait, you can set statement_timeout. Also, some notices are
* issued to clue in anyone who might be doing this interactively.
*/
if (waitforarchive && XLogArchivingActive()) {
XLByteToPrevSeg(stoppoint, _logSegNo);
errorno = snprintf_s(lastxlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X",
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(histfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X.%08X.backup",
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId), (uint32)(startpoint % XLogSegSize));
securec_check_ss(errorno, "", "");
seconds_before_warning = 60;
waits = 0;
while (XLogArchiveIsBusy(lastxlogfilename) || XLogArchiveIsBusy(histfilename)) {
CHECK_FOR_INTERRUPTS();
if (!reported_waiting && waits > 5) {
ereport(NOTICE,
(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
reported_waiting = true;
}
pg_usleep(1000000L);
if (++waits >= seconds_before_warning) {
seconds_before_warning *= 2;
ereport(WARNING,
(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds "
"elapsed)",
waits),
errhint("Check that your archive_command is executing properly. "
"pg_stop_backup can be canceled safely, "
"but the database backup will not be usable without all the WAL segments.")));
}
}
ereport(NOTICE, (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
} else if (waitforarchive) {
ereport(NOTICE,
(errmsg(
"WAL archiving is not enabled; you must ensure that all required WAL segments are copied through "
"other means to complete the backup")));
}
gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
* We're done. As a convenience, return the ending WAL location.
*/
return stoppoint;
}
*
*/
/*
 * StandbyDoStopBackup
 *
 * Compute the stop location of a backup from the text of its backup label.
 *
 * labelfile: backup label contents; the first line must have the form
 *            "START WAL LOCATION: <hi>/<lo> (file <name>)" followed by '\n'.
 *
 * Returns the redo pointer of the checkpoint recorded in the control file,
 * replaced by the current replay pointer when it does not lie beyond the
 * backup start location, and then raised to the control file's minimum
 * recovery point and to the redo read pointer if either of those is larger.
 *
 * Raises ERROR if the WAL level is insufficient or if the first label line
 * cannot be parsed.
 */
XLogRecPtr StandbyDoStopBackup(char *labelfile)
{
    XLogRecPtr stopPoint;
    XLogRecPtr startPoint;
    XLogRecPtr minRecoveryPoint;
    XLogRecPtr redoReadPtr;
    uint32 hi, lo;
    char startxlogfilename[MAXFNAMELEN];
    char ch;

    /* Refuse to continue when the configured WAL level cannot support an online backup. */
    if (!XLogIsNeeded()) {
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("WAL level not sufficient for making an online backup"),
                 errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
    }

    /*
     * Read and parse the START WAL LOCATION line (this code is pretty crude,
     * but we are not expecting any variability in the file format).
     * The trailing %c must capture the newline that terminates the line.
     */
    if (sscanf_s(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &hi, &lo, startxlogfilename,
                 sizeof(startxlogfilename), &ch, 1) != 4 ||
        ch != '\n') {
        ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                        errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    }
    startPoint = (((uint64)hi) << 32) | lo;

    /* Take both control-file values under the same lock acquisition. */
    LWLockAcquire(ControlFileLock, LW_SHARED);
    stopPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
    minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
    LWLockRelease(ControlFileLock);

    /* A checkpoint redo pointer at or before the backup start is of no use; fall back to the replay pointer. */
    if (XLByteLE(stopPoint, startPoint)) {
        stopPoint = GetXLogReplayRecPtr(NULL);
    }

    /* Never report a stop location behind the minimum recovery point. */
    if (XLByteLT(stopPoint, minRecoveryPoint)) {
        stopPoint = minRecoveryPoint;
    }

    /*
     * Read the shared redo read pointer once, so that the value compared and
     * the value assigned are guaranteed to be the same one.
     */
    redoReadPtr = g_instance.comm_cxt.predo_cxt.redoPf.read_ptr;
    if (XLByteLT(stopPoint, redoReadPtr)) {
        stopPoint = redoReadPtr;
    }

    return stopPoint;
}
* do_pg_abort_backup: abort a running backup
*
* This does just the most basic steps of do_pg_stop_backup(), by taking the
* system out of backup mode, thus making it a lot more safe to call from
* an error handler.
*
* NB: This is only for aborting a non-exclusive backup that doesn't write
* backup_label. A backup started with pg_start_backup() needs to be finished
* with pg_stop_backup().
*/
/*
 * do_pg_abort_backup
 *
 * Take this session out of non-exclusive backup mode: decrement the shared
 * count of non-exclusive backups, clear forcePageWrites when no exclusive or
 * non-exclusive backup remains, and reset the session's backup state.
 * Does nothing when the session holds no non-exclusive backup.
 */
void do_pg_abort_backup(void)
{
    /*
     * Quick exit if session is not keeping around a non-exclusive backup
     * already started.
     */
    if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE) {
        return;
    }

    int32 lastlrc = 0;
    StartSuspendWalInsert(&lastlrc);

    /*
     * Re-check after suspending WAL insertion. Every exit taken from here on
     * must pair the suspend with StopSuspendWalInsert(); returning without it
     * would leave WAL insertion suspended.
     */
    if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE) {
        StopSuspendWalInsert(lastlrc);
        return;
    }

    Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
    t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;

    /* Full-page writes only need to stay forced while some backup is still running. */
    if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
        t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
        t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
    }

    u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
    StopSuspendWalInsert(lastlrc);
}
* Error cleanup callback for do_pg_abort_backup
*/
/*
 * DoAbortBackupCallback
 *
 * Exit callback that forwards to do_pg_abort_backup(). Neither the exit code
 * nor the registered argument is consulted.
 */
static void DoAbortBackupCallback(int code, Datum arg)
{
    (void)code;
    (void)arg;

    do_pg_abort_backup();
}
* Register a handler that will warn about unterminated backups at the end of
* session, unless this has already been done.
*/
/*
 * RegisterPersistentAbortBackupHandler
 *
 * Install DoAbortBackupCallback as a shared-memory-exit callback, at most
 * once per session; the session flag records that it has been installed.
 */
void RegisterPersistentAbortBackupHandler(void)
{
    if (!u_sess->proc_cxt.registerAbortBackupHandlerdone) {
        on_shmem_exit(DoAbortBackupCallback, BoolGetDatum(true));
        u_sess->proc_cxt.registerAbortBackupHandlerdone = true;
    }
}
* start roach backup.
* get current checkpoint location, together with its redo location, and write them into
* roach backup label.
* We avoid requesting extra checkpoint here and adopt a similar recovery logic as
* recovery from standby backup, i.e., backup pg_control last and use its minrecovery point
* to judge recovery consistency.
*/
/*
 * do_roach_start_backup
 *
 * Begin a roach backup identified by backupidstr.
 *
 * The latest checkpoint location and its redo pointer are taken from the
 * control file (no new checkpoint is requested) and written, together with a
 * timestamp and the backup id, into a label file named
 * "<BACKUP_LABEL_FILE_ROACH>.<backupidstr>".
 *
 * Returns the redo pointer of that checkpoint. Raises ERROR if the
 * preconditions checked by check_roach_backup_condition() fail, if the label
 * file already exists, or if it cannot be created and written.
 */
XLogRecPtr do_roach_start_backup(const char *backupidstr)
{
    XLogRecPtr checkpointLoc;
    XLogRecPtr redoLoc;
    XLogSegNo segNo;
    pg_time_t now;
    char timeText[128];
    char labelPath[128];
    char walFileName[MAXFNAMELEN];
    struct stat labelStat;
    StringInfoData label;
    FILE *labelFile = NULL;
    errno_t rc = EOK;
    bool inRecovery = false;

    gstrace_entry(GS_TRC_ID_do_pg_start_backup);

    inRecovery = RecoveryInProgress();
    check_roach_backup_condition(inRecovery, backupidstr);

    /*
     * Fetch the checkpoint record location and its REDO pointer from the
     * control file. To save IO this is an existing checkpoint rather than a
     * newly forced one.
     */
    LWLockAcquire(ControlFileLock, LW_SHARED);
    checkpointLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
    redoLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
    LWLockRelease(ControlFileLock);

    /* With a synchronous_commit level above local flush, wait on the checkpoint location first. */
    if (!inRecovery && u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
        if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
            SyncPaxosWaitForLSN(checkpointLoc);
        } else {
            SyncRepWaitForLSN(checkpointLoc);
        }
    }

    /* Name of the WAL segment file that contains the redo pointer. */
    XLByteToSeg(redoLoc, segNo);
    rc = snprintf_s(walFileName, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
                    (uint32)((segNo) / XLogSegmentsPerXLogId), (uint32)((segNo) % XLogSegmentsPerXLogId));
    securec_check_ss(rc, "", "");

    /* Assemble the label contents. */
    initStringInfo(&label);
    now = (pg_time_t)time(NULL);
    pg_strftime(timeText, sizeof(timeText), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&now, log_timezone));
    appendStringInfo(&label, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(redoLoc >> 32), (uint32)redoLoc,
                     walFileName);
    appendStringInfo(&label, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(checkpointLoc >> 32), (uint32)checkpointLoc);
    appendStringInfo(&label, "BACKUP METHOD: %s\n", "pg_start_backup");
    appendStringInfo(&label, "BACKUP FROM: %s\n", "roach_master");
    appendStringInfo(&label, "START TIME: %s\n", timeText);
    appendStringInfo(&label, "LABEL: %s\n", backupidstr);

    /* The label file is specific to this backup id. */
    rc = snprintf_s(labelPath, sizeof(labelPath), sizeof(labelPath) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH, backupidstr);
    securec_check_ss(rc, "\0", "\0");

    /* An existing label file means a backup with this id is already running. */
    if (stat(labelPath, &labelStat) == 0) {
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress"),
                 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
                         labelPath)));
    } else if (errno != ENOENT) {
        ereport(ERROR,
                (errcode_for_file_access(), errmsg("could not stat file \"%s\": %s", labelPath, TRANSLATE_ERRNO)));
    }

    labelFile = AllocateFile(labelPath, "w");
    if (labelFile == NULL) {
        ereport(ERROR,
                (errcode_for_file_access(), errmsg("could not create file \"%s\": %s", labelPath, TRANSLATE_ERRNO)));
    }

    /* Write, flush, fsync and close; the first step that fails reports the error. */
    if (fwrite(label.data, label.len, 1, labelFile) != 1 || fflush(labelFile) != 0 ||
        pg_fsync(fileno(labelFile)) != 0 || ferror(labelFile) || FreeFile(labelFile)) {
        ereport(ERROR,
                (errcode_for_file_access(), errmsg("could not write file \"%s\": %s", labelPath, TRANSLATE_ERRNO)));
    }

    pfree(label.data);

    gstrace_exit(GS_TRC_ID_do_pg_start_backup);
    return redoLoc;
}
* stop roach backup
* For primary, get current insert position.
* For standby, get current minRecoveryPoint.
* As roach start backup, we avoid requesting full checkpoint to save IO.
*/
/*
 * do_roach_stop_backup
 *
 * Finish the roach backup identified by backupidstr.
 *
 * The stop location is the end of the current WAL insert position (or the
 * control file's minimum recovery point on the recovery branch). The label
 * file "<BACKUP_LABEL_FILE_ROACH>.<backupidstr>" is then renamed to
 * "<BACKUP_LABEL_FILE_ROACH_DONE>.<backupidstr>". No checkpoint is requested.
 *
 * NOTE(review): check_roach_backup_condition(), as defined in this file,
 * raises ERROR when recovery is in progress, so the recovery branch below
 * does not appear to be reachable; it is kept unchanged.
 *
 * Returns the stop location. Raises ERROR if the preconditions fail, if the
 * label file does not exist (backup not in progress) or cannot be renamed.
 */
XLogRecPtr do_roach_stop_backup(const char *backupidstr)
{
    bool backup_started_in_recovery = false;
    XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
    XLogRecPtr stoppoint;
    char backup_label[MAXPGPATH];
    char backup_label_done[MAXPGPATH];
    errno_t errorno = EOK;
    struct stat statbuf;

    gstrace_entry(GS_TRC_ID_do_pg_stop_backup);

    backup_started_in_recovery = RecoveryInProgress();
    check_roach_backup_condition(backup_started_in_recovery, backupidstr);

    /* The label file written at backup start must still be present. */
    errorno = snprintf_s(backup_label, sizeof(backup_label), sizeof(backup_label) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH,
                         backupidstr);
    securec_check_ss(errorno, "\0", "\0");

    if (stat(backup_label, &statbuf)) {
        if (errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not stat file \"%s\": %s", backup_label, TRANSLATE_ERRNO)));
        }
        ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                        errmsg("roach backup %s is not in progress", backupidstr)));
    }

    if (backup_started_in_recovery) {
        LWLockAcquire(ControlFileLock, LW_SHARED);
        stoppoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
        LWLockRelease(ControlFileLock);
    } else {
        /*
         * Sample the current insert position. The temporary is declared
         * inside the same preprocessor branch that uses it, so that no
         * configuration is left with an unused variable.
         */
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
        uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
        stoppoint = XLogBytePosToEndRecPtr(current_bytepos);
#else
        SpinLockAcquire(&Insert->insertpos_lck);
        stoppoint = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
        SpinLockRelease(&Insert->insertpos_lck);
#endif
    }

    /* With a synchronous_commit level above local flush, wait on the stop location. */
    if (!backup_started_in_recovery &&
        u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
        if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
            SyncPaxosWaitForLSN(stoppoint);
        } else {
            SyncRepWaitForLSN(stoppoint);
        }
    }

    /* Mark the backup as finished by renaming its label file. */
    errorno = snprintf_s(backup_label_done, sizeof(backup_label_done), sizeof(backup_label_done) - 1, "%s.%s",
                         BACKUP_LABEL_FILE_ROACH_DONE, backupidstr);
    securec_check_ss(errorno, "\0", "\0");

    if (durable_rename(backup_label, backup_label_done, ERROR)) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %s", backup_label,
                                                          backup_label_done, TRANSLATE_ERRNO)));
    }

    gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
    return stoppoint;
}
void check_roach_backup_condition(bool backup_started_in_recovery, const char *backupidstr)
{
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be system admin or replication role or "
"operator admin in operation mode to run a roach backup")));
}
if (backup_started_in_recovery) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("roach backup cannot be executed during recovery.")));
}
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online roach backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
if (strlen(backupidstr) >= NAMEDATALEN) {
ereport(ERROR,
(errcode(ERRCODE_NAME_TOO_LONG), errmsg("roach backup id name \"%s\" is too long", backupidstr)));
}
if (strncmp(backupidstr, "gs_roach", strlen("gs_roach")) == 0 &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("Backup id name starting with gs_roach is internally reserverd")));
}
}
*Error cleanup callback for exclusive backup
*/
static void AbortExcluseBackupCallback(int code, Datum arg)
{
if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_EXCLUSIVE) {
return;
}
int32 lastlrc = 0;
StartSuspendWalInsert(&lastlrc);
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
StopSuspendWalInsert(lastlrc);
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
(void)unlink(BACKUP_LABEL_FILE);
}
}
*Register a handler that will warn about unterminated exclusive backups
*at end of session, unless this has already been done.
*/
/*
 * RegisterAbortExclusiveBackup
 *
 * Install AbortExcluseBackupCallback as a shared-memory-exit callback, at
 * most once per session; the session flag records that it has been installed.
 */
void RegisterAbortExclusiveBackup()
{
    if (!u_sess->proc_cxt.registerExclusiveHandlerdone) {
        on_shmem_exit(AbortExcluseBackupCallback, BoolGetDatum(true));
        u_sess->proc_cxt.registerExclusiveHandlerdone = true;
    }
}
void enable_delay_xlog_recycle(bool isRedo)
{
FILE *fp = NULL;
ereport(LOG, (errmsg("start delaying xlog recycle so that checkpoint "
"no longer removes stale xlog segments")));
fp = AllocateFile(DELAY_XLOG_RECYCLE_FILE, PG_BINARY_W);
if (fp == NULL || fflush(fp) != 0 || pg_fsync(fileno(fp)) != 0 || ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("failed to touch %s file during enable xlog delay: %s",
DELAY_XLOG_RECYCLE_FILE, TRANSLATE_ERRNO)));
}
SetDelayXlogRecycle(true, isRedo);
* acquire and release RowPageReplicationLock to ensure that
* concurrent heap_multi_insert will no longer use page replication after
* we return
*/
LWLockAcquire(RowPageReplicationLock, LW_EXCLUSIVE);
LWLockRelease(RowPageReplicationLock);
}
void disable_delay_xlog_recycle(bool isRedo)
{
ereport(LOG, (errmsg("stop delaying xlog recycle and consequent checkpoint "
"will remove stale xlog segments")));
if (unlink(DELAY_XLOG_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %s. This will lead to residual of "
"stale xlog segments",
DELAY_XLOG_RECYCLE_FILE, TRANSLATE_ERRNO)));
}
SetDelayXlogRecycle(false, isRedo);
}
void startupInitDelayXlog(void)
{
struct stat st;
if (!lstat(DELAY_XLOG_RECYCLE_FILE, &st)) {
ereport(LOG, (errmsg("xlog delay function was enabled at previous database shutdown")));
* Do not turn on xlog delay function if we are recovering from archive or
* if we are standby. In these cases, our xlog segments will be modified unexpectedly and
* it is meaningless to keep them for back up.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
disable_delay_xlog_recycle(true);
} else {
SetDelayXlogRecycle(true, true);
}
} else if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("failed to stat %s file:%s",
DELAY_XLOG_RECYCLE_FILE, TRANSLATE_ERRNO)));
}
}
/*
 * getLastRewindTime
 *
 * Return the first line of REWIND_LABLE_FILE.
 *
 * Returns a palloc'd copy of the line exactly as fgets() delivered it (at
 * most MAXPGPATH - 1 characters), or the string literal "" when the file does
 * not exist or no line could be read. The "" result is not heap memory, so it
 * must be neither freed nor modified.
 *
 * Raises ERROR if the file exists but cannot be examined or opened. A failure
 * to close the file is only logged.
 */
char *getLastRewindTime()
{
    char buf[MAXPGPATH] = {0};
    char *retStr = NULL;
    size_t len;
    errno_t ret;
    FILE *fd = NULL;
    struct stat st;

    if (stat(REWIND_LABLE_FILE, &st)) {
        if (errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not stat file \"%s\": %s", REWIND_LABLE_FILE, TRANSLATE_ERRNO)));
        }
        return "";
    }

    fd = fopen(REWIND_LABLE_FILE, "r");
    if (fd == NULL) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not open rewind file: %s\n", gs_strerror(errno))));
    }

    if (fgets(buf, lengthof(buf), fd) == NULL) {
        if (fclose(fd)) {
            ereport(LOG, (errcode_for_file_access(),
                          errmsg("get null bits, could not close file \"%s\": %s", REWIND_LABLE_FILE, TRANSLATE_ERRNO)));
        }
        return "";
    }

    /*
     * Close the stream before allocating: the handle comes from plain
     * fopen(), so it would be leaked if the allocation or the secure-copy
     * check below raised an error while it was still open.
     */
    if (fclose(fd)) {
        ereport(LOG, (errcode_for_file_access(),
                      errmsg("could not close file \"%s\": %s", REWIND_LABLE_FILE, TRANSLATE_ERRNO)));
    }

    len = strlen(buf);
    retStr = (char *)palloc0(sizeof(char) * (len + 1));
    ret = snprintf_s(retStr, len + 1, len, "%s", buf);
    securec_check_ss(ret, "\0", "\0");

    return retStr;
}
/*
 * enable_delay_ddl_recycle
 *
 * Start delaying DDL recycle. The start LSN of the delay is the end of the
 * current WAL insert position (flushed, and advanced by
 * advanceXlogPtrToNextPageIfNeeded()), unless a still-open delay recorded in
 * DELAY_DDL_RECYCLE_FILE can be reused. The start LSN is written to the delay
 * file when it is not reused, published through SetDDLDelayStartPtr(), and
 * CBM tracking is forced once for it.
 *
 * Returns the start LSN in effect. Raises ERROR when enable_cbm_tracking is
 * off or when the delay file cannot be removed or written.
 *
 * NOTE(review): CBMParseXlogLock is acquired here but not released in this
 * function; presumably ForceTrackCBMOnce() releases it, since it is told the
 * lock is held -- confirm against its definition.
 */
XLogRecPtr enable_delay_ddl_recycle(void)
{
    XLogCtlInsert *insertCtl = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
    DelayDDLRange oldRange = InvalidDelayRange;
    DelayDDLRange newRange = InvalidDelayRange;
    XLogRecPtr sharedStartPtr = InvalidXLogRecPtr;
    bool reuseOldFile = false;

    if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                        errmsg("could not enable delay ddl when enable_cbm_tracking is off!")));
    }

    ereport(LOG, (errmsg("start delaying ddl recycle so that column relation files "
                         "will remain even after they are logically dropped")));

    LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);

    /* A delay file that fails validation is discarded together with the shared start pointer. */
    if (!validate_parse_delay_ddl_file(&oldRange)) {
        if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        }
        SetDDLDelayStartPtr(InvalidXLogRecPtr);
    }

    sharedStartPtr = GetDDLDelayStartPtr();

    /* A file describing a range with start before end is dropped and forgotten as well. */
    if (!XLogRecPtrIsInvalid(oldRange.startLSN) && XLByteLT(oldRange.startLSN, oldRange.endLSN)) {
        Assert(XLogRecPtrIsInvalid(sharedStartPtr));
        if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        }
        SetDDLDelayStartPtr(InvalidXLogRecPtr);
        oldRange = InvalidDelayRange;
    }

    LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);

    /* Sample the end of the current WAL insert position as the candidate start LSN. */
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
    uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&insertCtl->CurrBytePos);
    newRange.startLSN = XLogBytePosToEndRecPtr(current_bytepos);
#else
    SpinLockAcquire(&insertCtl->insertpos_lck);
    newRange.startLSN = XLogBytePosToEndRecPtr(insertCtl->CurrBytePos);
    SpinLockRelease(&insertCtl->insertpos_lck);
#endif
    XLogWaitFlush(newRange.startLSN);
    advanceXlogPtrToNextPageIfNeeded(&(newRange.startLSN));

    /*
     * Only a previous start LSN that does not exceed the freshly sampled one
     * is of interest; a larger one is meaningless to us. The previous start
     * is reused when the shared start pointer is valid and not beyond it,
     * otherwise the mismatch is reported and the new start LSN is kept.
     */
    bool oldStartRelevant = !XLogRecPtrIsInvalid(oldRange.startLSN) && XLByteLE(oldRange.startLSN, newRange.startLSN);
    if (oldStartRelevant) {
        bool sharedAgrees = !XLogRecPtrIsInvalid(sharedStartPtr) && XLByteLE(sharedStartPtr, oldRange.startLSN);
        if (sharedAgrees) {
            newRange.startLSN = oldRange.startLSN;
            reuseOldFile = true;
        } else {
            ereport(WARNING, (errmsg("unexpected global xlogctl delay start lsn %08X/%08X "
                                     "and previous delay start lsn %08X/%08X ",
                                     (uint32)(sharedStartPtr >> 32), (uint32)(sharedStartPtr),
                                     (uint32)(oldRange.startLSN >> 32), (uint32)(oldRange.startLSN))));
        }
    }

    /* Persist the new range unless the existing file already describes it. */
    if (!reuseOldFile && !write_delay_ddl_file(newRange, true)) {
        ereport(ERROR, (errcode_for_file_access(),
                        errmsg("failed to write %s file during enable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));
    }

    SetDDLDelayStartPtr(newRange.startLSN);

    LWLockRelease(DelayDDLLock);

    ForceTrackCBMOnce(newRange.startLSN, ENABLE_DDL_DELAY_TIMEOUT, true, true);

    return newRange.startLSN;
}
* We will grab at most 4 exclusive LW locks at the same time, so the
* sequence is important and deadlock should be avoided.
*
* If isForce is true, Please do not pass in valid barrierLSN.
* In this case, we will ensure global delay lsn is reset and
* delay file is unlinked by ignoring possible errors, though,
* there may be some residual of dropped column relation files
* if we do encounter errors.
*/
/*
 * disable_delay_ddl_recycle
 *
 * Stop delaying DDL recycle and execute the DDL that was delayed.
 *
 * barrierLSN: optional lower bound; when valid it must not precede the
 *             recorded delay start. Must be invalid when isForce is true.
 * isForce:    when true, validation problems are reported at LOG level
 *             instead of ERROR and the delay state is still reset.
 * startLSN:   out; start of the delay range that was closed, or
 *             InvalidXLogRecPtr when nothing was closed.
 * endLSN:     out; end of the delay range that was closed, or
 *             InvalidXLogRecPtr when nothing was closed.
 *
 * Lock order: DelayDDLLock, CBMParseXlogLock, RelfilenodeReuseLock.
 *
 * NOTE(review): on the main path CBMParseXlogLock is not released in this
 * function; presumably ForceTrackCBMOnce() releases it, since it is told the
 * lock is held -- confirm against its definition.
 */
void disable_delay_ddl_recycle(XLogRecPtr barrierLSN, bool isForce, XLogRecPtr *startLSN, XLogRecPtr *endLSN)
{
    XLogCtlInsert *insertCtl = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
    DelayDDLRange range = InvalidDelayRange;
    XLogRecPtr sharedStartPtr = InvalidXLogRecPtr;
    XLogRecPtr trackedLSN = InvalidXLogRecPtr;

    if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                        errmsg(" could not disable delay ddl when enable_cbm_tracking is off!")));
    }

    *startLSN = InvalidXLogRecPtr;
    *endLSN = InvalidXLogRecPtr;

    ereport(LOG, (errmsg("stop delaying ddl recycle and clear column relation files "
                         "that have been dropped during previous delay period. If any error occurs "
                         "in this progress, there may be residual of physical relation files "
                         "that need to be removed manually")));

    if (isForce && !XLogRecPtrIsInvalid(barrierLSN)) {
        ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("valid input lsn is not supported "
                                                                "while doing forceful disable delay ddl")));
    }

    LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);

    /* An unusable delay file: drop it, reset the shared pointer, then report. */
    if (!validate_parse_delay_ddl_file(&range)) {
        if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        }

        SetDDLDelayStartPtr(InvalidXLogRecPtr);
        LWLockRelease(DelayDDLLock);

        ereport(isForce ? LOG : ERROR,
                (errcode(ERRCODE_CASE_NOT_FOUND),
                 errmsg("failed to validate %s file during disable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));

        return;
    }

    sharedStartPtr = GetDDLDelayStartPtr();

    /*
     * The recorded range must still be open, must start where the shared
     * pointer says it does, and must not start after a valid barrier.
     */
    bool inconsistent = !XLogRecPtrIsInvalid(range.endLSN) || !XLByteEQ(sharedStartPtr, range.startLSN) ||
                        (!XLogRecPtrIsInvalid(barrierLSN) && XLByteLT(barrierLSN, range.startLSN));
    if (inconsistent) {
        ereport(isForce ? LOG : ERROR,
                (errcode(ERRCODE_CASE_NOT_FOUND),
                 errmsg("inconsistent delay lsn range: global start lsn %08X/%08X, "
                        "%s file start lsn %08X/%08X end lsn %08X/%08X, "
                        "barrier lsn %08X/%08X",
                        (uint32)(sharedStartPtr >> 32), (uint32)(sharedStartPtr), DELAY_DDL_RECYCLE_FILE,
                        (uint32)(range.startLSN >> 32), (uint32)(range.startLSN), (uint32)(range.endLSN >> 32),
                        (uint32)(range.endLSN), (uint32)(barrierLSN >> 32), (uint32)(barrierLSN))));

        /* Reached only at LOG level: clean up the delay state before leaving. */
        if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
            ereport(ERROR, (errcode_for_file_access(),
                            errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        }

        SetDDLDelayStartPtr(InvalidXLogRecPtr);
        LWLockRelease(DelayDDLLock);
        return;
    }

    LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);

    /*
     * hold this lock to avoid mis-unlinking column attribute files that
     * have been reused.
     * At present, only alter table add column for col rels will reuse existing files
     * instead of throwing ERROR. If other operations allow such reuse, must
     * also hold this lock in advance!
     */
    LWLockAcquire(RelfilenodeReuseLock, LW_EXCLUSIVE);

    /* The delay ends at the end of the current WAL insert position. */
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
    uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&insertCtl->CurrBytePos);
    range.endLSN = XLogBytePosToEndRecPtr(current_bytepos);
#else
    SpinLockAcquire(&insertCtl->insertpos_lck);
    range.endLSN = XLogBytePosToEndRecPtr(insertCtl->CurrBytePos);
    SpinLockRelease(&insertCtl->insertpos_lck);
#endif
    XLogWaitFlush(range.endLSN);
    advanceXlogPtrToNextPageIfNeeded(&(range.endLSN));

    /* Empty range: no delayed DDL is executed; remove the file and report the range. */
    if (XLByteEQ(range.startLSN, range.endLSN)) {
        if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
            ereport(PANIC, (errcode_for_file_access(),
                            errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        }

        SetDDLDelayStartPtr(InvalidXLogRecPtr);
        LWLockRelease(RelfilenodeReuseLock);
        LWLockRelease(CBMParseXlogLock);
        LWLockRelease(DelayDDLLock);

        *startLSN = range.startLSN;
        *endLSN = range.endLSN;
        return;
    }

    /* Record the end of the range in the delay file. */
    if (!write_delay_ddl_file(range, false)) {
        ereport(ERROR, (errcode_for_file_access(),
                        errmsg("failed to write %s file during disable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));
    }

    /*
     * We do not allow long jump before we finish executing delayed ddl and unlinking the
     * delay flag file, though we can ignore existing finished delay file the next time we enable
     * or disable ddl recycle. Instead, we now take a more aggressive way so that
     * startup can have a chance to clean up the residual column files.
     */
    START_CRIT_SECTION();

    SetDDLDelayStartPtr(InvalidXLogRecPtr);

    LWLockRelease(DelayDDLLock);

    trackedLSN = ForceTrackCBMOnce(range.endLSN, DISABLE_DDL_DELAY_TIMEOUT, true, true);

    /* An invalid result means CBM tracking did not reach the end of the range. */
    bool trackFailed = XLogRecPtrIsInvalid(trackedLSN);
    if (trackFailed) {
        ereport(WARNING, (errmsg("failed to push cbm track point to delay end lsn %08X/%08X, ",
                                 (uint32)(range.endLSN >> 32), (uint32)(range.endLSN))));
    }

    execDelayedDDL(range.startLSN, range.endLSN, trackFailed);

    END_CRIT_SECTION();

    LWLockRelease(RelfilenodeReuseLock);

    *startLSN = range.startLSN;
    *endLSN = range.endLSN;
}
/*
 * enable_delay_ddl_recycle_with_slot
 *
 * Start delaying DDL recycle on behalf of the backup slot slotname. The end
 * of the current WAL insert position (flushed, and advanced by
 * advanceXlogPtrToNextPageIfNeeded()) is handed to
 * slot_advance_confirmed_lsn(); CBM tracking is then forced once for the LSN
 * that call returns.
 *
 * Returns the LSN returned by slot_advance_confirmed_lsn(). Raises ERROR when
 * enable_cbm_tracking is off, or when local WAL insertion is not allowed
 * while a streaming disaster-recovery switchover is in progress.
 *
 * NOTE(review): CBMParseXlogLock is acquired here but not released in this
 * function; presumably ForceTrackCBMOnce() releases it, since it is told the
 * lock is held -- confirm against its definition.
 */
XLogRecPtr enable_delay_ddl_recycle_with_slot(const char *slotname)
{
    XLogCtlInsert *insertCtl = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
    XLogRecPtr candidateStart;
    XLogRecPtr confirmedStart;

    if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                        errmsg("could not enable delay ddl when enable_cbm_tracking is off!")));
    }

    if (t_thrd.xlog_cxt.LocalXLogInsertAllowed == 0 && g_instance.streaming_dr_cxt.isInSwitchover == true) {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                        errmsg("cannot enable delay ddl during streaming disaster recovery")));
    }

    ereport(LOG, (errmsg("start delaying ddl recycle with backup slot %s", slotname)));

    LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
    LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);

    /* Sample the end of the current WAL insert position. */
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
    uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&insertCtl->CurrBytePos);
    candidateStart = XLogBytePosToEndRecPtr(current_bytepos);
#else
    SpinLockAcquire(&insertCtl->insertpos_lck);
    candidateStart = XLogBytePosToEndRecPtr(insertCtl->CurrBytePos);
    SpinLockRelease(&insertCtl->insertpos_lck);
#endif
    XLogWaitFlush(candidateStart);
    advanceXlogPtrToNextPageIfNeeded(&candidateStart);

    confirmedStart = slot_advance_confirmed_lsn(slotname, candidateStart);

    LWLockRelease(DelayDDLLock);

    /*
     * Currently, we do not check pushed cbm track location at delay start.
     * If failed to push to the exact point, stop delay will complain and there will be
     * residual col files.
     * Later, we could retry set and push.
     */
    (void)ForceTrackCBMOnce(confirmedStart, ENABLE_DDL_DELAY_TIMEOUT, true, true);

    return confirmedStart;
}
* We will grab at most 4 exclusive LW locks at the same time, so the
* sequence is important and deadlock should be avoided.
*
* If isForce is true, Please do not pass in valid barrierLSN.
* In this case, we will ensure global delay lsn is reset and
* delay file is unlinked by ignoring possible errors, though,
* there may be some residual of dropped column relation files
* if we do encounter errors.
*/
void disable_delay_ddl_recycle_with_slot(const char *slotname, XLogRecPtr *startLSN, XLogRecPtr *endLSN)
{
XLogRecPtr delay_start_lsn = InvalidXLogRecPtr;
XLogRecPtr delay_end_lsn = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
bool need_recycle = true;
if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(" could not disable delay ddl when enable_cbm_tracking is off!")));
}
if (t_thrd.xlog_cxt.LocalXLogInsertAllowed == 0 && g_instance.streaming_dr_cxt.isInSwitchover == true) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot disable delay ddl during streaming disaster recovery")));
}
ereport(LOG, (errmsg("stop delaying ddl recycle with backup slot %s", slotname)));
LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);
* hold this lock to avoid mis-unlinking column attribute files that
* have been reused.
* At present, only alter table add column for col rels will reuse existing files
* instead of throwing ERROR. If other operations allow such reuse, must
* also hold this lock in advance!
*/
LWLockAcquire(RelfilenodeReuseLock, LW_EXCLUSIVE);
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
delay_end_lsn = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
delay_end_lsn = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
XLogWaitFlush(delay_end_lsn);
advanceXlogPtrToNextPageIfNeeded(&delay_end_lsn);
need_recycle = slot_reset_confirmed_lsn(slotname, &delay_start_lsn);
if (!need_recycle || XLByteEQ(delay_start_lsn, delay_end_lsn)) {
LWLockRelease(RelfilenodeReuseLock);
LWLockRelease(CBMParseXlogLock);
LWLockRelease(DelayDDLLock);
*startLSN = delay_start_lsn;
*endLSN = (need_recycle ? delay_end_lsn : InvalidXLogRecPtr);
return;
}
LWLockRelease(DelayDDLLock);
* Currently not check push result. See comments in enable_delay_ddl_recycle_with_slot.
* There is a very small race condition with enable_delay_ddl_recycle_with_slot for pushing cbm tracked
* position. Currently, we do not address this issue. The worst result is col file residual.
*/
(void)ForceTrackCBMOnce(delay_end_lsn, DISABLE_DDL_DELAY_TIMEOUT, true, true);
execDelayedDDL(delay_start_lsn, delay_end_lsn, true);
LWLockRelease(RelfilenodeReuseLock);
*startLSN = delay_start_lsn;
*endLSN = delay_end_lsn;
}
* If system previously crashed while disabling delay ddl, we will try
* to finish that in startup.
* However, if cbm has already tracked far beyond delay end point, then we
* could do nothing except leaving residual files of dropped col tables. The chance
* is low and the result is safe, anyway.
*/
void startupInitDelayDDL(void)
{
DelayDDLRange delayRange = InvalidDelayRange;
if (!validate_parse_delay_ddl_file(&delayRange) || !u_sess->attr.attr_storage.enable_cbm_tracking) {
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %s",
DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
}
} else {
Assert(!XLogRecPtrIsInvalid(delayRange.startLSN));
* Do not turn on ddl delay function if we are recovering from archive or
* if we are standby. In these cases, our xlog segments and data pages
* might be modified unexpectedly and it is meaningless to keep them for back up.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
ereport(LOG, (errmsg("Have to turn off ddl delay (start lsn %08X/%08X "
"to end lsn %08X/%08X) before entering archive recovery "
"or standby mode. There may be residual of column data files. "
"Please refer to previous log to clean them manually if necessary.",
(uint32)(delayRange.startLSN >> 32), (uint32)(delayRange.startLSN),
(uint32)(delayRange.endLSN >> 32), (uint32)(delayRange.endLSN))));
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not remove %s file: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
}
} else if (XLogRecPtrIsInvalid(delayRange.endLSN)) {
ereport(LOG, (errmsg("ddl delay was enabled last time from start lsn %08X/%08X",
(uint32)(delayRange.startLSN >> 32), (uint32)(delayRange.startLSN))));
SetDDLDelayStartPtr(delayRange.startLSN);
} else {
ForceTrackCBMOnce(delayRange.endLSN, 0, false, false);
execDelayedDDL(delayRange.startLSN, delayRange.endLSN, true);
}
}
}
/*
 * Read DELAY_DDL_RECYCLE_FILE and check that the range stored in it is sane.
 *
 * A range is accepted when its start LSN is valid and its end LSN is either
 * invalid or strictly greater than the start LSN.
 *
 * delayRange: filled with the parsed range only when true is returned.
 *
 * Returns true when the file exists, can be read in full and holds an
 * acceptable range; false otherwise (file missing, cannot be opened, short
 * read, or unexpected LSN values).
 */
bool validate_parse_delay_ddl_file(DelayDDLRange *delayRange)
{
    struct stat fileStat;
    DelayDDLRange parsed = InvalidDelayRange;
    bool valid = false;

    if (stat(DELAY_DDL_RECYCLE_FILE, &fileStat) == -1 && errno == ENOENT) {
        ereport(DEBUG5, (errmsg("%s file does not exist", DELAY_DDL_RECYCLE_FILE)));
        return false;
    }

    FILE *fp = AllocateFile(DELAY_DDL_RECYCLE_FILE, PG_BINARY_R);
    if (SECUREC_UNLIKELY(fp == NULL)) {
        ereport(LOG, (errcode_for_file_access(),
                      errmsg("could not open existing file \"%s\": %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        return false;
    }

    if (fread(&parsed, 1, sizeof(DelayDDLRange), fp) != sizeof(DelayDDLRange)) {
        ereport(WARNING, (errmsg("%s file is corrupted. We will zap it anyway", DELAY_DDL_RECYCLE_FILE)));
    } else if (XLogRecPtrIsInvalid(parsed.startLSN) ||
               (!XLogRecPtrIsInvalid(parsed.endLSN) && !XLByteLT(parsed.startLSN, parsed.endLSN))) {
        /* Start must be valid; end must be unset or beyond start. */
        ereport(WARNING, (errmsg("%s file has unexpected start lsn %08X/%08X and end lsn "
                                 "%08X/%08X. While there may be residual of dropped column files, "
                                 "We will zap it anyway",
                                 DELAY_DDL_RECYCLE_FILE, (uint32)(parsed.startLSN >> 32), (uint32)(parsed.startLSN),
                                 (uint32)(parsed.endLSN >> 32), (uint32)(parsed.endLSN))));
    } else {
        ereport(LOG, (errmsg("find existing %s file (start lsn %08X/%08X, end lsn %08X/%08X)", DELAY_DDL_RECYCLE_FILE,
                             (uint32)(parsed.startLSN >> 32), (uint32)(parsed.startLSN),
                             (uint32)(parsed.endLSN >> 32), (uint32)(parsed.endLSN))));

        errno_t rc = memcpy_s(delayRange, sizeof(DelayDDLRange), &parsed, sizeof(DelayDDLRange));
        securec_check(rc, "\0", "\0");
        valid = true;
    }

    /* The file is closed on every path that managed to open it. */
    if (FreeFile(fp)) {
        ereport(LOG, (errcode_for_file_access(),
                      errmsg("could not close file \"%s\": %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
    }
    fp = NULL;

    return valid;
}
/*
 * Write the given delay range into DELAY_DDL_RECYCLE_FILE and fsync it.
 *
 * delayRange:  range to persist.
 * onErrDelete: when true, a write failure is reported at LOG level and the
 *              (possibly partial) file is unlinked; when false, a write
 *              failure is reported at PANIC level.
 *
 * Returns true when the file was written, flushed, synced and closed without
 * error; false when the file could not be opened or (with onErrDelete) when
 * writing failed.
 */
bool write_delay_ddl_file(const DelayDDLRange &delayRange, bool onErrDelete)
{
    FILE *fp = AllocateFile(DELAY_DDL_RECYCLE_FILE, PG_BINARY_W);
    if (fp == NULL) {
        ereport(LOG, (errcode_for_file_access(),
                      errmsg("could not open file \"%s\" for write: %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
        return false;
    }

    bool failed = fwrite(&delayRange, 1, sizeof(DelayDDLRange), fp) != sizeof(DelayDDLRange) || fflush(fp) != 0 ||
                  pg_fsync(fileno(fp)) != 0 || ferror(fp);
    int saveErrno = errno;

    /*
     * Always close the stream, even after a write failure; otherwise the
     * allocated file would stay open when we return false below. Keep the
     * errno of the first failure for the report.
     */
    if (FreeFile(fp) != 0 && !failed) {
        failed = true;
        saveErrno = errno;
    }
    fp = NULL;

    if (!failed) {
        return true;
    }

    /*
     * PANIC may be unnecessary here, because if we failed to write delay file during
     * enable delay and leave unset xlogctl delay start lsn, we would fix it the next time
     * we try to enable delay. To be safe, at present, we use PANIC.
     */
    errno = saveErrno;
    ereport(onErrDelete ? LOG : PANIC,
            (errcode_for_file_access(), errmsg("failed to write file \"%s\": %s", DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));

    if (onErrDelete && unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
        ereport(PANIC, (errcode_for_file_access(),
                        errmsg("could not remove %s file after write failure: %s",
                               DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
    }
    return false;
}
/*
 * Execute the DDL cleanup that was postponed while ddl delay was active:
 * fetch the merged CBM array for [startLSN, endLSN], remove the delay file,
 * then unlink the column data files that were dropped (and not re-created)
 * within that range.
 *
 * startLSN/endLSN: the delay range to process.
 * ignoreCBMErr:    when true, a failure to get CBM information is only
 *                  logged and the function returns after unlinking the delay
 *                  file; when false such a failure is raised as ERROR.
 */
void execDelayedDDL(XLogRecPtr startLSN, XLogRecPtr endLSN, bool ignoreCBMErr)
{
    MemoryContext callerCxt = CurrentMemoryContext;
    MemoryContext parseCxt;
    CBMArray *merged = NULL;
    bool parseFailed = false;
    int heldoffCount;
    long idx;

    parseCxt = AllocSetContextCreate(CurrentMemoryContext, "cbm_parse_context", ALLOCSET_DEFAULT_MINSIZE,
                                     ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);

    LWLockAcquire(CBMParseXlogLock, LW_SHARED);
    heldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;

    PG_TRY();
    {
        (void)MemoryContextSwitchTo(parseCxt);
        merged = CBMGetMergedArray(startLSN, endLSN);
    }
    PG_CATCH();
    {
        /* Restore the state saved before the try block and capture the error. */
        t_thrd.int_cxt.InterruptHoldoffCount = heldoffCount;
        (void)MemoryContextSwitchTo(callerCxt);
        ErrorData *errInfo = CopyErrorData();
        FlushErrorState();
        MemoryContextDelete(parseCxt);
        parseCxt = NULL;

        if (ignoreCBMErr) {
            /* The lock stays held here; it is released after PG_END_TRY. */
            ereport(LOG, (errmsg("Failed to get cbm information during execute delayed DDL: %s", errInfo->message)));
        } else {
            /* ERROR does not return, so release the lock before raising it. */
            LWLockRelease(CBMParseXlogLock);
            ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
                            errmsg("Failed to get cbm information during execute delayed DDL: %s", errInfo->message)));
        }
        parseFailed = true;
    }
    PG_END_TRY();

    LWLockRelease(CBMParseXlogLock);

    /*
     * step1. remove delay file
     * step2. remove dropped column file
     *
     * We use this sequence to avoid unlink newly created relations that
     * reuse the relfilenodes.
     * If we fail in step2, there will be residual of
     * column files.
     *
     * If caller told us to ignore cbm parse failue, just return after
     * unlinking the delay file.
     */
    if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not remove %s file before unlink col relation files: %s",
                        DELAY_DDL_RECYCLE_FILE, TRANSLATE_ERRNO)));
    }

    if (parseFailed) {
        return;
    }

    bool rangeMatches = XLByteEQ(merged->startLSN, startLSN) && XLByteEQ(merged->endLSN, endLSN);
    if (rangeMatches) {
        for (idx = 0; idx < merged->arrayLength; idx++) {
            /*
             * For row relations, relfilenode can not be reused if the physical file has not
             * been unlinked. However, for column relations, add column can reuse
             * the file that a previously crashed add-column-xact leaved. Such operations
             * must hold RelfilenodeReuseLock lock to avoid concurrency with execDelayedDDL.
             */
            if (!(merged->arrayEntry[idx].changeType & PAGETYPE_DROP) ||
                (merged->arrayEntry[idx].changeType & PAGETYPE_CREATE) || (merged->arrayEntry[idx].totalBlockNum) ||
                !IsValidColForkNum(merged->arrayEntry[idx].cbmTag.forkNum)) {
                continue;
            }

            CStore::UnlinkColDataFile(merged->arrayEntry[idx].cbmTag.rNode,
                                      ColForkNum2ColumnId(merged->arrayEntry[idx].cbmTag.forkNum), false);
        }
    } else {
        ereport(WARNING, (errmsg("delay start-end %08X/%08X-%08X/%08X is inconsistent with cbm tracking "
                                 "result start-end %08X/%08X-%08X/%08X. To be safe, while there may be residual "
                                 "of dropped column files, We will leave them anyway",
                                 (uint32)(startLSN >> 32), (uint32)(startLSN), (uint32)(endLSN >> 32), (uint32)(endLSN),
                                 (uint32)(merged->startLSN >> 32), (uint32)(merged->startLSN),
                                 (uint32)(merged->endLSN >> 32), (uint32)(merged->endLSN))));
    }

    FreeCBMArray(merged);
    merged = NULL;
    (void)MemoryContextSwitchTo(callerCxt);
    MemoryContextDelete(parseCxt);
}
/*
 * Measure redo throughput since the last sample and publish it.
 *
 * A new sample is taken once at least REDO_SPEED_LOG_LEN bytes were replayed
 * since the previous sample point, or once some bytes were replayed and at
 * least MAX_OUT_INTERVAL microseconds have passed. The sample is stored in
 * redoPf.speed_according_seg, passed to redo_refresh_stats(), and logged when
 * recovery_done_ptr is still 0 or MOD_REDO logging is on.
 *
 * readPtr: start of the record just replayed (only used in the log line).
 * endPtr:  end of the record just replayed.
 */
void RedoSpeedDiag(XLogRecPtr readPtr, XLogRecPtr endPtr)
{
    /* First call: initialise the sample point. */
    if (g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr == 0) {
        g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = endPtr;
        INSTR_TIME_SET_CURRENT(g_instance.comm_cxt.predo_cxt.redoPf.preTime);
    }

    instr_time now;
    INSTR_TIME_SET_CURRENT(now);
    instr_time elapsed = now;
    INSTR_TIME_SUBTRACT(elapsed, g_instance.comm_cxt.predo_cxt.redoPf.preTime);
    uint64 elapsedUs = INSTR_TIME_GET_MICROSEC(elapsed);
    uint64 redoBytes = (uint64)(endPtr - g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr);

    bool enoughBytes = (redoBytes >= REDO_SPEED_LOG_LEN);
    bool enoughTime = (redoBytes > 0 && elapsedUs >= MAX_OUT_INTERVAL);
    if (!enoughBytes && !enoughTime) {
        return;
    }

    uint64 bytesPerSec = 0;
    uint64 kbytesPerSec = 0;
    if (elapsedUs > 0) {
        bytesPerSec = redoBytes * US_TRANSFER_TO_S / elapsedUs;
        kbytesPerSec = bytesPerSec / BYTES_TRANSFER_KBYTES;
    }
    g_instance.comm_cxt.predo_cxt.redoPf.speed_according_seg = static_cast<uint32>(bytesPerSec);

    if (g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr == 0 || module_logging_is_on(MOD_REDO)) {
        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                      errmsg("RedoSpeedDiag: %lu us, redoBytes:%lu, "
                             "preEndPtr:%X/%X, readPtr:%X/%X, endPtr:%X/%X, "
                             "speed:%lu KB/s, totalTime:%lu",
                             INSTR_TIME_GET_MICROSEC(elapsed), redoBytes,
                             (uint32)(g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr >> 32),
                             (uint32)(g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr),
                             (uint32)(readPtr >> 32), (uint32)(readPtr),
                             (uint32)(endPtr >> 32), (uint32)(endPtr),
                             kbytesPerSec, elapsedUs)));
    }

    redo_refresh_stats(bytesPerSec);

    /* Start the next measuring interval from here. */
    g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = endPtr;
    g_instance.comm_cxt.predo_cxt.redoPf.preTime = now;
}
/*
 * Store need_reply in the shared XLogCtl flag walrcv_reply_dueto_commit.
 * The flag is written without taking any lock.
 */
void set_walrcv_reply_dueto_commit(bool need_reply)
{
    t_thrd.shemem_ptr_cxt.XLogCtl->walrcv_reply_dueto_commit = need_reply;
}
/*
 * Return the current value of the shared XLogCtl flag
 * walrcv_reply_dueto_commit. The flag is read without taking any lock.
 */
bool get_walrcv_reply_dueto_commit(void)
{
    return t_thrd.shemem_ptr_cxt.XLogCtl->walrcv_reply_dueto_commit;
}
* Get latest redo apply position.
*
* Optionally, returns the current recovery target timeline. Callers not
* interested in that may pass NULL for targetTLI.
*
* Exported to allow WALReceiver to read the pointer directly.
*/
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *targetTLI, XLogRecPtr *ReplayReadPtr)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
if (!targetTLI && !ReplayReadPtr)
recptr = pg_atomic_read_u64((volatile uint64*)&xlogctl->lastReplayedEndRecPtr);
else {
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastReplayedEndRecPtr;
if (targetTLI != NULL) {
*targetTLI = xlogctl->RecoveryTargetTLI;
}
if (ReplayReadPtr != NULL) {
*ReplayReadPtr = xlogctl->lastReplayedReadRecPtr;
}
SpinLockRelease(&xlogctl->info_lck);
}
return recptr;
}
/*
 * Advance the shared "last replayed" read/end pointers.
 *
 * The pointers are only moved when the stored lastReplayedReadRecPtr is
 * behind endRecPtr. When they are moved, redo speed statistics and the dirty
 * page queue rec lsn are refreshed as well. In DCF mode (single-node builds
 * only) the election priority is updated with endRecPtr on every call.
 */
void SetXLogReplayRecPtr(XLogRecPtr readRecPtr, XLogRecPtr endRecPtr)
{
    XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    bool advanced = false;

    SpinLockAcquire(&ctl->info_lck);
    if (XLByteLT(ctl->lastReplayedReadRecPtr, endRecPtr)) {
        ctl->lastReplayedReadRecPtr = readRecPtr;
        ctl->lastReplayedEndRecPtr = endRecPtr;
        g_instance.comm_cxt.predo_cxt.redoPf.last_replayed_end_ptr = endRecPtr;
        advanced = true;
    }
    SpinLockRelease(&ctl->info_lck);

    /* Follow-up work is done outside the spinlock. */
    if (advanced) {
        RedoSpeedDiag(readRecPtr, endRecPtr);
        update_dirty_page_queue_rec_lsn(readRecPtr);
    }

#ifndef ENABLE_MULTIPLE_NODES
    if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
        int rc = dcf_set_election_priority(1, endRecPtr);
        if (rc != 0) {
            ereport(WARNING, (errmsg("In dcf mode, could not update current replay location:%08X/%08X",
                                     (uint32)(endRecPtr >> 32), (uint32)(endRecPtr))));
        }
    }
#endif
}
void DumpXlogCtl()
{
ereport(LOG,
(errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[REDO_LOG_TRACE]txn : lastReplayedReadRecPtr %lu, lastReplayedEndRecPtr %lu,"
" replayEndRecPtr %lu, lastCheckPointRecPtr %lu, lastCheckPoint.redo %lu",
t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedReadRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->replayEndRecPtr, t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPointRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPoint.redo)));
}
* This function is used for Cluster Manager(CM), when recovery has done
* it return the current replay record, if recovery is doing, return
* InvalidXLogRecPtr, so that CM will not use the Replay LSN to arbitrate.
*/
XLogRecPtr GetXLogReplayRecPtrInPending(void)
{
HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
if (PENDING_MODE == hashmdata->current_mode) {
bool localrecoverydone = false;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
localrecoverydone = xlogctl->IsRecoveryDone;
SpinLockRelease(&xlogctl->info_lck);
if (localrecoverydone == false) {
return InvalidXLogRecPtr;
}
}
return GetXLogReplayRecPtr(NULL);
}
* Get current standby flush position, ie, the last WAL position
* known to be fsync'd to disk in standby.
*
* If 'targetTLI' is not NULL, it's set to the current recovery target
* timeline.
*/
XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *targetTLI)
{
XLogRecPtr receivePtr;
XLogRecPtr replayPtr;
receivePtr = GetWalRcvWriteRecPtr(NULL);
replayPtr = GetXLogReplayRecPtr(targetTLI);
if (XLByteLT(receivePtr, replayPtr)) {
return replayPtr;
} else {
return receivePtr;
}
}
* Get latest WAL insert pointer
*/
XLogRecPtr GetXLogInsertRecPtr(void)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 current_bytepos;
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
current_bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
#endif
return XLogBytePosToRecPtr(current_bytepos);
}
* Get latest WAL insert pointer.
* Like GetXLogInsertRecPtr, but if the position is at a page boundary,
* returns a pointer to the beginning of the page (ie. before page header),
* not to where the first xlog record on that page would go to.
*/
XLogRecPtr GetXLogInsertEndRecPtr(void)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 current_bytepos;
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
current_bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
#endif
return XLogBytePosToEndRecPtr(current_bytepos);
}
* Get latest WAL write pointer
*/
XLogRecPtr GetXLogWriteRecPtr(void)
{
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtResult = xlogctl->LogwrtResult;
SpinLockRelease(&xlogctl->info_lck);
}
return t_thrd.xlog_cxt.LogwrtResult->Write;
}
#ifndef ENABLE_MULTIPLE_NODES
* Get latest paxos write pointer
*/
XLogRecPtr GetPaxosWriteRecPtr(void)
{
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtPaxos = xlogctl->LogwrtPaxos;
SpinLockRelease(&xlogctl->info_lck);
}
return t_thrd.xlog_cxt.LogwrtPaxos->Write;
}
* Get latest paxos commit pointer
*/
XLogRecPtr GetPaxosConsensusRecPtr(void)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtPaxos = xlogctl->LogwrtPaxos;
SpinLockRelease(&xlogctl->info_lck);
return t_thrd.xlog_cxt.LogwrtPaxos->Consensus;
}
#endif
* read_backup_label: check to see if a backup_label file is present
*
* If we see a backup_label during recovery, we assume that we are recovering
* from a backup dump file, and we therefore roll forward from the checkpoint
* identified by the label file, NOT what pg_control says. This avoids the
* problem that pg_control might have been archived one or more checkpoints
* later than the start of the dump, and so if we rely on it as the start
* point, we will fail to restore a consistent database state.
*
* Returns TRUE if a backup_label was found (and fills the checkpoint
* location and its REDO location into *checkPointLoc and RedoStartLSN,
* respectively); returns FALSE if not. If this backup_label came from a
* streamed backup, *backupEndRequired is set to TRUE. If this backup_label
* was created during recovery, *backupFromStandby is set to TRUE. If this backup_label
* was created during primary mode by gs_roach, *backupFromRoach is set to TRUE.
*/
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby,
bool *backupFromRoach)
{
char startxlogfilename[MAXFNAMELEN];
TimeLineID tli;
FILE *lfp = NULL;
char ch;
char backuptype[20];
char backupfrom[20];
uint32 hi, lo;
*backupEndRequired = false;
*backupFromStandby = false;
* See if label file is present
*/
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (lfp == NULL) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not read file \"%s\": %s", BACKUP_LABEL_FILE, TRANSLATE_ERRNO)));
}
return false;
}
* Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
* is pretty crude, but we are not expecting any variability in the file
* format).
*/
if (fscanf_s(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", &hi, &lo, &tli, startxlogfilename,
sizeof(startxlogfilename), &ch, 1) != 5 ||
ch != '\n') {
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
t_thrd.xlog_cxt.RedoStartLSN = (((uint64)hi) << 32) | lo;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
if (fscanf_s(lfp, "CHECKPOINT LOCATION: %X/%X%c", &hi, &lo, &ch, 1) != 3 || ch != '\n') {
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
*checkPointLoc = (((uint64)hi) << 32) | lo;
* BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
* from an older backup anyway, but since the information on it is not
* strictly required, don't error out if it's missing for some reason.
*/
if (fscanf_s(lfp, "BACKUP METHOD: %19s\n", backuptype, sizeof(backuptype)) == 1) {
if (strcmp(backuptype, "streamed") == 0) {
*backupEndRequired = true;
}
}
if (fscanf_s(lfp, "BACKUP FROM: %19s\n", backupfrom, sizeof(backupfrom)) == 1) {
if (strcmp(backupfrom, "standby") == 0) {
*backupFromStandby = true;
} else if (strcmp(backupfrom, "roach_master") == 0) {
*backupFromRoach = true;
}
}
if (ferror(lfp) || FreeFile(lfp)) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not read file \"%s\": %s", BACKUP_LABEL_FILE, TRANSLATE_ERRNO)));
}
return true;
}
* read_tablespace_map: check to see if a tablespace_map file is present
*
* If we see a tablespace_map file during recovery, we assume that we are
* recovering from a backup dump file, and we therefore need to create symlinks
* as per the information present in tablespace_map file.
*
* Returns TRUE if a tablespace_map file was found (and fills the link
* information for all the tablespace links present in file); returns FALSE
* if not.
*/
static bool read_tablespace_map(List **tablespaces)
{
tablespaceinfo *ti;
FILE *lfp;
char tbsoid[MAXPGPATH];
char *tbslinkpath;
char str[MAXPGPATH];
int ch, prevCh = -1, i = 0, n;
* See if tablespace_map file is present
*/
lfp = AllocateFile(TABLESPACE_MAP, "r");
if (!lfp) {
if (errno != ENOENT)
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not read file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
return false;
}
* Read and parse the link name and path lines from tablespace_map file
* (this code is pretty crude, but we are not expecting any variability in
* the file format). While taking backup we embed escape character '\\'
* before newline in tablespace path, so that during reading of
* tablespace_map file, we could distinguish newline in tablespace path
* and end of line. Now while reading tablespace_map file, remove the
* escape character that has been added in tablespace path during backup.
*/
while ((ch = fgetc(lfp)) != EOF) {
if ((ch == '\n' || ch == '\r') && prevCh != '\\') {
str[i] = '\0';
if (sscanf_s(str, "%s %n", tbsoid, MAXPGPATH, &n) != 1)
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
tbslinkpath = str + n;
i = 0;
ti = (tablespaceinfo *)palloc(sizeof(tablespaceinfo));
ti->oid = pstrdup(tbsoid);
ti->path = pstrdup(tbslinkpath);
*tablespaces = lappend(*tablespaces, ti);
continue;
} else if ((ch == '\n' || ch == '\r') && prevCh == '\\')
str[i - 1] = ch;
else
str[i++] = ch;
prevCh = ch;
}
if (ferror(lfp) || FreeFile(lfp))
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not read file \"%s\": %s", TABLESPACE_MAP, TRANSLATE_ERRNO)));
return true;
}
void rm_redo_error_callback(void *arg)
{
if (arg == NULL) {
return;
}
XLogReaderState *record = (XLogReaderState *)arg;
StringInfoData buf;
initStringInfo(&buf);
RmgrTable[XLogRecGetRmid(record)].rm_desc(&buf, record);
errcontext("xlog redo lsn %X/%X, %s",
(uint32)(record->EndRecPtr >> XLOG_LSN_SWAP), (uint32)record->EndRecPtr, buf.data);
pfree_ext(buf.data);
}
* BackupInProgress: check if online backup mode is active
*
* This is done by checking for existence of the "backup_label" file.
*/
bool BackupInProgress(void)
{
struct stat stat_buf;
return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}
* CancelBackup: rename the "backup_label" file to cancel backup mode
*
* If the "backup_label" file exists, it will be renamed to "backup_label.old".
* Note that this will render an online backup in progress useless.
* To correctly finish an online backup, pg_stop_backup must be called.
*/
void CancelBackup(void)
{
struct stat stat_buf;
if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0) {
return;
}
unlink(BACKUP_LABEL_OLD);
if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) == 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("\"%s\" was renamed to \"%s\".", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
if (stat(TABLESPACE_MAP, &stat_buf) < 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("File \"%s\" was renamed to \"%s\".", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("Files \"%s\" and \"%s\" were renamed to "
"\"%s\" and \"%s\", respectively.",
BACKUP_LABEL_FILE, TABLESPACE_MAP, BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
} else {
ereport(WARNING, (errcode_for_file_access(), errmsg("online backup mode was not canceled"),
errdetail("Could not rename \"%s\" to \"%s\": %m.", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}
}
static bool IsRedoDonePromoting(void)
{
bool result = false;
XLogRecPtr redoPos = InvalidXLogRecPtr;
XLogRecPtr receivePos = InvalidXLogRecPtr;
if (CheckForFailoverTrigger()) {
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
redoPos = walrcv->receiver_replay_location;
receivePos = walrcv->receivedUpto;
SpinLockRelease(&walrcv->mutex);
result = XLByteEQ(redoPos, receivePos) ? true : false;
}
return result;
}
/*
 * Read WAL bytes starting at targetStartPtr from the segment file on disk
 * into readBuf, using the thread-local readFile/readSegNo/readOff state.
 *
 * targetStartPtr: WAL position to start reading at.
 * reqLen:         number of bytes wanted; clamped so the read does not cross
 *                 the end of the segment.
 * readBuf:        destination buffer.
 * rereadlen:      out; number of bytes actually placed into readBuf.
 *
 * Returns true on success. Failure to open, seek or read the segment is
 * raised as ERROR (after closing the file for seek/read failures).
 */
bool XLogReadFromWriteBuffer(XLogRecPtr targetStartPtr, int reqLen, char *readBuf, uint32 *rereadlen)
{
    uint32 startoff = targetStartPtr % XLogSegSize;

    /* Switch to the segment holding targetStartPtr if it is not open yet. */
    if (t_thrd.xlog_cxt.readFile < 0 || !XLByteInSeg(targetStartPtr, t_thrd.xlog_cxt.readSegNo)) {
        char path[MAXPGPATH];

        if (t_thrd.xlog_cxt.readFile >= 0) {
            (void)close(t_thrd.xlog_cxt.readFile);
        }

        XLByteToSeg(targetStartPtr, t_thrd.xlog_cxt.readSegNo);
        XLogFilePath(path, MAXPGPATH, t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo);

        t_thrd.xlog_cxt.readFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
        if (t_thrd.xlog_cxt.readFile < 0) {
            /*
             * If the file is not found, assume it's because the standby
             * asked for a too old WAL segment that has already been
             * removed or recycled.
             */
            if (FILE_POSSIBLY_DELETED(errno)) {
                ereport(ERROR, (errcode_for_file_access(),
                                errmsg("requested WAL segment %s has already been removed",
                                       XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo))));
            } else {
                ereport(ERROR, (errcode_for_file_access(),
                                errmsg("could not open file \"%s\" (log segment %s): %s", path,
                                       XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
                                       TRANSLATE_ERRNO)));
            }
        }
        t_thrd.xlog_cxt.readOff = 0;
    }

    /* Need to seek in the file? */
    if (startoff != t_thrd.xlog_cxt.readOff) {
        if (lseek(t_thrd.xlog_cxt.readFile, (off_t)startoff, SEEK_SET) < 0) {
            (void)close(t_thrd.xlog_cxt.readFile);
            t_thrd.xlog_cxt.readFile = -1;
            ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log segment %s to offset %u: %s",
                XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), startoff, TRANSLATE_ERRNO)));
        }
        t_thrd.xlog_cxt.readOff = startoff;
    }

    /* Never read across the end of the segment. */
    if (reqLen > (int)(XLogSegSize - startoff)) {
        reqLen = (int)(XLogSegSize - startoff);
    }

    int readbytes = read(t_thrd.xlog_cxt.readFile, readBuf, reqLen);
    if (readbytes <= 0) {
        (void)close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from log segment %s, offset %u, length %lu: %s",
                        XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
                        t_thrd.xlog_cxt.readOff, INT2ULONG(reqLen), TRANSLATE_ERRNO)));
    }

    /*
     * read() may return fewer bytes than requested. Account for what was
     * really read, so the cached offset stays in step with the file position
     * and the caller is not told that unread buffer space holds WAL data.
     */
    t_thrd.xlog_cxt.readOff += readbytes;
    *rereadlen = readbytes;
    return true;
}
/*
 * Tell whether WAL beyond expectedRecPtr has already been received.
 *
 * The thread-local receivedUpto is checked first; only when it is not ahead
 * of expectedRecPtr is it refreshed from the walreceiver. When the refreshed
 * value shows new data and expectedRecPtr is not before the start of the
 * latest chunk, the xlog receipt time and current chunk start time are set to
 * now.
 */
bool NewDataIsInBuf(XLogRecPtr expectedRecPtr)
{
    /* Fast path: the cached position already covers the request. */
    if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
        return true;
    }

    XLogRecPtr latestChunkStart;
    t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);

    if (!XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
        return false;
    }

    if (!XLByteLT(expectedRecPtr, latestChunkStart)) {
        t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
        SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
    }
    return true;
}
/*
 * Choose the replication connection target for this thread: a standby when
 * is_cascade_standby is set, the primary otherwise.
 */
static ReplConnTarget GetRepConntarget(void)
{
    return t_thrd.xlog_cxt.is_cascade_standby ? REPCONNTARGET_STANDBY : REPCONNTARGET_PRIMARY;
}
/*
 * Turn a cascade standby into a regular standby and restart WAL streaming.
 *
 * Does nothing unless this thread is a cascade standby, the server is in
 * STANDBY_MODE and IS_DN_MULTI_STANDYS_MODE() holds. Otherwise it shuts down
 * the walreceiver, resets the received-up-to positions, clears the cascade
 * flag (marking the node as hadr main standby when cross-region), clears
 * switchover/failover triggers, closes the open WAL file, signals the
 * postmaster and requests streaming again starting at recptr.
 */
void HandleCascadeStandbyPromote(XLogRecPtr *recptr)
{
    bool applicable = t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE &&
                      IS_DN_MULTI_STANDYS_MODE();
    if (!applicable) {
        return;
    }

    ShutdownWalRcv();

    /* Forget everything received so far, locally and in shared memory. */
    volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
    t_thrd.xlog_cxt.receivedUpto = 0;
    SpinLockAcquire(&walrcv->mutex);
    walrcv->receivedUpto = 0;
    SpinLockRelease(&walrcv->mutex);

    t_thrd.xlog_cxt.is_cascade_standby = false;
    if (t_thrd.postmaster_cxt.HaShmData->is_cross_region) {
        t_thrd.xlog_cxt.is_hadr_main_standby = true;

        SpinLockAcquire(&t_thrd.postmaster_cxt.HaShmData->mutex);
        t_thrd.postmaster_cxt.HaShmData->is_cascade_standby = false;
        t_thrd.postmaster_cxt.HaShmData->is_hadr_main_standby = true;
        SpinLockRelease(&t_thrd.postmaster_cxt.HaShmData->mutex);
    }

    /* Clear any pending switchover/failover request. */
    t_thrd.xlog_cxt.switchover_triggered = false;
    ResetFailoverTriggered();
    t_thrd.xlog_cxt.failover_triggered = false;

    if (t_thrd.xlog_cxt.readFile >= 0) {
        close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
    }

    SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL);

    /* The target is computed after is_cascade_standby was cleared above. */
    RequestXLogStreaming(recptr, t_thrd.xlog_cxt.PrimaryConnInfo, GetRepConntarget(),
                         u_sess->attr.attr_storage.PrimarySlotName);
}
/*
 * Initialise max_page_flush_lsn from reqLsn the first time it is needed.
 *
 * Only acts when FORCE_FINISH_ENABLED is true and the thread-local
 * max_page_flush_lsn still holds MAX_XLOG_REC_PTR; in that case the local
 * value, the persisted value and the global value are all set to reqLsn.
 */
void CheckMaxPageFlushLSN(XLogRecPtr reqLsn)
{
    if (!FORCE_FINISH_ENABLED || t_thrd.xlog_cxt.max_page_flush_lsn != MAX_XLOG_REC_PTR) {
        return;
    }

    t_thrd.xlog_cxt.max_page_flush_lsn = reqLsn;
    ereport(LOG, (errmsg("update max_page_flush_lsn to %X/%X for standby before request streaming",
                         (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn >> 32),
                         (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn))));

    update_max_page_flush_lsn(reqLsn, t_thrd.proc_cxt.MyProcPid, true);
    set_global_max_page_flush_lsn(reqLsn);
}
* For single-instance cluster, non-pitr roach restore does not have restore target end point and
* recovery.conf file is renamed to indicate restore end when local xlog files have all been replayed.
*/
void rename_recovery_conf_for_roach() {
if (g_instance.role == VSINGLENODE &&
g_instance.roach_cxt.isRoachRestore &&
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET &&
t_thrd.xlog_cxt.isRoachSingleReplayDone == false) {
ereport(LOG, (errmsg("rename recovery.conf to recovery.done for roach")));
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
t_thrd.xlog_cxt.isRoachSingleReplayDone = true;
}
}
/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
* Returns number of bytes read, if the page is read successfully, or -1
* in case of errors. When errors occur, they are ereport'ed, but only
* if they have not been previously reported.
*
* This is responsible for restoring files from archive as needed, as well
* as for waiting for the requested WAL record to arrive in standby mode.
*
* 'emode' specifies the log level used for reporting "file not found" or
* "end of WAL" situations in archive recovery, or in standby mode when a
* trigger file is found. If set to WARNING or below, XLogPageRead() returns
* false in those situations, on higher log levels the ereport() won't
* return.
*
* In standby mode, if after a successful return of XLogPageRead() the
* caller finds the record it's interested in to be broken, it should
* ereport the error with the level determined by
* emode_for_corrupt_record(), and then set "failedSources |= readSource"
* and call XLogPageRead() again with the same arguments. This lets
* XLogPageRead() to try fetching the record from another source, or to
* sleep and retry.
*
* In standby mode, this only returns false if promotion has been triggered.
* Otherwise it keeps sleeping and retrying indefinitely.
*/
int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr,
char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;
int emode = readprivate->emode;
bool randAccess = readprivate->randAccess;
bool fetching_ckpt = readprivate->fetching_ckpt;
uint32 targetPageOff;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr RecPtr = targetPagePtr;
XLogRecPtr UwalPtr = InvalidXLogRecPtr;
bool processtrxn = false;
XLogSegNo replayedSegNo;
uint32 ret;
#ifdef USE_ASSERT_CHECKING
XLogSegNo targetSegNo;
XLByteToSeg(targetPagePtr, targetSegNo);
#endif
targetPageOff = targetPagePtr % XLogSegSize;
* See if we need to switch to a new segment because the requested record
* is not in the currently open one.
*/
if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested &&
(t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) && !dummyStandbyMode) {
if (get_real_recovery_parallelism() > 1) {
XLByteToSeg(GetXLogReplayRecPtr(NULL), replayedSegNo);
} else {
replayedSegNo = t_thrd.xlog_cxt.readSegNo;
}
if (XLogCheckpointNeeded(replayedSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(replayedSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readSource = 0;
}
XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
XLByteAdvance(RecPtr, reqLen);
retry:
if (t_thrd.xlog_cxt.readFile < 0 ||
(t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM && XLByteLT(t_thrd.xlog_cxt.receivedUpto, RecPtr))) {
if (t_thrd.xlog_cxt.StandbyMode && t_thrd.xlog_cxt.startup_processing && !dummyStandbyMode) {
* In standby mode, wait for the requested record to become
* available, either via restore_command succeeding to restore the
* segment, or via walreceiver having streamed the record.
*/
for (;;) {
* Need to check here also for the case where consistency level is
* already reached without replaying any record i.e. just after reading
* of checkpoint data it has reached to minRecoveryPoint. Also
* whenever we are going to loop in the data receive from master
* node its bettwe we check if consistency level has reached. So
* instead of keeping in all places before ReadRecord, we can keep
* here in centralised location.
*/
ProcTxnWorkLoad(false);
CheckRecoveryConsistency();
if (WalRcvInProgress()) {
XLogRecPtr expectedRecPtr = RecPtr;
bool havedata = false;
if (CheckForStandbyTrigger()
#ifndef ENABLE_MULTIPLE_NODES
&& IsDCFReadyOrDisabled()
#endif
) {
SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL);
}
* If we find an invalid record in the WAL streamed from
* master, something is seriously wrong. There's little
* chance that the problem will just go away, but PANIC is
* not good for availability either, especially in hot
* standby mode. Disconnect, and retry from
* archive/pg_xlog again. The WAL in the archive should be
* identical to what was streamed, so it's unlikely that
* it helps, but one can hope...
*/
if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) {
ProcTxnWorkLoad(true);
ereport(LOG, (errmsg("read from stream failed, request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
continue;
}
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE() &&
!GetArchiveRecoverySlot() && !g_instance.attr.attr_storage.dcf_attr.enable_dcf &&
!IS_SHARED_STORAGE_MODE) {
StartupDataStreaming();
}
* Walreceiver is active, so see if new data has arrived.
*
* We only advance XLogReceiptTime when we obtain fresh
* WAL from walreceiver and observe that we had already
* processed everything before the most recent "chunk"
* that it flushed to disk. In steady state where we are
* keeping up with the incoming data, XLogReceiptTime will
* be updated on each cycle. When we are behind,
* XLogReceiptTime will not advance, so the grace time
* alloted to conflicting queries will decrease.
*/
if (RecPtr % XLogSegSize == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
} else if (RecPtr % XLOG_BLCKSZ == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
}
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
} else {
XLogRecPtr latestChunkStart;
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
if (!XLByteLT(RecPtr, latestChunkStart)) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
}
} else {
havedata = false;
}
}
if (g_instance.attr.attr_storage.enable_uwal && UwalAcquire()) {
if (havedata && targetPagePtr >= t_thrd.xlog_cxt.uwalInfo.info.truncateOffset) {
SetUwalReadInfo(targetPagePtr, t_thrd.xlog_cxt.receivedUpto, targetPageOff);
goto uwal_buf_read;
} else if (havedata && !GsUwalCheckFileRename(targetPagePtr)) {
goto retry;
}
}
if (havedata) {
* Great, streamed far enough. Open the file if it's
* not open already. Use XLOG_FROM_STREAM so that
* source info is set correctly and XLogReceiptTime
* isn't changed.
*/
if (t_thrd.xlog_cxt.readFile < 0) {
t_thrd.xlog_cxt.readFile = XLogFileRead(t_thrd.xlog_cxt.readSegNo, PANIC,
t_thrd.xlog_cxt.recoveryTargetTLI, XLOG_FROM_STREAM,
false);
Assert(t_thrd.xlog_cxt.readFile >= 0);
} else {
t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
}
break;
}
t_thrd.xlog_cxt.RedoDone = IsRedoDonePromoting();
pg_memory_barrier();
* The judging rules are the following:
* (1)For the One-Primary-Multi-Standbys deployment the dummy_status is set
* to be true which means no any dummy standby synchronization info need to be noticed;
* (2)For the Primary-Standby-DummyStandby deployment the dummy standby synchronization
* info need to be judged depending on enable_mix_replication.
*/
bool dummy_status = RecoveryFromDummyStandby()
? (GetWalRcvDummyStandbySyncPercent() == SYNC_DUMMY_STANDBY_END &&
(g_instance.attr.attr_storage.enable_mix_replication
? true
: GetDataRcvDummyStandbySyncPercent() == SYNC_DUMMY_STANDBY_END))
: true;
if (WalRcvIsDone() && CheckForFailoverTrigger() && dummy_status) {
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(NULL);
if (XLByteLT(RecPtr, t_thrd.xlog_cxt.receivedUpto)) {
continue;
}
ProcTxnWorkLoad(true);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
walrcv->dummyStandbyConnectFailed = false;
SpinLockRelease(&walrcv->mutex);
ereport(LOG, (errmsg("RecPtr(%X/%X),receivedUpto(%X/%X)", (uint32)(RecPtr >> 32),
(uint32)RecPtr, (uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
if (t_thrd.xlog_cxt.is_cascade_standby) {
ereport(LOG, (errmsg("Cascade Standby has synchronized all the xlog and replication data, "
"promote to standby")));
} else {
ereport(LOG,
(errmsg(
"Secondary Standby has synchronized all the xlog and replication data, standby "
"promote to primary")));
}
ShutdownWalRcv();
ShutdownDataRcv();
if (t_thrd.xlog_cxt.is_cascade_standby) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
} else {
goto triggered;
}
}
if (IS_SHARED_STORAGE_MODE) {
uint32 disableConnectionNode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (disableConnectionNode && WalRcvIsRunning()) {
ereport(LOG, (errmsg("request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
}
}
* Data not here yet, so check for trigger then sleep for
* five seconds like in the WAL file polling case below.
*/
if (WalRcvIsDone() && (CheckForSwitchoverTrigger() || CheckForFailoverTrigger())) {
if (t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
}
goto retry;
}
if (!processtrxn) {
ProcTxnWorkLoad(true);
processtrxn = true;
goto retry;
}
* Wait for more WAL to arrive, or timeout to be reached
*/
WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
processtrxn = false;
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
} else {
uint32 sources;
* Until walreceiver manages to reconnect, poll the
* archive.
*/
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
* Try to restore the file from archive, or read an
* existing file from pg_xlog.
*/
sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
ereport(DEBUG5, (errmsg("failedSources: %u", t_thrd.xlog_cxt.failedSources)));
if (!(sources & ~t_thrd.xlog_cxt.failedSources)) {
* We've exhausted all options for retrieving the
* file. Retry.
*/
t_thrd.xlog_cxt.failedSources = 0;
* Before we sleep, re-scan for possible new timelines
* if we were requested to recover to the latest
* timeline.
*/
if (t_thrd.xlog_cxt.recoveryTargetIsLatest) {
if (rescanLatestTimeLine()) {
continue;
}
}
if (t_thrd.startup_cxt.shutdown_requested) {
ereport(LOG, (errmsg("startup shutdown")));
proc_exit(0);
}
if (!xlogctl->IsRecoveryDone) {
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead IsRecoveryDone is set true,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
parallel_recovery::redo_dump_all_stats();
}
* signal postmaster to update local redo end
* point to gaussdb state file.
*/
ProcTxnWorkLoad(true);
if (!xlogctl->IsRecoveryDone) {
SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE);
}
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
static uint64 printFrequency = 0;
if (!(IS_SHARED_STORAGE_MODE) ||
pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) {
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
if ((printFrequency & 0xFF) == 0) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead set redo finish status,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
}
printFrequency++;
* If it hasn't been long since last attempt, sleep 1s to
* avoid busy-waiting.
*/
if (IS_SHARED_STORAGE_MODE) {
uint32 connMode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (connMode) {
pg_atomic_write_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node,
false);
}
pg_usleep(2000000L);
} else {
#ifdef ENABLE_LITE_MODE
pg_usleep(1000000L);
#else
pg_usleep(50000L);
#endif
}
}
* If primary_conninfo is set, launch walreceiver to
* try to stream the missing WAL, before retrying to
* restore from archive/pg_xlog.
*
* If fetching_ckpt is TRUE, RecPtr points to the
* initial checkpoint location. In that case, we use
* RedoStartLSN as the streaming start position
* instead of RecPtr, so that when we later jump
* backwards to start redo at RedoStartLSN, we will
* have the logs streamed already.
*/
load_server_mode();
if (CheckForFailoverTrigger()) {
if (t_thrd.xlog_cxt.is_cascade_standby) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN
: &targetRecPtr);
continue;
}
if (!RecoveryFromDummyStandby()) {
goto triggered;
}
ereport(LOG, (errmsg("Standby connect to DummyStandby begin ......")));
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
ereport(LOG, (errmsg("request xlog stream at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr,
t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_DUMMYSTANDBY,
u_sess->attr.attr_storage.PrimarySlotName);
if (!g_instance.attr.attr_storage.enable_mix_replication &&
!IS_DN_MULTI_STANDYS_MODE() && !g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StartupDataStreaming();
}
continue;
}
} else if (IS_OBS_DISASTER_RECOVER_MODE && !IsRoachRestore()) {
ProcTxnWorkLoad(false);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
ereport(LOG, (errmsg("request xlog stream from obs at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr, 0,
REPCONNTARGET_OBS, 0);
continue;
} else if (IS_SHARED_STORAGE_STANBY_MODE && !IS_SHARED_STORAGE_MAIN_STANDBY_MODE) {
ProcTxnWorkLoad(false);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
#ifndef ENABLE_MULTIPLE_NODES
rename_recovery_conf_for_roach();
#endif
ereport(LOG, (errmsg("request xlog stream from shared storage at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr, 0,
REPCONNTARGET_SHARED_STORAGE, 0);
continue;
} else if (t_thrd.xlog_cxt.PrimaryConnInfo || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
ProcTxnWorkLoad(false);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
CheckMaxPageFlushLSN(targetRecPtr);
#ifndef ENABLE_MULTIPLE_NODES
rename_recovery_conf_for_roach();
#endif
ereport(LOG, (errmsg("request xlog stream at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr,
t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY,
u_sess->attr.attr_storage.PrimarySlotName);
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE() &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StartupDataStreaming();
}
continue;
}
}
sources &= ~t_thrd.xlog_cxt.failedSources;
if (g_instance.attr.attr_storage.enable_uwal && UwalAcquire()) {
if (targetPagePtr >= t_thrd.xlog_cxt.uwalInfo.info.truncateOffset &&
(targetPagePtr + reqLen <= t_thrd.xlog_cxt.uwalInfo.info.writeOffset)) {
SetUwalReadInfo(targetPagePtr, t_thrd.xlog_cxt.uwalInfo.info.writeOffset, targetPageOff);
goto uwal_buf_read;
} else if (targetPagePtr < t_thrd.xlog_cxt.uwalInfo.info.truncateOffset &&
!GsUwalCheckFileRename(targetPagePtr)) {
goto retry;
}
}
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, DEBUG2, sources);
if (t_thrd.xlog_cxt.readFile >= 0) {
break;
}
ereport(DEBUG5, (errmsg("do not find any more files.sources=%u failedSources=%u", sources,
t_thrd.xlog_cxt.failedSources)));
* Nope, not found in archive and/or pg_xlog.
*/
t_thrd.xlog_cxt.failedSources |= sources;
* Check to see if the trigger file exists. Note that we
* do this only after failure, so when you create the
* trigger file, we still finish replaying as much as we
* can from archive and pg_xlog before failover.
*/
if (CheckForPrimaryTrigger()) {
goto triggered;
}
if ((CheckForFailoverTrigger() && !RecoveryFromDummyStandby()) || CheckForSwitchoverTrigger()) {
if (t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
}
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogRecPtr EndRecPtrTemp = t_thrd.xlog_cxt.EndRecPtr;
XLByteAdvance(EndRecPtrTemp, SizeOfXLogRecord);
if (XLByteLT(EndRecPtrTemp, receivedUpto) && !FORCE_FINISH_ENABLED &&
t_thrd.xlog_cxt.currentRetryTimes++ < g_retryTimes) {
ereport(WARNING, (errmsg("there are some received xlog have not been redo "
"the tail of last redo lsn:%X/%X, received lsn:%X/%X, retry %d times",
(uint32)(EndRecPtrTemp >> 32), (uint32)EndRecPtrTemp,
(uint32)(receivedUpto >> 32), (uint32)receivedUpto,
t_thrd.xlog_cxt.currentRetryTimes)));
return -1;
}
ereport(LOG,
(errmsg("read record failed when promoting, current lsn (%X/%X), received lsn(%X/%X),"
"sources[%u], failedSources[%u], readSource[%u], readFile[%d], readId[%u],"
"readSeg[%u], readOff[%u], readLen[%u]",
(uint32)(RecPtr >> 32), (uint32)RecPtr,
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto, sources, t_thrd.xlog_cxt.failedSources,
t_thrd.xlog_cxt.readSource, t_thrd.xlog_cxt.readFile,
(uint32)(t_thrd.xlog_cxt.readSegNo >> 32), (uint32)t_thrd.xlog_cxt.readSegNo,
t_thrd.xlog_cxt.readOff, t_thrd.xlog_cxt.readLen)));
goto triggered;
}
}
* This possibly-long loop needs to handle interrupts of
* startup process.
*/
RedoInterruptCallBack();
}
} else {
if (t_thrd.xlog_cxt.readFile < 0) {
uint32 sources;
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
sources = XLOG_FROM_PG_XLOG;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
sources |= XLOG_FROM_ARCHIVE;
}
if (g_instance.attr.attr_storage.enable_uwal && UwalAcquire() &&
t_thrd.xlog_cxt.uwalInfo.info.dataSize > 0) {
if (targetPagePtr >= t_thrd.xlog_cxt.uwalInfo.info.truncateOffset) {
if (targetPagePtr + reqLen > t_thrd.xlog_cxt.uwalInfo.info.writeOffset &&
0 == GsUwalQuery(&t_thrd.xlog_cxt.uwalInfo.id, &t_thrd.xlog_cxt.uwalInfo.info) &&
targetPagePtr + reqLen > t_thrd.xlog_cxt.uwalInfo.info.writeOffset) {
goto next_record_is_invalid;
}
t_thrd.xlog_cxt.curFileTLI = t_thrd.xlog_cxt.uwalInfo.info.startTimeLine;
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
t_thrd.xlog_cxt.readOff = targetPageOff;
goto uwal_buf_read;
} else if (!GsUwalCheckFileRename(targetPagePtr)) {
goto retry;
}
}
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
if (dummyStandbyMode) {
ereport(LOG,
(errmsg("readseg:%lu, readFile:%d", t_thrd.xlog_cxt.readSegNo, t_thrd.xlog_cxt.readFile)));
}
if (t_thrd.xlog_cxt.readFile < 0) {
return -1;
}
}
}
}
* At this point, we have the right segment open and if we're streaming we
* know the requested record is in it.
*/
Assert(t_thrd.xlog_cxt.readFile != -1);
* If the current segment is being streamed from master, calculate how
* much of the current page we have received already. We know the
* requested record has been received, but this is for the benefit of
* future calls, to allow quick exit at the top of this function.
*/
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM) {
if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
} else {
t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff;
}
} else {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
}
t_thrd.xlog_cxt.readOff = targetPageOff;
if (g_instance.attr.attr_storage.enable_uwal && UwalAcquire()) {
if (targetPagePtr >= t_thrd.xlog_cxt.uwalInfo.info.truncateOffset) {
if (targetPagePtr + reqLen > t_thrd.xlog_cxt.uwalInfo.info.writeOffset) {
if (0 == GsUwalQuery(&t_thrd.xlog_cxt.uwalInfo.id, &t_thrd.xlog_cxt.uwalInfo.info)) {
if (targetPagePtr + reqLen > t_thrd.xlog_cxt.uwalInfo.info.writeOffset) {
goto next_record_is_invalid;
}
}
}
goto uwal_buf_read;
} else if (!GsUwalCheckFileRename(targetPagePtr)) {
goto retry;
}
}
try_again:
if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff, TRANSLATE_ERRNO)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
pgstat_report_waitevent(WAIT_EVENT_WAL_READ);
ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ);
pgstat_report_waitevent(WAIT_EVENT_END);
if (ret != XLOG_BLCKSZ) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not read from log file %s to offset %u: %s",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff, TRANSLATE_ERRNO)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);
*readTLI = t_thrd.xlog_cxt.curFileTLI;
return t_thrd.xlog_cxt.readLen;
uwal_buf_read:
UwalPtr =
Min(Max(t_thrd.xlog_cxt.uwalInfo.info.writeOffset, t_thrd.xlog_cxt.receivedUpto), targetPagePtr + XLOG_BLCKSZ);
if (0 > UwalXLogPageRead(targetPagePtr, readBuf, UwalPtr)) {
if (0 == GsUwalQuery(&t_thrd.xlog_cxt.uwalInfo.id, &t_thrd.xlog_cxt.uwalInfo.info)) {
goto retry;
} else {
ereport(LOG, (errmsg("uwal_query return failed, goto next_record_is_invalid")));
goto next_record_is_invalid;
}
}
Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);
*readTLI = t_thrd.xlog_cxt.curFileTLI;
return t_thrd.xlog_cxt.readLen;
next_record_is_invalid:
t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
if (t_thrd.xlog_cxt.StandbyMode) {
goto retry;
} else {
return -1;
}
triggered:
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
t_thrd.xlog_cxt.recoveryTriggered = true;
return -1;
}
* Determine what log level should be used to report a corrupt WAL record
* in the current WAL page, previously read by XLogPageRead().
*
* 'emode' is the error mode that would be used to report a file-not-found
* or legitimate end-of-WAL situation. Generally, we use it as-is, but if
* we're retrying the exact same record that we've tried previously, only
* complain the first time to keep the noise down. However, we only do when
* reading from pg_xlog, because we don't expect any invalid records in archive
* or in records streamed from master. Files in the archive should be complete,
* and we should never hit the end of WAL because we stop and wait for more WAL
* to arrive before replaying it.
*
* NOTE: This function remembers the RecPtr value it was last called with,
* to suppress repeated messages about the same record. Only call this when
* you are about to ereport(), or you might cause a later message to be
* erroneously suppressed.
*/
int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
{
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_PG_XLOG && emode == LOG) {
if (XLByteEQ(RecPtr, t_thrd.xlog_cxt.lastComplaint)) {
emode = DEBUG1;
} else {
t_thrd.xlog_cxt.lastComplaint = RecPtr;
}
}
return emode;
}
* Check to see whether the user-specified trigger file exists and whether a
* failover request has arrived. If either condition holds, request postmaster
* to shut down walreceiver, wait for it to exit, and return true.
*/
bool CheckForFailoverTrigger(void)
{
struct stat stat_buf;
StringInfo slotname = NULL;
bool ret = false;
if (t_thrd.xlog_cxt.server_mode != STANDBY_MODE) {
ResetFailoverTriggered();
t_thrd.xlog_cxt.failover_triggered = false;
ret = false;
goto exit;
}
if (t_thrd.xlog_cxt.failover_triggered) {
ret = true;
goto exit;
}
slotname = get_rcv_slot_name();
Assert(slotname != NULL);
if (IsFailoverTriggered()) {
ereport(LOG, (errmsg("received failover notification")));
ShutdownWalRcv();
ShutdownDataRcv();
ResetFailoverTriggered();
t_thrd.xlog_cxt.failover_triggered = true;
* If database has a mass of bcm page, clear bcm page maybe very slowly,
* so we do not clear it.
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
if (!SS_DISASTER_MAIN_STANDBY_NODE) {
SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
}
ret = true;
goto exit;
}
if (t_thrd.xlog_cxt.TriggerFile == NULL) {
ret = false;
goto exit;
}
if (stat(t_thrd.xlog_cxt.TriggerFile, &stat_buf) == 0) {
ereport(LOG, (errmsg("trigger file found: %s", t_thrd.xlog_cxt.TriggerFile)));
ShutdownWalRcv();
ShutdownDataRcv();
unlink(t_thrd.xlog_cxt.TriggerFile);
t_thrd.xlog_cxt.failover_triggered = true;
* If database has a mass of bcm page, clear bcm page maybe very slowly,
* so we do not clear it.
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
ret = true;
}
exit:
if (slotname != NULL) {
if (slotname->data) {
pfree_ext(slotname->data);
}
pfree_ext(slotname);
}
return ret;
}
/*
 * Check whether a forced "finish redo" has been requested.
 *
 * The request is only honoured when the hotStdby and permitFinishRedo flags
 * are both ATOMIC_TRUE and FORCE_FINISH_ENABLED holds. In that case
 * *term_file is filled from GetTermFileDataAndClear() and its finish_redo
 * field is returned; otherwise *term_file is left untouched and false is
 * returned.
 */
bool CheckForForceFinishRedoTrigger(TermFileData *term_file)
{
    const uint32 permitFinish = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo));
    const uint32 hotStandby = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.hotStdby));

    const bool allowed = (hotStandby == ATOMIC_TRUE) && (permitFinish == ATOMIC_TRUE) && FORCE_FINISH_ENABLED;
    if (!allowed) {
        return false;
    }

    *term_file = GetTermFileDataAndClear();
    return term_file->finish_redo;
}
* Check to see whether the user-specified trigger file exists and whether a
* switchover request has arrived. If either condition holds, request postmaster
* to shut down walreceiver, wait for it to exit, and return true.
*/
bool CheckForSwitchoverTrigger(void)
{
if (t_thrd.xlog_cxt.switchover_triggered) {
return true;
}
if (IsSwitchoverTriggered()) {
StringInfo slotname = NULL;
ereport(LOG, (errmsg("received switchover notification")));
slotname = get_rcv_slot_name();
Assert(slotname != NULL);
ShutdownWalRcv();
ShutdownDataRcv();
ResetSwitchoverTriggered();
t_thrd.xlog_cxt.switchover_triggered = true;
* If database has a mass of bcm page, clear bcm page maybe very slowly,
* so we do not clear it.
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
if (slotname->data != NULL) {
pfree_ext(slotname->data);
}
pfree_ext(slotname);
SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
return true;
}
return false;
}
static bool CheckForPrimaryTrigger(void)
{
if (AmStartupProcess()) {
if (IsPrimaryTriggered()) {
ereport(LOG, (errmsg("received primary request")));
ResetPrimaryTriggered();
return true;
}
return false;
} else {
uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
if (tgigger == TRIGGER_PRIMARY) {
(void)pg_atomic_compare_exchange_u32(&g_startupTriggerState, &tgigger, TRIGGER_NORMAL);
return true;
}
}
return false;
}
bool CheckForStandbyTrigger(void)
{
if (AmStartupProcess()) {
if (IsStandbyTriggered()) {
ereport(LOG, (errmsg("received standby request")));
ResetStandbyTriggered();
return true;
}
return false;
} else {
uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
if (tgigger == TRIGGER_STADNBY) {
(void)pg_atomic_compare_exchange_u32(&(g_startupTriggerState), &tgigger, TRIGGER_NORMAL);
return true;
}
}
return false;
}
/*
 * Read the "finish_redo_file" signal file, if present, and report whether it
 * requests a forced finish of redo.
 *
 * The file is expected to hold one bool-sized value. It is removed after it
 * has been opened, whether or not its content could be read. Returns false
 * when the file cannot be opened or is too short; otherwise returns true iff
 * the stored value is non-zero.
 */
bool CheckFinishRedoSignal(void)
{
    /*
     * Read into a raw byte buffer rather than straight into a bool: the file
     * is external input, and a bool object holding anything other than 0 or 1
     * is not a valid value.
     */
    unsigned char raw[sizeof(bool)] = {0};
    bool is_finish_redo = false;
    ssize_t cnt = 0;
    int fd = open("finish_redo_file", O_RDONLY, S_IRUSR);
    if (fd < 0) {
        return false;
    }

    cnt = read(fd, raw, sizeof(raw));
    if (close(fd)) {
        ereport(LOG, (errcode_for_file_access(), errmsg("cannot close finish redo file.\n")));
    }
    if (cnt != (ssize_t)sizeof(raw)) {
        ereport(LOG, (0, errmsg("cannot read finish redo file.\n")));
        (void)unlink("finish_redo_file");
        return false;
    }

    /* Any non-zero byte means "true"; for a well-formed file this equals the stored bool. */
    for (size_t i = 0; i < sizeof(raw); i++) {
        if (raw[i] != 0) {
            is_finish_redo = true;
            break;
        }
    }
    ereport(LOG, (errmsg("CheckFinishRedoSignal read is_finish_redo successfully, forced_finishredo: %d\n",
                         is_finish_redo)));

    /* The signal file is consumed once read; an unlink failure is ignored. */
    (void)unlink("finish_redo_file");
    return is_finish_redo;
}
/*
 * Translate the pending startup trigger flags of this thread into an
 * Enum_TriggeredState value.
 *
 * The flags are tested in priority order primary, standby, failover,
 * switchover; the first one found set is logged, reset and reported.
 * Returns TRIGGER_NORMAL when none is set.
 */
Enum_TriggeredState CheckForSatartupStatus(void)
{
    Enum_TriggeredState state = TRIGGER_NORMAL;

    if (t_thrd.startup_cxt.primary_triggered) {
        ereport(LOG, (errmsg("received primary request")));
        ResetPrimaryTriggered();
        state = TRIGGER_PRIMARY;
    } else if (t_thrd.startup_cxt.standby_triggered) {
        ereport(LOG, (errmsg("received standby request")));
        ResetStandbyTriggered();
        state = TRIGGER_STADNBY;
    } else if (t_thrd.startup_cxt.failover_triggered) {
        ereport(LOG, (errmsg("received failover request")));
        ResetFailoverTriggered();
        state = TRIGGER_FAILOVER;
    } else if (t_thrd.startup_cxt.switchover_triggered) {
        ereport(LOG, (errmsg("received switchover request")));
        ResetSwitchoverTriggered();
        /* NOTE(review): a switchover is reported as TRIGGER_FAILOVER - confirm this is intended. */
        state = TRIGGER_FAILOVER;
    }

    return state;
}
* Check to see if a promote request has arrived. Should be
* called by postmaster after receiving SIGUSR1.
*/
bool CheckPromoteSignal(void)
{
uint32 Term = g_InvalidTermDN;
int cnt = 0;
int fp = -1;
if ((fp = open("term_file", O_RDONLY, S_IRUSR)) >= 0) {
cnt = read(fp, &Term, sizeof(uint32));
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close term file when checking promote signal.\n")));
}
if (cnt != sizeof(uint32)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot read term file.\n")));
return false;
}
}
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file &&
t_thrd.postmaster_cxt.HaShmData->current_mode == PRIMARY_MODE) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
}
struct stat stat_buf;
if (stat(FAILOVER_SIGNAL_FILE, &stat_buf) == 0) {
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
unlink(FAILOVER_SIGNAL_FILE);
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
g_instance.comm_cxt.localinfo_cxt.set_term = false;
ereport(LOG, (errmsg("CheckPromoteSignal read term file, term %d\n", Term)));
}
return true;
}
if (CheckPostmasterSignal(PMSIGNAL_PROMOTE_STANDBY)) {
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
g_instance.comm_cxt.localinfo_cxt.set_term = false;
ereport(LOG, (errmsg("CheckPromoteSignal read term file, term %d\n", Term)));
}
MatviewShmemSetInvalid();
return true;
}
return false;
}
/*
 * Build a TermFileData snapshot from the instance-level state.
 *
 * term is the larger of term_from_file and term_from_xlog. finish_redo
 * reflects the is_finish_redo flag (true iff it reads 1). If a forced finish
 * is requested but local redo has already finished, the request is dropped:
 * a WARNING is logged, finish_redo is reported as false and the shared flag
 * is cleared to 0.
 */
TermFileData GetTermFileDataAndClear(void)
{
    TermFileData result;

    result.term = Max(g_instance.comm_cxt.localinfo_cxt.term_from_file,
                      g_instance.comm_cxt.localinfo_cxt.term_from_xlog);

    const uint32 finishFlag = pg_atomic_read_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo));
    result.finish_redo = (finishFlag == 1);

    /* Nothing to clear unless a forced finish collides with an already finished local redo. */
    if (!result.finish_redo || !knl_g_get_local_redo_finish_status()) {
        return result;
    }

    ereport(WARNING, (0, errmsg("GetTermFileData: skip force finish redo when local redo finish")));
    result.finish_redo = false;
    pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
    return result;
}
* Check whether the signal is primary signal
*/
bool CheckPrimarySignal(void)
{
struct stat stat_buf;
uint32 term = g_InvalidTermDN;
int cnt = 0;
int fp = -1;
if ((fp = open("term_file", O_RDONLY, S_IRUSR)) >= 0) {
cnt = read(fp, &term, sizeof(uint32));
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close term file when checking primary signal.\n")));
}
if (cnt != sizeof(uint32)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot read term file for notify primary.\n")));
return false;
}
}
if (stat(PRIMARY_SIGNAL_FILE, &stat_buf) == 0) {
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
(void)unlink(PRIMARY_SIGNAL_FILE);
if (term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = term;
ereport(LOG, (errmsg("CheckPrimarySignal term %u\n", term)));
}
return true;
}
return false;
}
* Check whether the signal is standby signal
*/
bool CheckStandbySignal(void)
{
struct stat stat_buf;
if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) {
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from unlink.
*/
(void)unlink(STANDBY_SIGNAL_FILE);
return true;
}
return false;
}
* Check whether the signal is cascade standby signal
*/
bool CheckCascadeStandbySignal(void)
{
struct stat stat_buf;
if (stat(CASCADE_STANDBY_SIGNAL_FILE, &stat_buf) == 0) {
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from unlink.
*/
(void)unlink(CASCADE_STANDBY_SIGNAL_FILE);
return true;
}
return false;
}
* Check to see if a switchover request has arrived and
* read demote mode from switchover signal file.
* Should be called by postmaster after receiving SIGUSR1.
*/
int CheckSwitchoverSignal(void)
{
int mode = 0;
int switchfile = -1;
int returnCode;
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from fopen/fread/fclose/unlink.
*/
switchfile = open(SWITCHOVER_SIGNAL_FILE, O_RDONLY, S_IRUSR);
if (switchfile < 0) {
return mode;
}
returnCode = read(switchfile, &mode, sizeof(int));
if (returnCode != sizeof(int)) {
mode = 0;
}
if (mode < SmartDemote || mode > ExtremelyFast) {
mode = 0;
}
if (close(switchfile)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close switchfile.\n")));
}
(void)unlink(SWITCHOVER_SIGNAL_FILE);
return mode;
}
* Check to see if a switchover timeout signal has arrived.
* Should be called by postmaster after receiving SIGUSR1.
*/
bool CheckSwitchoverTimeoutSignal(void)
{
int fp = -1;
if ((fp = open(SWITCHOVER_TIMEOUT_SIGNAL_FILE, O_RDONLY, S_IRUSR)) >= 0) {
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close switchover timeout file.\n")));
}
unlink(SWITCHOVER_TIMEOUT_SIGNAL_FILE);
return true;
} else {
return false;
}
}
* Wake up startup process to replay newly arrived WAL, or to notice that
* failover has been requested.
*/
void WakeupRecovery(void)
{
SetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
if (IsExtremeRedo() && u_sess->attr.attr_storage.recovery_min_apply_delay > 0) {
SetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch);
}
}
void WakeupDataRecovery(void)
{
SetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
}
* Update the WalWriterSleeping flag.
*/
void SetWalWriterSleeping(bool sleeping)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->WalWriterSleeping = sleeping;
SpinLockRelease(&xlogctl->info_lck);
}
* CloseXlogFilesAtThreadExit
* Close opened xlog files at thread exit time
*/
void CloseXlogFilesAtThreadExit(void)
{
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
closeXLogRead();
}
void StartupDataStreaming(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
if (IS_SINGLE_NODE && !IS_DN_DUMMY_STANDYS_MODE())
return;
if (xlogctl->IsRecoveryDone && !DataRcvInProgress()) {
load_server_mode();
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE && !CheckForSwitchoverTrigger()) {
if (CheckForFailoverTrigger()) {
if (GetDataRcvDummyStandbySyncPercent() != SYNC_DUMMY_STANDBY_END) {
RequestDataStreaming(NULL, REPCONNTARGET_DUMMYSTANDBY);
}
} else {
RequestDataStreaming(NULL, REPCONNTARGET_PRIMARY);
}
}
}
}
* load the current ongoing mode from the HaShmData
*/
void load_server_mode(void)
{
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
SpinLockAcquire(&hashmdata->mutex);
t_thrd.xlog_cxt.server_mode = hashmdata->current_mode;
t_thrd.xlog_cxt.is_cascade_standby = hashmdata->is_cascade_standby;
t_thrd.xlog_cxt.is_hadr_main_standby = hashmdata->is_hadr_main_standby;
SpinLockRelease(&hashmdata->mutex);
}
* Get the current ongoing mode from the HaShmData
*/
bool IsServerModeStandby()
{
load_server_mode();
return (t_thrd.xlog_cxt.server_mode == STANDBY_MODE);
}
* Calculate difference between two lsns.
*/
uint64 XLogDiff(XLogRecPtr end, XLogRecPtr start)
{
int64 len = 0;
if (XLByteLT(end, start)) {
return 0;
}
len = end - start;
return len;
}
* get and set walsender EndRecPtr for dummy standby.
*/
static void SetDummyStandbyEndRecPtr(XLogReaderState *xlogreader)
{
XLogRecord *record = NULL;
do {
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
} while (record != NULL);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
if (XLByteLT(walrcv->receivedUpto, t_thrd.xlog_cxt.EndRecPtr))
walrcv->receivedUpto = t_thrd.xlog_cxt.EndRecPtr;
SpinLockRelease(&walrcv->mutex);
}
* get the crc of the xlog record
*/
pg_crc32 GetXlogRecordCrc(XLogRecPtr RecPtr, bool &crcvalid, XLogPageReadCB pagereadfunc, Size bufAlignSize)
{
XLogReaderState *xlogreader = NULL;
XLogPageReadPrivate readprivate;
pg_crc32 reccrc = 0;
TimeLineID tli = 0;
XLogRecord *rec = NULL;
char *errorMsg = NULL;
errno_t rc = EOK;
if (!t_thrd.xlog_cxt.expectedTLIs && t_thrd.shemem_ptr_cxt.XLogCtl && t_thrd.shemem_ptr_cxt.ControlFile) {
tli = t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID;
t_thrd.xlog_cxt.expectedTLIs = list_make1_int((int)tli);
tli = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID;
if (!list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)tli)) {
t_thrd.xlog_cxt.expectedTLIs = lcons_int((int)tli, t_thrd.xlog_cxt.expectedTLIs);
}
}
if (!XRecOffIsValid(RecPtr)) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("invalid record offset at %X/%X.", (uint32)(RecPtr >> 32), (uint32)RecPtr)));
}
rc = memset_s(&readprivate, sizeof(XLogPageReadPrivate), 0, sizeof(XLogPageReadPrivate));
securec_check(rc, "\0", "\0");
if (ENABLE_DMS && ENABLE_DSS) {
xlogreader = SSXLogReaderAllocate(pagereadfunc, &readprivate, ALIGNOF_BUFFER);
} else {
xlogreader = XLogReaderAllocate(pagereadfunc, &readprivate, bufAlignSize);
}
if (xlogreader == NULL) {
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
errdetail("Failed while allocating an XLog reading processor")));
}
xlogreader->system_identifier = GetSystemIdentifier();
if (dummyStandbyMode) {
rec = ReadRecord(xlogreader, RecPtr, LOG, false);
} else {
rec = XLogReadRecord(xlogreader, RecPtr, &errorMsg, true);
}
crcvalid = true;
if (rec != NULL) {
reccrc = rec->xl_crc;
if (dummyStandbyMode) {
SetDummyStandbyEndRecPtr(xlogreader);
}
} else {
crcvalid = false;
}
ShutdownReadFileFacility();
XLogReaderFree(xlogreader);
xlogreader = NULL;
return reccrc;
}
/*
 * Release the thread-local read state: close readFile if it is open and free
 * the expectedTLIs list if one exists. Both are reset afterwards.
 */
static void ShutdownReadFileFacility(void)
{
    if (t_thrd.xlog_cxt.readFile >= 0) {
        (void)close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
    }

    if (t_thrd.xlog_cxt.expectedTLIs != NULL) {
        list_free(t_thrd.xlog_cxt.expectedTLIs);
        t_thrd.xlog_cxt.expectedTLIs = NULL;
    }
}
bool CheckFpwBeforeFirstCkpt(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
return xlogctl->FpwBeforeFirstCkpt;
}
void DisableFpwBeforeFirstCkpt(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->FpwBeforeFirstCkpt = false;
SpinLockRelease(&xlogctl->info_lck);
}
* calculate the LsnXlogFlushData file size
*/
Size LsnXlogFlushChkShmemSize(void)
{
Size size = sizeof(LsnXlogFlushData);
size = MAXALIGN(size);
return size;
}
/*
 * Return the timeline field of the shared control file structure.
 */
uint32 get_controlfile_timeline(void)
{
    uint32 timeline = t_thrd.shemem_ptr_cxt.ControlFile->timeline;

    return timeline;
}
* initital the LsnXlogFlushData share memory
*/
void LsnXlogFlushChkShmInit(void)
{
bool found = false;
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile = (LsnXlogFlushData *)ShmemInitStruct("LsnXlogFlushChk File",
sizeof(LsnXlogFlushData), &found);
if (t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile == NULL) {
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("not enough shared memory for pg_lsnxlogflushchk share memory")));
}
if (!found) {
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->localLsnFlushPoint = 0;
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->peerXlogFlushPoint = 0;
INIT_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc);
COMP_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc,
(char *)t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile, offsetof(LsnXlogFlushData, crc));
FIN_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc);
}
return;
}
/*
 * Save the CU data carried by a logical-newpage record.
 *
 * A CFileNode is built from `node` plus the record's fork number and attribute
 * id; a temporary CUStorage on CurrentMemoryContext then saves cuData at
 * xlrec->offset with length xlrec->blockSize and is destroyed afterwards.
 */
void heap_xlog_bcm_new_page(xl_heap_logical_newpage *xlrec, RelFileNode node, char *cuData)
{
    CFileNode targetFile(node);
    targetFile.m_forkNum = xlrec->forknum;
    targetFile.m_attid = xlrec->attid;

    CUStorage *storage = New(CurrentMemoryContext) CUStorage(targetFile);
    storage->SaveCU(cuData, xlrec->offset, xlrec->blockSize, false);
    DELETE_EX(storage);
}
/*
 * Replay a logical-newpage record that carries CU data.
 *
 * Returns true when the record is of type COLUMN_STORE and has data, in which
 * case the data following the fixed-size header is passed to
 * heap_xlog_bcm_new_page(). Returns false without doing anything otherwise.
 */
static bool heap_xlog_logical_new_page_with_data(XLogReaderState *record)
{
    xl_heap_logical_newpage *newpage = (xl_heap_logical_newpage *)XLogRecGetData(record);

    if (newpage->type != COLUMN_STORE || !newpage->hasdata) {
        return false;
    }

    /* The CU payload starts right after the fixed-size record header. */
    char *cuData = XLogRecGetData(record) + SizeOfHeapLogicalNewPage;

    RelFileNode cuNode;
    RelFileNodeCopy(cuNode, newpage->node, (int16)XLogRecGetBucketId(record));
    Assert(XLogRecGetBucketId(record) == InvalidBktId);

    heap_xlog_bcm_new_page(newpage, cuNode, cuData);
    return true;
}
/*
 * Replay entry point for a logical-newpage record.
 *
 * In multi-standby mode the CU data in the record is replayed and nothing else
 * happens. Otherwise, when mix replication and DCF are both off, data
 * streaming is started unless tsdb is enabled and the record's data was
 * replayed successfully.
 */
void heap_xlog_logical_new_page(XLogReaderState *record)
{
    /*
     * One primary, multiple standbys with enable_mix_replication off: the CU
     * xlog has to be replayed.
     * NO ereport(ERROR) from here till newpage op is logged
     */
    if (IS_DN_MULTI_STANDYS_MODE()) {
        (void)heap_xlog_logical_new_page_with_data(record);
        return;
    }

    if (g_instance.attr.attr_storage.enable_mix_replication || IS_DN_MULTI_STANDYS_MODE() ||
        g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
        return;
    }

    /* With tsdb enabled, a record that carried data needs no streaming. */
    if (g_instance.attr.attr_common.enable_tsdb && heap_xlog_logical_new_page_with_data(record)) {
        return;
    }

    StartupDataStreaming();
}
* Returns true if 'expectedTLEs' contains a timeline with id 'tli'
*/
bool timeLineInHistory(TimeLineID tli, List *expectedTLEs)
{
ListCell *cell = NULL;
foreach (cell, expectedTLEs) {
if ((TimeLineID)lfirst_int(cell) == tli) {
return true;
}
}
return false;
}
/*
 * Store trackedLSN into XLogCtl->cbmTrackedLSN under info_lck and log the
 * new value at LOG level.
 */
void SetCBMTrackedLSN(XLogRecPtr trackedLSN)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    uint32 lsnHigh = (uint32)(trackedLSN >> 32);
    uint32 lsnLow = (uint32)trackedLSN;

    SpinLockAcquire(&ctl->info_lck);
    ctl->cbmTrackedLSN = trackedLSN;
    SpinLockRelease(&ctl->info_lck);

    ereport(LOG, (errmsg("set CBM tracked LSN point: %08X/%08X ", lsnHigh, lsnLow)));
}
/*
 * Read XLogCtl->cbmTrackedLSN under info_lck, log it at DEBUG1 and return it.
 */
XLogRecPtr GetCBMTrackedLSN(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    XLogRecPtr tracked = ctl->cbmTrackedLSN;
    SpinLockRelease(&ctl->info_lck);

    ereport(DEBUG1, (errmsg("get CBM tracked LSN point: %08X/%08X ", (uint32)(tracked >> 32), (uint32)tracked)));
    return tracked;
}
/*
 * Raise XLogCtl->cbmMaxRequestRotateLsn to rotate_lsn if it is currently
 * smaller.
 *
 * The update is a compare-and-exchange retry loop: it stops as soon as the
 * observed value is no longer below rotate_lsn or the exchange succeeds.
 * The requested value is logged at LOG level in either case.
 */
void SetCBMRotateLsn(XLogRecPtr rotate_lsn)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    XLogRecPtr observed = GetCBMRotateLsn();

    while (XLByteLT(observed, rotate_lsn)) {
        /* On failure the exchange call is handed &observed so it can be re-checked on the next pass. */
        if (pg_atomic_compare_exchange_u64(&ctl->cbmMaxRequestRotateLsn, &observed, rotate_lsn)) {
            break;
        }
    }

    ereport(LOG, (errmsg("set CBM SetCBMRotateLsn point: %08X/%08X ", (uint32)(rotate_lsn >> 32), (uint32)rotate_lsn)));
}
/*
 * Atomically read and return XLogCtl->cbmMaxRequestRotateLsn.
 */
XLogRecPtr GetCBMRotateLsn(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    return pg_atomic_read_u64(&ctl->cbmMaxRequestRotateLsn);
}
/*
 * Store currCbmNameStartLsn into XLogCtl->currCbmFileStartLsn under info_lck
 * and log it at DEBUG1.
 */
void SetCBMFileStartLsn(XLogRecPtr currCbmNameStartLsn)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    uint32 lsnHigh = (uint32)(currCbmNameStartLsn >> 32);
    uint32 lsnLow = (uint32)currCbmNameStartLsn;

    SpinLockAcquire(&ctl->info_lck);
    ctl->currCbmFileStartLsn = currCbmNameStartLsn;
    SpinLockRelease(&ctl->info_lck);

    ereport(DEBUG1, (errmsg("set CBM SetCBMFileStartLsn %08X/%08X ", lsnHigh, lsnLow)));
}
/*
 * Read XLogCtl->currCbmFileStartLsn under info_lck and return it.
 */
XLogRecPtr GetCBMFileStartLsn()
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    XLogRecPtr fileStartLsn = ctl->currCbmFileStartLsn;
    SpinLockRelease(&ctl->info_lck);

    return fileStartLsn;
}
void SetDelayXlogRecycle(bool toDelay, bool isRedo)
{
XLogRecPtr recptr;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->delayXlogRecycle = toDelay;
SpinLockRelease(&xlogctl->info_lck);
if (isRedo == false && t_thrd.proc->workingVersionNum >= 92233) {
START_CRIT_SECTION();
XLogBeginInsert();
XLogRegisterData((char *)(&toDelay), sizeof(toDelay));
recptr = XLogInsert(RM_XLOG_ID, XLOG_DELAY_XLOG_RECYCLE);
XLogWaitFlush(recptr);
END_CRIT_SECTION();
}
ereport(DEBUG5, (errmsg("set delay xlog recycle to %s", toDelay ? "true" : "false")));
}
bool GetDelayXlogRecycle(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool toDelay = false;
SpinLockAcquire(&xlogctl->info_lck);
toDelay = xlogctl->delayXlogRecycle;
SpinLockRelease(&xlogctl->info_lck);
ereport(DEBUG5, (errmsg("get delay xlog recycle of %s", toDelay ? "true" : "false")));
return toDelay;
}
/*
 * Store ddlDelayStartPtr into XLogCtl->ddlDelayStartPtr under info_lck and
 * log it at LOG level.
 */
void SetDDLDelayStartPtr(XLogRecPtr ddlDelayStartPtr)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    uint32 lsnHigh = (uint32)(ddlDelayStartPtr >> 32);
    uint32 lsnLow = (uint32)ddlDelayStartPtr;

    SpinLockAcquire(&ctl->info_lck);
    ctl->ddlDelayStartPtr = ddlDelayStartPtr;
    SpinLockRelease(&ctl->info_lck);

    ereport(LOG, (errmsg("set DDL delay start LSN point: %08X/%08X ", lsnHigh, lsnLow)));
}
/*
 * Return the DDL delay start LSN.
 *
 * The base value comes from ReplicationSlotsComputeConfirmedLSN(). When the
 * thread has a proc whose working version is below BACKUP_SLOT_VERSION_NUM,
 * the value stored in XLogCtl->ddlDelayStartPtr is used instead if it is
 * smaller. The result is logged at DEBUG5.
 */
XLogRecPtr GetDDLDelayStartPtr(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    XLogRecPtr result = ReplicationSlotsComputeConfirmedLSN();

    if (t_thrd.proc != NULL && t_thrd.proc->workingVersionNum < BACKUP_SLOT_VERSION_NUM) {
        SpinLockAcquire(&ctl->info_lck);
        XLogRecPtr storedStart = ctl->ddlDelayStartPtr;
        SpinLockRelease(&ctl->info_lck);

        if (XLByteLT(storedStart, result)) {
            result = storedStart;
        }
    }

    ereport(DEBUG5, (errmsg("get DDL delay start LSN: %08X/%08X ", (uint32)(result >> 32), (uint32)result)));
    return result;
}
/*
 * Report whether the current recovery is a roach restore.
 *
 * True only while InRecovery is set and either the recovery target is a
 * barrier whose id starts with ROACH_BACKUP_PREFIX, or the instance role is
 * VSINGLENODE with roach_cxt.isRoachRestore set.
 */
bool IsRoachRestore(void)
{
    if (!t_thrd.xlog_cxt.InRecovery) {
        return false;
    }

    bool targetIsRoachBarrier =
        t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER &&
        t_thrd.xlog_cxt.recoveryTargetBarrierId != NULL &&
        strncmp(t_thrd.xlog_cxt.recoveryTargetBarrierId, ROACH_BACKUP_PREFIX, strlen(ROACH_BACKUP_PREFIX)) == 0;
    if (targetIsRoachBarrier) {
        return true;
    }

    return (g_instance.role == VSINGLENODE && g_instance.roach_cxt.isRoachRestore);
}
/*
 * Multiplier applied to XLogBaseSize to form the distance threshold used when
 * deciding whether the dirty page queue recLSN should be advanced without a
 * forced update.
 */
const uint UPDATE_REC_XLOG_NUM = 4;
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
/*
 * Advance dirty_page_queue_reclsn to current_insert_lsn with a 128-bit
 * compare-and-swap.
 *
 * The update happens only when current_insert_lsn is ahead of the stored
 * recLSN and either need_immediately_update is set or the gap exceeds
 * XLogBaseSize * UPDATE_REC_XLOG_NUM. The upper 64 bits of the 128-bit word
 * are carried over unchanged. Returns true when the value was swapped in,
 * false when no update was needed.
 */
bool atomic_update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_immediately_update)
{
    uint128_u expected;
    uint128_u desired;
    uint128_u observed;

    expected = atomic_compare_and_swap_u128((uint128_u *)&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn);
    Assert(sizeof(g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn) == SIZE_OF_UINT64);
    Assert(sizeof(g_instance.ckpt_cxt_ctl->dirty_page_queue_tail) == SIZE_OF_UINT64);

    for (;;) {
        XLogRecPtr queueRecLsn = expected.u64[0];
        bool shouldAdvance =
            !XLByteLE(current_insert_lsn, queueRecLsn) &&
            (need_immediately_update || current_insert_lsn - queueRecLsn > XLogBaseSize * UPDATE_REC_XLOG_NUM);
        if (!shouldAdvance) {
            return false;
        }

        desired.u64[0] = current_insert_lsn;
        desired.u64[1] = expected.u64[1];
        observed = atomic_compare_and_swap_u128((uint128_u *)&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn,
                                                expected, desired);
        if (UINT128_IS_EQUAL(expected, observed)) {
            return true;
        }

        /* Someone else changed the word; retry against the value just observed. */
        UINT128_COPY(expected, observed);
    }
}
#endif
* @Description: Push dirty buffer to dirty page queue, need ensure the recLSN incrementally. when is
* not in recovery mode, XLog insert or do full checkpoint will update the dirty page
* queue recLSN. when is in recovery mode, redo checkpoint XLog, update the dirty page
* queue recLSN to checkpoint's redo lsn.
*/
void update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_immediately_update)
{
bool is_update = false;
uint32 freespace;
if (unlikely(!ENABLE_INCRE_CKPT)) {
return;
}
freespace = INSERT_FREESPACE(current_insert_lsn);
if (freespace == 0) {
if (current_insert_lsn % XLogSegSize == 0) {
current_insert_lsn += SizeOfXLogLongPHD;
} else {
current_insert_lsn += SizeOfXLogShortPHD;
}
}
#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
is_update = atomic_update_dirty_page_queue_rec_lsn(current_insert_lsn, need_immediately_update);
#else
SpinLockAcquire(&g_instance.ckpt_cxt_ctl->queue_lock);
if (!XLByteLE(current_insert_lsn, g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn) &&
(need_immediately_update ||
current_insert_lsn - g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn > XLogBaseSize * UPDATE_REC_XLOG_NUM)) {
g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn = current_insert_lsn;
is_update = true;
}
SpinLockRelease(&g_instance.ckpt_cxt_ctl->queue_lock);
#endif
return;
}
/*
 * Return the current dirty page queue recLSN.
 *
 * Uses an atomic barrier read on the atomic build path and a read under
 * queue_lock otherwise.
 */
uint64 get_dirty_page_queue_rec_lsn()
{
    uint64 recLsn = 0;

#if defined(__x86_64__) || defined(__aarch64__) && !defined(__USE_SPINLOCK)
    recLsn = pg_atomic_barrier_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn);
#else
    SpinLockAcquire(&g_instance.ckpt_cxt_ctl->queue_lock);
    recLsn = g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn;
    SpinLockRelease(&g_instance.ckpt_cxt_ctl->queue_lock);
#endif

    return recLsn;
}
/*
 * @Description: get the recLSN of the first page in the dirty page queue.
 * @return: dirty page queue first valid recLSN, or InvalidXLogRecPtr when the
 *          queue holds no dirty page or no valid entry is found before the tail.
 */
XLogRecPtr ckpt_get_min_rec_lsn(void)
{
uint64 queue_loc;
XLogRecPtr min_rec_lsn = InvalidXLogRecPtr;
/*
 * If head recLSN is Invalid, then add head, get next buffer recLSN, if head equal tail,
 * return InvalidXLogRecPtr.
 */
if (get_dirty_page_num() == 0) {
return InvalidXLogRecPtr;
}
queue_loc = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
/* Walk from the head towards the tail until a usable recLSN has been picked up. */
while (XLogRecPtrIsInvalid(min_rec_lsn) && (queue_loc < get_dirty_page_queue_tail())) {
Buffer buffer;
BufferDesc *buf_desc = NULL;
XLogRecPtr page_rec_lsn = InvalidXLogRecPtr;
/* The queue is indexed modulo dirty_page_queue_size. */
uint64 temp_loc = queue_loc % g_instance.ckpt_cxt_ctl->dirty_page_queue_size;
volatile DirtyPageQueueSlot *slot = &g_instance.ckpt_cxt_ctl->dirty_page_queue[temp_loc];
/* Slot not marked SLOT_VALID yet: sleep briefly and restart from the current head. */
if (!(pg_atomic_read_u32(&slot->slot_state) & SLOT_VALID)) {
pg_usleep(1);
queue_loc = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
continue;
}
/* Barrier between reading slot_state and reading the slot's buffer. */
pg_memory_barrier();
queue_loc++;
buffer = slot->buffer;
/* Slot holds no buffer: move on to the next location. */
if (BufferIsInvalid(buffer)) {
continue;
}
buf_desc = GetBufferDescriptor(buffer - 1);
page_rec_lsn = pg_atomic_read_u64(&buf_desc->extra->rec_lsn);
/* Accept the recLSN only if the slot still holds a valid buffer after the read. */
if (!BufferIsInvalid(slot->buffer)) {
min_rec_lsn = page_rec_lsn;
}
}
return min_rec_lsn;
}
void WaitCheckpointSync(void)
{
if (t_thrd.shemem_ptr_cxt.ControlFile != NULL &&
u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogRecPtr checkpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
LWLockRelease(ControlFileLock);
if (!IsInitdb && g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
SyncPaxosWaitForLSN(checkpoint);
} else {
SyncRepWaitForLSN(checkpoint);
}
}
}
void GetRecoveryLatch()
{
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
void ReLeaseRecoveryLatch()
{
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
* Reset latch before startup process delay recovery.
*/
void ResetRecoveryDelayLatch() {
if (IsExtremeRedo()) {
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch);
} else {
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
}
* Let startup process wait for a while until waitTime ms or other threads set latch.
*/
void WaitRecoveryDelayLatch(long waitTime) {
if (IsExtremeRedo()) {
(void)WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupDelayLatch, WL_LATCH_SET | WL_TIMEOUT, waitTime);
} else {
(void)WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, waitTime);
}
}
/*
 * Load the per-page LSNs from buf into predo_cxt.max_page_flush_lsn[] and
 * return the largest one.
 *
 * buf is read as MPFL_PAGE_NUM pages of BLCKSZ bytes, each starting with an
 * XLogRecPtr. Returns 0 when every page holds 0.
 */
XLogRecPtr mpfl_get_max_lsn_from_buf(char *buf)
{
    XLogRecPtr largest = 0;

    Assert(buf != NULL);

    for (uint32 pageNo = 0; pageNo < MPFL_PAGE_NUM; ++pageNo) {
        XLogRecPtr pageLsn = *(XLogRecPtr *)(buf + pageNo * BLCKSZ);

        pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[pageNo]), pageLsn);
        if (XLByteLT(largest, pageLsn)) {
            largest = pageLsn;
        }
    }

    return largest;
}
/*
 * Read MAX_PAGE_FLUSH_LSN_FILE into predo_cxt.ali_buf and return the largest
 * LSN stored in it.
 *
 * Returns MAX_XLOG_REC_PTR when the file does not exist, cannot be opened,
 * cannot be read in full, or when the largest LSN found is 0.
 */
XLogRecPtr mpfl_read_max_flush_lsn()
{
    if (!file_exists(MAX_PAGE_FLUSH_LSN_FILE)) {
        ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO), errmsg("mpfl file does not exist")));
        return MAX_XLOG_REC_PTR;
    }

    int mpflFd = open(MAX_PAGE_FLUSH_LSN_FILE, MPFL_FILE_FLAG, MPFL_FILE_PERM);
    if (mpflFd == -1) {
        ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_DW),
                          errmsg("mpfl could not open file \"%s\": %s",
                                 MAX_PAGE_FLUSH_LSN_FILE, TRANSLATE_ERRNO)));
        return MAX_XLOG_REC_PTR;
    }

    char *fileBuf = g_instance.comm_cxt.predo_cxt.ali_buf;
    Assert(fileBuf != NULL);

    XLogRecPtr maxFlushLsn = MAX_XLOG_REC_PTR;
    pgstat_report_waitevent(WAIT_EVENT_MPFL_READ);
    if (mpfl_pread_file(mpflFd, fileBuf, MPFL_FILE_SIZE, 0)) {
        maxFlushLsn = mpfl_get_max_lsn_from_buf(fileBuf);
    }
    pgstat_report_waitevent(WAIT_EVENT_END);
    close(mpflFd);

    /* A file holding only zero LSNs is reported as MAX_XLOG_REC_PTR, like an unreadable one. */
    if (maxFlushLsn == 0) {
        maxFlushLsn = MAX_XLOG_REC_PTR;
    }
    return maxFlushLsn;
}
/*
 * Remove MAX_PAGE_FLUSH_LSN_FILE.
 *
 * A failure other than ENOENT is reported as a WARNING; success, or a file
 * that was already absent, is reported at LOG level. The unlink() result is
 * a signed int and is logged with %d so that -1 is not rendered as a large
 * unsigned number.
 */
void mpfl_ulink_file()
{
    int ret = unlink(MAX_PAGE_FLUSH_LSN_FILE);

    if (ret < 0 && errno != ENOENT) {
        ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                          errmsg("[mpfl_ulink_file]: unlink %s failed! ret:%d", MAX_PAGE_FLUSH_LSN_FILE, ret)));
    } else {
        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                      errmsg("[mpfl_ulink_file]: unlink %s sucessfully! ret:%d", MAX_PAGE_FLUSH_LSN_FILE, ret)));
    }
}
bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset)
{
int32 curr_size, total_size;
total_size = 0;
do {
curr_size = pread64(fd, ((char *)buf + total_size), (size - total_size), offset);
if (curr_size == -1) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pread_file:Read file error, size:%d, offset:%ld, total_size:%d", size,
offset, total_size)));
return false;
}
total_size += curr_size;
offset += curr_size;
} while (curr_size > 0);
if (total_size != size) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pread_file: file size mismatch: expected %d, read %d, offset:%ld", size,
total_size, offset)));
return false;
}
return true;
}
void mpfl_pwrite_file(int fd, const void *buf, int size, int64 offset)
{
int write_size = 0;
uint32 try_times = 0;
const uint64 mpfl_try_times = 8;
const uint64 mpfl_sleep_us = 256L;
if ((offset + size) > MPFL_FILE_SIZE) {
ereport(PANIC, (errmodule(MOD_REDO), errmsg("mpfl_pwrite_file failed, MPFL_FILE_SIZE %d, "
"offset %ld, write_size %d",
MPFL_FILE_SIZE, offset, size)));
}
while (try_times < mpfl_try_times) {
write_size = pwrite64(fd, buf, size, offset);
if (write_size == 0) {
try_times++;
pg_usleep(mpfl_sleep_us);
continue;
} else if (write_size < 0) {
ereport(PANIC,
(errcode_for_file_access(), errmodule(MOD_REDO), errmsg("mpfl_pwrite_file: Write file error")));
} else {
break;
}
}
if (write_size != size) {
ereport(PANIC,
(errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pwrite_file: Write file size mismatch: expected %d, written %d", size, write_size)));
}
}
/*
 * Create a fresh, zero-filled MAX_PAGE_FLUSH_LSN_FILE.
 *
 * An existing file is removed first (with a WARNING). The shared buffer
 * predo_cxt.ali_buf is zeroed over MPFL_FILE_SIZE bytes and written out as
 * the file's contents. PANICs when the old file cannot be removed or the new
 * one cannot be created.
 */
void mpfl_new_file()
{
    errno_t errorno = EOK;
    char *file_head = g_instance.comm_cxt.predo_cxt.ali_buf;
    int fd = -1;

    if (file_exists(MAX_PAGE_FLUSH_LSN_FILE)) {
        /* The text must go through errmsg(); a bare string literal here is never attached to the report. */
        ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
                          errmsg("max_flush_lsn file exists, deleting it")));
        if (unlink(MAX_PAGE_FLUSH_LSN_FILE) != 0) {
            ereport(PANIC,
                    (errcode_for_file_access(), errmodule(MOD_REDO), errmsg("Could not remove max_flush_lsn file")));
        }
    }

    fd = open(MAX_PAGE_FLUSH_LSN_FILE, (MPFL_FILE_FLAG | O_CREAT), MPFL_FILE_PERM);
    if (fd == -1) {
        ereport(PANIC, (errcode_for_file_access(), errmodule(MOD_REDO),
                        errmsg("Could not create file \"%s\"", MAX_PAGE_FLUSH_LSN_FILE)));
    }

    Assert(file_head != NULL);
    errorno = memset_s(file_head, MPFL_FILE_SIZE, 0, MPFL_FILE_SIZE);
    securec_check(errorno, "", "");

    pgstat_report_waitevent(WAIT_EVENT_MPFL_INIT);
    mpfl_pwrite_file(fd, file_head, MPFL_FILE_SIZE, 0);
    pgstat_report_waitevent(WAIT_EVENT_END);
    close(fd);
}
/*
 * Record biggest_lsn as the max page flush LSN of the partition selected by
 * hashing thdId, both in memory and in MAX_PAGE_FLUSH_LSN_FILE.
 *
 * Without is_force the value is only ever raised: the call returns early when
 * the stored LSN is already >= biggest_lsn (checked once before and once
 * after taking the partition lock). With is_force the stored LSN is
 * overwritten unconditionally. The partition's page in predo_cxt.ali_buf is
 * updated and written to the file at the partition's offset while the lock is
 * held. PANICs when the file cannot be opened.
 */
void update_max_page_flush_lsn(XLogRecPtr biggest_lsn, ThreadId thdId, bool is_force)
{
    uint32 thdHash = hashquickany(0xFFFFFFFF, (unsigned char *)(&thdId), sizeof(ThreadId));
    uint32 partition = thdHash % NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS;

    XlogUpMinRecovPointSwitchChk(biggest_lsn);

    /* Cheap unlocked check first. */
    if (XLByteLE(biggest_lsn, pg_atomic_read_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[partition]))) &&
        !is_force) {
        return;
    }

    LWLock *partitionLock = &(t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstMPFLLock + (partition)].lock);
    LWLockAcquire(partitionLock, LW_EXCLUSIVE);

    /* Re-check under the lock unless the caller forces the update. */
    if (!is_force &&
        !XLByteLT(pg_atomic_read_u64(&g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[partition]), biggest_lsn)) {
        LWLockRelease(partitionLock);
        return;
    }
    pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[partition]), biggest_lsn);

    int mpflFd = open(MAX_PAGE_FLUSH_LSN_FILE, (MPFL_FILE_FLAG), MPFL_FILE_PERM);
    if (mpflFd == -1) {
        LWLockRelease(partitionLock);
        ereport(PANIC, (errcode_for_file_access(), errmodule(MOD_REDO),
                        errmsg("Could not open file \"%s\"", MAX_PAGE_FLUSH_LSN_FILE)));
    }

    int64 pageOffset = (int64)partition * (int64)BLCKSZ;
    char *pageBuf = g_instance.comm_cxt.predo_cxt.ali_buf + pageOffset;
    *(XLogRecPtr *)pageBuf = biggest_lsn;

    pgstat_report_waitevent(WAIT_EVENT_MPFL_WRITE);
    mpfl_pwrite_file(mpflFd, pageBuf, BLCKSZ, pageOffset);
    pgstat_report_waitevent(WAIT_EVENT_END);

    close(mpflFd);
    LWLockRelease(partitionLock);
}
/*
 * Set every partition's in-memory max page flush LSN to `lsn`, taking each
 * partition's MPFL lock exclusively in turn.
 */
void set_global_max_page_flush_lsn(XLogRecPtr lsn)
{
    for (uint32 partition = 0; partition < NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS; ++partition) {
        LWLock *partitionLock = &(t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstMPFLLock + (partition)].lock);

        LWLockAcquire(partitionLock, LW_EXCLUSIVE);
        pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[partition]), lsn);
        LWLockRelease(partitionLock);
    }
}
/*
 * Return the largest in-memory max page flush LSN across all partitions
 * (0 when every partition holds 0). Partitions are read atomically, without
 * taking the partition locks.
 */
XLogRecPtr get_global_max_page_flush_lsn()
{
    XLogRecPtr largest = 0;

    for (uint32 partition = 0; partition < NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS; ++partition) {
        XLogRecPtr partitionLsn = pg_atomic_read_u64(&g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[partition]);

        if (XLByteLT(largest, partitionLsn)) {
            largest = partitionLsn;
        }
    }

    return largest;
}
bool IsRecoveryDone()
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
return xlogctl->IsRecoveryDone;
}
/*
 * Once recovery is done, initialise the thread-local minRecoveryPoint from
 * the control file if it is still InvalidXLogRecPtr.
 */
void ExtremRtoUpdateMinCheckpoint()
{
    if (!t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone || t_thrd.xlog_cxt.minRecoveryPoint != InvalidXLogRecPtr) {
        return;
    }

    t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
}
#ifndef ENABLE_MULTIPLE_NODES
/*
 * Store ConsensusLsn and dcfIndex into XLogCtl->LogwrtPaxos (Consensus and
 * PaxosIndex) under info_lck.
 */
void DcfUpdateConsensusLsnAndIndex(XLogRecPtr ConsensusLsn, unsigned long long dcfIndex)
{
    XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);

    ctl->LogwrtPaxos.Consensus = ConsensusLsn;
    ctl->LogwrtPaxos.PaxosIndex = dcfIndex;

    SpinLockRelease(&ctl->info_lck);
}
#endif
/*
 * Store remainSegsStartPoint into XLogCtl->remain_segs_start_point under
 * info_lck.
 */
void SetRemainSegsStartPoint(XLogRecPtr remainSegsStartPoint)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    ctl->remain_segs_start_point = remainSegsStartPoint;
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Read XLogCtl->remain_segs_start_point under info_lck and return it.
 */
XLogRecPtr GetRemainSegsStartPoint(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    XLogRecPtr startPoint = ctl->remain_segs_start_point;
    SpinLockRelease(&ctl->info_lck);

    return startPoint;
}
/*
 * Raise XLogCtl->remain_segs_start_point to newStartPoint if the stored value
 * is smaller; the check and the store happen under info_lck.
 */
void UpdateRemainSegsStartPoint(XLogRecPtr newStartPoint)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    bool isBehind = XLByteLT(ctl->remain_segs_start_point, newStartPoint);
    if (isBehind) {
        ctl->remain_segs_start_point = newStartPoint;
    }
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Report whether remain segments need to be logged.
 *
 * Returns true immediately if XLogCtl->is_need_log_remain_segs is already
 * set. Otherwise the flag is recomputed under info_lck as
 * (remain_segs_start_point < curLsn) and the stored flag is returned.
 */
bool IsNeedLogRemainSegs(XLogRecPtr curLsn)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    /* Fast path: the flag is read without the lock. */
    if (ctl->is_need_log_remain_segs) {
        return true;
    }

    SpinLockAcquire(&ctl->info_lck);
    ctl->is_need_log_remain_segs = (XLByteLT(ctl->remain_segs_start_point, curLsn));
    SpinLockRelease(&ctl->info_lck);

    return ctl->is_need_log_remain_segs;
}
/*
 * Store remainCommitLsn into XLogCtl->remainCommitLsn under info_lck.
 */
void SetRemainCommitLsn(XLogRecPtr remainCommitLsn)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    ctl->remainCommitLsn = remainCommitLsn;
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Read XLogCtl->remainCommitLsn under info_lck and return it.
 */
XLogRecPtr GetRemainCommitLsn(void)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    XLogRecPtr commitLsn = ctl->remainCommitLsn;
    SpinLockRelease(&ctl->info_lck);

    return commitLsn;
}
/*
 * Raise XLogCtl->remainCommitLsn to newRemainCommitLsn if the stored value is
 * smaller; the check and the store happen under info_lck.
 */
void UpdateRemainCommitLsn(XLogRecPtr newRemainCommitLsn)
{
    volatile XLogCtlData *ctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&ctl->info_lck);
    bool isBehind = XLByteLT(ctl->remainCommitLsn, newRemainCommitLsn);
    if (isBehind) {
        ctl->remainCommitLsn = newRemainCommitLsn;
    }
    SpinLockRelease(&ctl->info_lck);
}
/*
 * Return floor(log2(num)) for num >= 1, i.e. the number of right shifts
 * needed to bring num down to 1.
 *
 * num == 0 returns 0. The previous recursive form never terminated for 0,
 * because 0 >> 1 never reaches 1.
 */
uint32 GetDigitalByLog2(uint32 num)
{
    uint32 shifts = 0;

    while (num > 1) {
        num >>= 1;
        shifts++;
    }

    return shifts;
}
void InitXlogStatuEntryTblSize()
{
uint32 conCfgNum = 0;
uint32 threadCfgNum = 0;
int maxConn = g_instance.attr.attr_network.MaxConnections;
conCfgNum = (maxConn <= 0) ? 0 : GetDigitalByLog2(maxConn) + 1;
if (g_instance.attr.attr_common.enable_thread_pool) {
char *attr = TrimStr(g_instance.attr.attr_common.thread_pool_attr);
char *ptoken = NULL;
char *psave = NULL;
const char *pdelimiter = ",";
if ((attr) == NULL || (attr)[0] == '\0') {
threadCfgNum = 0;
} else {
ptoken = TrimStr(strtok_r(attr, pdelimiter, &psave));
if ((ptoken) == NULL || (ptoken)[0] == '\0') {
threadCfgNum = 0;
} else {
int32 threadNum = pg_strtoint32(ptoken);
threadCfgNum = (threadNum <= 0) ? 0 : GetDigitalByLog2(threadNum) + 1;
}
}
} else {
threadCfgNum = 0;
}
g_instance.attr.attr_storage.wal_insert_status_entries_power =
((conCfgNum >= threadCfgNum) ? conCfgNum : threadCfgNum) + WAL_INSERT_STATUS_TBL_EXPAND_FACTOR;
if (g_instance.attr.attr_storage.wal_insert_status_entries_power < MIN_WAL_INSERT_STATUS_ENTRY_POW) {
g_instance.attr.attr_storage.wal_insert_status_entries_power = MIN_WAL_INSERT_STATUS_ENTRY_POW;
}
}
/*
 * Fill *ctlInfo with the initial share-storage xlog control values.
 *
 * Identification fields come from the SHARE_STORAGE_* constants and
 * sysidentifier, insertHead and insertTail both start at XLogSegSize,
 * xlogFileSize is taken from the xlog_file_size setting, term starts at 1 and
 * the pad fields are zeroed. The crc field is not written here.
 */
void InitShareStorageCtlInfo(ShareStorageXLogCtl *ctlInfo, uint64 sysidentifier)
{
    /* identification */
    ctlInfo->magic = SHARE_STORAGE_CTL_MAGIC;
    ctlInfo->version = CURRENT_SHARE_STORAGE_CTL_VERSION;
    ctlInfo->checkNumber = SHARE_STORAGE_CTL_CHCK_NUMBER;
    ctlInfo->length = SizeOfShareStorageXLogCtl;
    ctlInfo->systemIdentifier = sysidentifier;

    /* xlog layout */
    ctlInfo->xlogFileSize = g_instance.attr.attr_storage.xlog_file_size;
    ctlInfo->insertHead = XLogSegSize;
    ctlInfo->insertTail = XLogSegSize;
    ctlInfo->term = 1;

    /* padding */
    ctlInfo->pad1 = 0;
    ctlInfo->pad2 = 0;
    ctlInfo->pad3 = 0;
    ctlInfo->pad4 = 0;
}
/*
 * Write *ctlInfo through the share-storage operation interface's WriteCtlInfo
 * hook. The interface is asserted to be initialized.
 */
void UpdateShareStorageCtlInfo(const ShareStorageXLogCtl *ctlInfo)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
g_instance.xlog_cxt.shareStorageopCtl.opereateIf->WriteCtlInfo(ctlInfo);
}
/*
 * Fill *ctlInfo through the share-storage operation interface's ReadCtlInfo
 * hook. The interface is asserted to be initialized.
 */
void ReadShareStorageCtlInfo(ShareStorageXLogCtl *ctlInfo)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
g_instance.xlog_cxt.shareStorageopCtl.opereateIf->ReadCtlInfo(ctlInfo);
}
/*
 * ReadXlogFromShareStorage
 *      Forward a read request to the readXlog operation of the share-storage
 *      operation interface.
 *
 * startLsn: position to read from.
 * buf: destination buffer.
 * expectReadLen: number of bytes requested.
 *
 * Returns whatever the readXlog operation returns.  The operation control
 * must already be initialized (checked by assertion only).
 */
int ReadXlogFromShareStorage(XLogRecPtr startLsn, char *buf, int expectReadLen)
{
    Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit &&
           (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));

    int readResult = g_instance.xlog_cxt.shareStorageopCtl.opereateIf->readXlog(startLsn, buf, expectReadLen);
    return readResult;
}
/*
 * WriteXlogToShareStorage
 *      Forward a write request to the WriteXlog operation of the share-storage
 *      operation interface.
 *
 * startLsn: position to write at.
 * buf: source buffer.
 * writeLen: number of bytes to write.
 *
 * Returns whatever the WriteXlog operation returns.  The operation control
 * must already be initialized (checked by assertion only).
 */
int WriteXlogToShareStorage(XLogRecPtr startLsn, char *buf, int writeLen)
{
    Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit &&
           (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));

    int writeResult = g_instance.xlog_cxt.shareStorageopCtl.opereateIf->WriteXlog(startLsn, buf, writeLen);
    return writeResult;
}
/*
 * CalShareStorageCtlInfoCrc
 *      Compute the CRC32C of a share-storage control structure.
 *
 * The checksum covers the bytes of *ctlInfo from its start up to, but not
 * including, its crc member.  In shared storage mode the operation control is
 * asserted to be initialized.
 *
 * Returns the finalized CRC value; ctlInfo itself is not modified.
 */
pg_crc32c CalShareStorageCtlInfoCrc(const ShareStorageXLogCtl *ctlInfo)
{
    pg_crc32c ctlCrc;

    if (IS_SHARED_STORAGE_MODE) {
        Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit);
    }

    INIT_CRC32C(ctlCrc);
    COMP_CRC32C(ctlCrc, (char*)ctlInfo, offsetof(ShareStorageXLogCtl, crc));
    FIN_CRC32C(ctlCrc);

    return ctlCrc;
}
/*
 * UpdatePostgresqlFile
 *      Rewrite one option line of postgresql.conf in the data directory.
 *
 * optName: name of the option passed to modify_guc_one_line().
 * gucLine: replacement line passed to modify_guc_one_line().
 *
 * Sequence, all under the postgresql.conf.lock file lock:
 *   1. copy postgresql.conf to the backup file (CONFIG_BAK_FILENAME_WAL),
 *   2. read the backup, modify the requested line and write the backup back,
 *   3. rename the backup over postgresql.conf,
 *   4. if the resulting postgresql.conf is non-empty, copy it to the backup
 *      name again.
 *
 * Failures are reported with FATAL (PANIC for the final stat); the file lock
 * is released before reporting once it has been taken.
 */
void UpdatePostgresqlFile(const char *optName, const char *gucLine)
{
    char guc[MAXPGPATH];
    char gucBak[MAXPGPATH];
    char gucLock[MAXPGPATH];
    errno_t ret = snprintf_s(guc, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf", t_thrd.proc_cxt.DataDir);
    securec_check_ss(ret, "\0", "\0");
    ret = snprintf_s(gucBak, MAXPGPATH, MAXPGPATH - 1, "%s/%s", t_thrd.proc_cxt.DataDir, CONFIG_BAK_FILENAME_WAL);
    securec_check_ss(ret, "\0", "\0");

    /* A missing file (ENOENT) is tolerated here; any other stat failure is fatal. */
    struct stat statbuf;
    if (lstat(guc, &statbuf) != 0) {
        if (errno != ENOENT) {
            ereport(FATAL, (errmsg("UpdatePostgresqlFile could not stat file \"%s\": %s", guc, TRANSLATE_ERRNO)));
        }
    }

    ConfFileLock filelock = { NULL, 0 };
    ret = snprintf_s(gucLock, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf.lock", t_thrd.proc_cxt.DataDir);
    securec_check_ss(ret, "\0", "\0");
    if (get_file_lock(gucLock, &filelock) != CODE_OK) {
        ereport(FATAL, (errmsg("UpdatePostgresqlFile:Modify postgresql.conf failed : can not get the file lock ")));
    }

    /* Work on a copy so the live file is only replaced by the final rename. */
    copy_file_internal(guc, gucBak, true);
    char** opt_lines = read_guc_file(gucBak);
    if (opt_lines == NULL) {
        release_file_lock(&filelock);
        ereport(FATAL, (errmsg("UpdatePostgresqlFile:the config file has no data,please check it.")));
    }

    modify_guc_one_line(&opt_lines, optName, gucLine);
    ret = write_guc_file(gucBak, opt_lines);
    release_opt_lines(opt_lines);
    opt_lines = NULL;
    if (ret != CODE_OK) {
        release_file_lock(&filelock);
        ereport(FATAL, (errmsg("UpdatePostgresqlFile:the config file %s updates failed.", gucBak)));
    }

    if (rename(gucBak, guc) != 0) {
        release_file_lock(&filelock);
        ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %s",
            gucBak, guc, TRANSLATE_ERRNO)));
    }

    /*
     * Re-stat the renamed file.  statbuf is only meaningful when lstat()
     * succeeded, so remember the outcome and do not look at st_size after a
     * failed call (previously a stale or uninitialized size could be read
     * when the call failed with ENOENT).
     */
    bool statSucceeded = (lstat(guc, &statbuf) == 0);
    if (!statSucceeded && errno != ENOENT) {
        release_file_lock(&filelock);
        ereport(PANIC, (errmsg("UpdatePostgresqlFile2:could not stat file \"%s\": %s",
            guc, TRANSLATE_ERRNO)));
    }

    /* Refresh the backup copy from the new, non-empty configuration file. */
    if (statSucceeded && statbuf.st_size > 0) {
        copy_file_internal(guc, gucBak, true);
    }
    release_file_lock(&filelock);
}
void NormalClusterDoradoStorageInit()
{
bool found = false;
void *tmpBuf = ShmemInitStruct("share storage Ctl", CalShareStorageCtlSize(), &found);
g_instance.xlog_cxt.shareStorageXLogCtl = (ShareStorageXLogCtl *)TYPEALIGN(MEMORY_ALIGNED_SIZE, tmpBuf);
g_instance.xlog_cxt.shareStorageXLogCtlOrigin = tmpBuf;
InitDoradoStorage(g_instance.attr.attr_storage.xlog_file_path,
(uint64)g_instance.attr.attr_storage.xlog_file_size);
if (!IsInitdb && g_instance.attr.attr_storage.xlog_lock_file_path != NULL) {
g_instance.xlog_cxt.shareStorageLockFd = BasicOpenFile(g_instance.attr.attr_storage.xlog_lock_file_path,
O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (g_instance.xlog_cxt.shareStorageLockFd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %s",
g_instance.attr.attr_storage.xlog_lock_file_path, TRANSLATE_ERRNO)));
}
if (ENABLE_DMS) {
g_instance.dms_cxt.SSRecoveryInfo.dorado_sharestorage_inited = true;
}
}
if (((uint64)g_instance.attr.attr_storage.xlog_file_size !=
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize) && (IS_SHARED_STORAGE_STANDBY_CLUSTER)) {
char buf[MAX_XLOG_FILE_SIZE_BUFFER];
errno_t errorno = snprintf_s(buf, sizeof(buf), sizeof(buf) - 1, "%lu",
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize);
securec_check_ss(errorno, "", "");
SetConfigOption("xlog_file_size", buf, PGC_POSTMASTER, PGC_S_ARGV);
char option[MAX_XLOG_FILE_SIZE_BUFFER];
errorno = snprintf_s(option, sizeof(option), sizeof(option) - 1, "xlog_file_size=%lu\n",
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize);
securec_check_ss(errorno, "", "");
UpdatePostgresqlFile("xlog_file_size", option);
}
}
/*
 * ShareStorageInit
 *      Dispatch share-storage initialization: the SS Dorado cluster variant
 *      takes precedence, otherwise the normal-cluster variant runs when
 *      shared storage mode is enabled.  Does nothing in any other case.
 */
void ShareStorageInit()
{
    if (SS_DORADO_CLUSTER) {
        SSClusterDoradoStorageInit();
        return;
    }

    if (IS_SHARED_STORAGE_MODE) {
        NormalClusterDoradoStorageInit();
    }
}
/*
 * ShareStorageSetBuildErrorAndExit
 *      Record why streaming from share storage stops, then raise ERROR.
 *
 * reason: rebuild reason; recorded through ha_set_rebuild_connerror() when
 *      setRcvDone is false, and always included in the error message.
 * setRcvDone: when true, and need_disable_connection_node is non-zero, the
 *      walreceiver's rcvDoneFromShareStorage flag is set instead of recording
 *      a rebuild reason.
 *
 * Always ends by reporting ERROR with ERRCODE_INVALID_STATUS.
 */
void ShareStorageSetBuildErrorAndExit(HaRebuildReason reason, bool setRcvDone)
{
    if (setRcvDone) {
        uint32 disableConnectionNode =
            pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
        if (disableConnectionNode != 0) {
            pg_atomic_write_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage, true);
        }
    } else {
        ha_set_rebuild_connerror(reason, REPL_INFO_ERROR);
    }

    ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS),
        errmsg("ShareStorageSetBuildErrorAndExit, exit from walreceiver! build reason: %s, RcvDone:%d",
        wal_get_rebuild_reason_string(reason), (int)setRcvDone)));
}
/*
 * FindLastRecordCheckInfoOnShareStorage
 *      Locate the last complete xlog record on share storage that ends at or
 *      before the control info's insertHead.
 *
 * lastRecordPtr: out; start position of the last record found.  It is only
 *      written when a record is found and is tested with XLogRecPtrIsInvalid()
 *      afterwards, so the caller presumably passes it in as an invalid
 *      pointer -- TODO confirm against callers.
 * lastRecordCrc: out; xl_crc of that record.
 * lastRecordLen: out; EndRecPtr - ReadRecPtr of that record.
 *
 * The search starts at the page containing insertHead and steps back one
 * XLOG_BLCKSZ page at a time until XLogFindNextRecord() yields a record that
 * ends at or before insertHead, not going below endPtr (XLogSegSize, or
 * insertHead minus the share-storage xlog file size when that is larger than
 * zero).  From there records are read forward until reading fails or a
 * record would end past insertHead.
 *
 * Reports ERROR when no reader can be allocated, when no valid starting
 * record exists, or when no record was found.  The control-info buffer is
 * released before every ERROR report, since ereport(ERROR) is not expected to
 * return and the buffer would otherwise never be handed back to
 * AlignFreeShareStorageCtl().
 */
void FindLastRecordCheckInfoOnShareStorage(XLogRecPtr *lastRecordPtr, pg_crc32 *lastRecordCrc, int *lastRecordLen)
{
    ShareStorageXLogCtl *ctlInfo = AlignAllocShareStorageCtl();
    ReadShareStorageCtlInfo(ctlInfo);

    /* Start from the beginning of the page that holds insertHead. */
    XLogRecPtr startLsn = ctlInfo->insertHead - (ctlInfo->insertHead % XLOG_BLCKSZ);
    XLogRecPtr endPtr = XLogSegSize;
    if (ctlInfo->insertHead > g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize) {
        endPtr = ctlInfo->insertHead - g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize;
    }

    XLogReaderState *xlogReader = XLogReaderAllocate(&SharedStorageXLogPageRead, NULL,
        g_instance.xlog_cxt.shareStorageopCtl.blkSize);
    if (xlogReader == NULL) {
        /* Release the control buffer first, as the other failure paths do. */
        AlignFreeShareStorageCtl(ctlInfo);
        ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
            errdetail("FindLastRecordCheckInfoOnShareStorage Failed while allocating an XLog reader")));
        return;
    }

    /* Walk backwards page by page until a usable record start is found. */
    bool findValidXLogFile = false;
    XLogRecPtr curLsn = InvalidXLogRecPtr;
    while (XLByteLE(endPtr, startLsn)) {
        XLogRecPtr curEnd;
        curLsn = XLogFindNextRecord(xlogReader, startLsn, &curEnd);
        if (XLogRecPtrIsInvalid(curLsn) || XLByteLT(ctlInfo->insertHead, curEnd)) {
            startLsn = startLsn - XLOG_BLCKSZ;
            continue;
        } else {
            findValidXLogFile = true;
            break;
        }
    }

    if (!findValidXLogFile) {
        if (xlogReader != NULL) {
            XLogReaderFree(xlogReader);
            xlogReader = NULL;
        }
        AlignFreeShareStorageCtl(ctlInfo);
        ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), errmsg("could not find valid xlog on share storage")));
        return;
    }

    /*
     * Read forward from the record found above.  The first call passes the
     * explicit position; later calls pass InvalidXLogRecPtr.  The outputs
     * always describe the most recent record that fits below insertHead.
     */
    for (;;) {
        char *errorMsg = NULL;
        XLogRecord *record = XLogReadRecord(xlogReader, curLsn, &errorMsg);
        if (record == NULL || XLByteLT(ctlInfo->insertHead, xlogReader->EndRecPtr)) {
            break;
        }
        curLsn = InvalidXLogRecPtr;
        *lastRecordCrc = record->xl_crc;
        *lastRecordLen = (int)(xlogReader->EndRecPtr - xlogReader->ReadRecPtr);
        *lastRecordPtr = xlogReader->ReadRecPtr;
    }

    if (xlogReader != NULL) {
        XLogReaderFree(xlogReader);
        xlogReader = NULL;
    }

    /*
     * ctlInfo is not needed past this point; free it before the final report
     * so the ERROR branch below does not skip the release.
     */
    AlignFreeShareStorageCtl(ctlInfo);
    ctlInfo = NULL;

    if (XLogRecPtrIsInvalid(*lastRecordPtr)) {
        ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), errmsg("could not find max xlog on share storage")));
    } else {
        uint32 shiftSize = 32;
        ereport(LOG, (errcode(ERRCODE_INVALID_STATUS),
            errmsg("find max xlog on share storage %X/%X", static_cast<uint32>(*lastRecordPtr >> shiftSize),
            static_cast<uint32>(*lastRecordPtr))));
    }
}
/*
 * SimpleValidatePage
 *      Validate the header of an xlog page and report its xlp_total_len.
 *
 * xlogreader: reader state handed to ValidXLogPageHeader(); its errormsg_buf
 *      is logged when validation fails.
 * targetPagePtr: position the page is expected to belong to.
 * page: buffer holding the page; interpreted as an XLogPageHeader.
 *
 * Returns the header's xlp_total_len when the header is valid, 0 otherwise.
 */
Size SimpleValidatePage(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, char *page)
{
    XLogPageHeader pageHeader = (XLogPageHeader)page;

    if (ValidXLogPageHeader(xlogreader, targetPagePtr, pageHeader)) {
        return pageHeader->xlp_total_len;
    }

    elog(LOG, "SimpleValidatePage:%s", xlogreader->errormsg_buf);
    return 0;
}
/*
 * SharedStorageXLogPageRead
 *      Page-read callback that fetches xlog directly from share storage.
 *
 * Reads max(XLOG_BLCKSZ, reqLen) bytes starting at targetPagePtr into readBuf
 * and returns the result of ReadXlogFromShareStorage().  xlogreader,
 * targetRecPtr, readTLI and xlog_path are not used here.
 */
int SharedStorageXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
    XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
    return ReadXlogFromShareStorage(targetPagePtr, readBuf, Max(XLOG_BLCKSZ, reqLen));
}
/*
 * GetWritePermissionSharedStorage
 *      Acquire the right to write to share storage, retrying up to 20 times
 *      with a one-second pause after each failed attempt.
 *
 * No-op outside shared storage mode and during initdb.
 *
 * Without a configured lock file path, the external command
 * "gs_persist <xlog_file_path> <application_name> -R" is run through
 * system() until it returns 0.  With a lock file path, LockNasWriteFile() is
 * called on the already opened lock file descriptor until it succeeds.
 * Either way, exhausting the retries is reported as FATAL and success is
 * logged.
 *
 * NOTE(review): the command line is assembled from configuration strings and
 * executed by a shell; confirm those values cannot carry shell metacharacters.
 */
void GetWritePermissionSharedStorage()
{
    if (!IS_SHARED_STORAGE_MODE || IsInitdb) {
        return;
    }

    const int maxRetryTimes = 20;

    if (g_instance.attr.attr_storage.xlog_lock_file_path != NULL) {
        /* Lock-file variant. */
        bool locked = false;
        for (int attempt = 0; attempt < maxRetryTimes; ++attempt) {
            locked = LockNasWriteFile(g_instance.xlog_cxt.shareStorageLockFd);
            if (locked) {
                break;
            }
            (void)sleep(1);
        }

        if (!locked) {
            ereport(FATAL, (errmsg("could not lock lock file")));
        }
        ereport(LOG, (errmsg("lock lock file(%d) %s", g_instance.xlog_cxt.shareStorageLockFd,
            g_instance.attr.attr_storage.xlog_lock_file_path)));
        return;
    }

    /* Key-reservation variant driven by the gs_persist tool. */
    char cmd[MAX_PATH_LEN] = {0};
    char *appName = u_sess->attr.attr_common.application_name;
    char *shareStoragePath = g_instance.attr.attr_storage.xlog_file_path;
    errno_t rc = snprintf_s(cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "gs_persist %s %s -R", shareStoragePath, appName);
    securec_check_ss_c(rc, "\0", "\0");

    for (int attempt = 0; attempt < maxRetryTimes; ++attempt) {
        rc = system(cmd);
        if (rc == 0) {
            break;
        }
        (void)sleep(1);
    }

    if (rc != 0) {
        ereport(FATAL, (errmsg("could not reserve the key of %s on %s", appName, shareStoragePath)));
    }
    ereport(LOG, (errmsg("reserve the key of %s on shared storage: %s", appName, shareStoragePath)));
}
/*
 * GetFlushMainStandby
 *      Return the larger of the walreceiver's receivedUpto and the xlog
 *      control block's lastReplayedEndRecPtr.
 *
 * Each value is sampled under the spinlock of the structure it lives in
 * (walrcv->mutex and xlogctl->info_lck respectively).
 */
XLogRecPtr GetFlushMainStandby()
{
    volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
    volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;

    SpinLockAcquire(&walrcv->mutex);
    XLogRecPtr receivedPos = walrcv->receivedUpto;
    SpinLockRelease(&walrcv->mutex);

    SpinLockAcquire(&xlogctl->info_lck);
    XLogRecPtr replayedPos = xlogctl->lastReplayedEndRecPtr;
    SpinLockRelease(&xlogctl->info_lck);

    if (XLByteLT(receivedPos, replayedPos)) {
        return replayedPos;
    }
    return receivedPos;
}
/*
 * SSModifySharedLunAllowed
 *      Tell whether this node may modify the shared LUN.
 *
 * Returns true when DMS is disabled, when the node is in SS primary mode,
 * when the SS cluster state is PRIMARY_DEMOTING, STANDBY_PROMOTING or
 * STANDBY_PROMOTED, or when an SS standby failover is in progress; false in
 * every other case.
 */
bool SSModifySharedLunAllowed()
{
    return (!ENABLE_DMS ||
            SS_PRIMARY_MODE ||
            g_instance.dms_cxt.SSClusterState == NODESTATE_PRIMARY_DEMOTING ||
            g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING ||
            g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTED ||
            SS_STANDBY_FAILOVER);
}
/*
 * SSReadXLog
 *      Page-read callback: read the XLOG_BLCKSZ page at targetPagePtr from the
 *      segment file tracked in t_thrd.xlog_cxt into readBuf.
 *
 * xlogreader: reader state; private_data is interpreted as an
 *      XLogPageReadPrivate, and preReadBuf selects between a plain pread()
 *      and SSReadXlogInternal().
 * targetPagePtr: start of the page to read.
 * expectReadLen: number of bytes the caller needs; used for the position
 *      reported on failure and in an assertion.
 * targetRecPtr: passed through to SSReadXlogInternal().
 * readBuf: destination buffer.
 * readTLI: out; set to t_thrd.xlog_cxt.curFileTLI on success.
 * xlog_path: not used by this function.
 *
 * In extreme redo mode the error level is forced to LOG and randAccess is
 * treated as false; otherwise both come from the reader's private data.
 *
 * Returns the number of bytes read (XLOG_BLCKSZ), or -1 when no segment file
 * could be opened or the read came up short.  After a short read the file is
 * closed, the current read source is added to failedSources and
 * recoveryTriggered is set.
 */
static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int expectReadLen,
    XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
    XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;
    int emode = IsExtremeRedo() ? LOG : readprivate->emode;
    bool randAccess = IsExtremeRedo() ? false : readprivate->randAccess;
    XLogRecPtr RecPtr = targetPagePtr;
    uint32 targetPageOff;
    uint32 actualBytes;

#ifdef USE_ASSERT_CHECKING
    XLogSegNo targetSegNo;

    XLByteToSeg(targetPagePtr, targetSegNo);
#endif
    targetPageOff = targetPagePtr % XLogSegSize;

    /*
     * See if we need to switch to a new segment because the requested record
     * is not in the currently open one.
     */
    if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
        close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
        t_thrd.xlog_cxt.readSource = 0;
    }

    XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
    XLByteAdvance(RecPtr, expectReadLen);

    /* Open the segment if none is open, from pg_xlog and optionally the archive. */
    if (t_thrd.xlog_cxt.readFile < 0) {
        uint32 sources;

        if (randAccess) {
            t_thrd.xlog_cxt.curFileTLI = 0;
        }

        sources = XLOG_FROM_PG_XLOG;
        if (t_thrd.xlog_cxt.InArchiveRecovery) {
            sources |= XLOG_FROM_ARCHIVE;
        }

        t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
        if (t_thrd.xlog_cxt.readFile < 0) {
            return -1;
        }
    }

    /*
     * At this point, we have the right segment open and if we're streaming we
     * know the requested record is in it.
     */
    Assert(t_thrd.xlog_cxt.readFile != -1);

    /* Always read a whole page, regardless of how much was requested. */
    t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
    t_thrd.xlog_cxt.readOff = targetPageOff;

    if (xlogreader->preReadBuf == NULL) {
        actualBytes = (uint32)pread(t_thrd.xlog_cxt.readFile, readBuf, t_thrd.xlog_cxt.readLen,
                                    t_thrd.xlog_cxt.readOff);
    } else {
        actualBytes = (uint32)SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, readBuf,
                                                 t_thrd.xlog_cxt.readLen, t_thrd.xlog_cxt.readFile);
    }

    /* Anything other than a full page, including a failed read, is an error. */
    if (actualBytes != t_thrd.xlog_cxt.readLen) {
        ereport(LOG, (errcode_for_file_access(), errmsg("read xlog(start:%X/%X, pos:%u len:%d) failed : %s",
                                                        static_cast<uint32>(targetPagePtr >> BIT_NUM_INT32),
                                                        static_cast<uint32>(targetPagePtr), targetPageOff,
                                                        expectReadLen, TRANSLATE_ERRNO)));
        ereport(emode_for_corrupt_record(emode, RecPtr),
                (errcode_for_file_access(),
                 errmsg("could not read from log file %s to offset %u: %s",
                        XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
                        t_thrd.xlog_cxt.readOff, TRANSLATE_ERRNO)));
        goto next_record_is_invalid;
    }

    Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
    Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
    Assert((uint32)expectReadLen <= t_thrd.xlog_cxt.readLen);

    *readTLI = t_thrd.xlog_cxt.curFileTLI;

    return (int)t_thrd.xlog_cxt.readLen;

next_record_is_invalid:
    /* Remember which source failed and drop all per-file read state. */
    t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;

    if (t_thrd.xlog_cxt.readFile >= 0) {
        close(t_thrd.xlog_cxt.readFile);
    }
    t_thrd.xlog_cxt.readFile = -1;
    t_thrd.xlog_cxt.readLen = 0;
    t_thrd.xlog_cxt.readSource = 0;
    t_thrd.xlog_cxt.recoveryTriggered = true;

    return -1;
}
/*
 * SSDoradoReadXLog
 *      Page-read callback used with SS Dorado replication: make the
 *      XLOG_BLCKSZ page at targetPagePtr available in readBuf, waiting for
 *      more xlog to show up on the shared storage when necessary.
 *
 * xlogreader: reader state; private_data is interpreted as an
 *      XLogPageReadPrivate, and preReadBuf selects between a plain pread()
 *      and SSReadXlogInternal().
 * targetPagePtr: start of the page to read.
 * reqLen: number of bytes the caller needs from that page.
 * targetRecPtr: record being read; used as the streaming start position and
 *      passed through to SSReadXlogInternal().
 * readBuf: destination buffer.
 * readTLI: out; set to t_thrd.xlog_cxt.curFileTLI on success.
 * xlog_path: not used by this function.
 *
 * The receive horizon (t_thrd.xlog_cxt.receivedUpto) is taken from
 * g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead.  While the startup
 * process runs in standby mode, the function loops until either the
 * walreceiver has made enough data visible, a segment file could be opened
 * from archive/pg_xlog, or a failover trigger is seen.
 *
 * Returns the number of bytes read (XLOG_BLCKSZ) on success and -1 on
 * failure.  A short read adds the current read source to failedSources; a
 * failover trigger sets recoveryTriggered.  In both failure cases the open
 * segment file is closed and the per-file read state is reset.
 */
static int SSDoradoReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
    XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
    XLogPageReadPrivate *readprivate = (XLogPageReadPrivate*)xlogreader->private_data;
    int emode = readprivate->emode;
    XLogRecPtr RecPtr = targetPagePtr;
    uint32 targetPageOff;
    bool processtrxn = false;
    bool randAccess = readprivate->randAccess;
    XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
    XLogSegNo replayedSegNo;
    uint32 sources;
    uint32 actualBytes;

#ifdef USE_ASSERT_CHECKING
    XLogSegNo targetSegNo;

    XLByteToSeg(targetPagePtr, targetSegNo);
#endif
    targetPageOff = targetPagePtr % XLogSegSize;

    /* The requested page lives in a different segment than the open file. */
    if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
        /*
         * Request a restartpoint if we've replayed too much xlog since the
         * last one.
         */
        if (t_thrd.xlog_cxt.StandbyModeRequested &&
            (t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) && !dummyStandbyMode) {
            if (get_real_recovery_parallelism() > 1) {
                XLByteToSeg(GetXLogReplayRecPtr(NULL), replayedSegNo);
            } else {
                replayedSegNo = t_thrd.xlog_cxt.readSegNo;
            }
            if (XLogCheckpointNeeded(replayedSegNo)) {
                /* Re-check after GetRedoRecPtr() before actually requesting. */
                (void)GetRedoRecPtr();
                if (XLogCheckpointNeeded(replayedSegNo)) {
                    RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
                }
            }
        }

        close(t_thrd.xlog_cxt.readFile);
        t_thrd.xlog_cxt.readFile = -1;
        t_thrd.xlog_cxt.readSource = 0;
    }

    t_thrd.xlog_cxt.readOff = targetPageOff;
    t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
    t_thrd.xlog_cxt.receivedUpto = g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead;

    XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
    XLByteAdvance(RecPtr, reqLen);

    /*
     * When the requested end falls exactly on a segment or page boundary,
     * also require the following page header to be available.
     */
    XLogRecPtr expectedRecPtr = RecPtr;
    if (RecPtr % XLogSegSize == 0) {
        XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
    } else if (RecPtr % XLOG_BLCKSZ == 0) {
        XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
    }

retry:
    /* See if we need to retrieve more data */
    if (t_thrd.xlog_cxt.readFile < 0 ||
        (t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM && XLByteLT(t_thrd.xlog_cxt.receivedUpto, RecPtr))) {
        if (t_thrd.xlog_cxt.StandbyMode && t_thrd.xlog_cxt.startup_processing) {
            for (;;) {
                /*
                 * Need to check here also for the case where consistency level is
                 * already reached without replaying any record i.e. just after reading
                 * of checkpoint data it has reached to minRecoveryPoint. Also
                 * whenever we are going to loop in the data receive from master
                 * node it's better we check if consistency level has reached. So
                 * instead of keeping in all places before ReadRecord, we can keep
                 * here in centralised location.
                 */
                ProcTxnWorkLoad(false);

                CheckRecoveryConsistency();
                if (WalRcvInProgress()) {
                    bool havedata = false;

                    /*
                     * Walreceiver is active, so see if new data has arrived.
                     *
                     * We only advance XLogReceiptTime when we obtain fresh
                     * WAL from walreceiver and observe that we had already
                     * processed everything before the most recent "chunk"
                     * that it flushed to disk.  In steady state where we are
                     * keeping up with the incoming data, XLogReceiptTime will
                     * be updated on each cycle.  When we are behind,
                     * XLogReceiptTime will not advance, so the grace time
                     * allotted to conflicting queries will decrease.
                     */
                    if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
                        havedata = true;
                    } else {
                        havedata = false;
                        XLogRecPtr latestChunkStart;

                        /* Refresh the control info and re-evaluate the horizon. */
                        ReadSSDoradoCtlInfoFile();
                        t_thrd.xlog_cxt.receivedUpto = g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead;
                        latestChunkStart = g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead;

                        if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
                            havedata = true;
                            if (!XLByteLT(RecPtr, latestChunkStart)) {
                                t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
                                SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
                            }
                        } else {
                            havedata = false;
                        }
                    }

                    if (havedata) {
                        /* Enough data is visible: mark the source and open the file if needed. */
                        t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
                        t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
                        if (t_thrd.xlog_cxt.readFile < 0) {
                            t_thrd.xlog_cxt.readFile = XLogFileRead(t_thrd.xlog_cxt.readSegNo, PANIC,
                                                                    t_thrd.xlog_cxt.recoveryTargetTLI, XLOG_FROM_STREAM,
                                                                    false);
                            Assert(t_thrd.xlog_cxt.readFile >=0);
                        } else {
                            t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
                            t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
                        }

                        break;
                    }

                    t_thrd.xlog_cxt.RedoDone = IsRedoDonePromoting();
                    pg_memory_barrier();

                    /* Main standby with a finished walreceiver and a failover request: stop here. */
                    if (SS_DORADO_MAIN_STANDBY_NODE && WalRcvIsDone() && CheckForFailoverTrigger()) {
                        ProcTxnWorkLoad(true);
                        ereport(LOG, (errmsg("RecPtr(%X/%X), receivedUpto(%X/%X).", (uint32)(RecPtr >> 32),
                                             (uint32)RecPtr, (uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
                                             (uint32)t_thrd.xlog_cxt.receivedUpto)));
                        ShutdownWalRcv();
                        ShutdownDataRcv();
                        goto triggered;
                    }

                    uint32 disableConnectionNode =
                        pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
                    if (disableConnectionNode && WalRcvIsRunning()) {
                        ereport(LOG, (errmsg("request xlog receivedupto at %X/%X.",
                                             (uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
                                             (uint32)t_thrd.xlog_cxt.receivedUpto)));
                        ShutdownWalRcv();
                    }

                    /* Give the transaction workload one pass before sleeping. */
                    if (!processtrxn) {
                        ProcTxnWorkLoad(true);
                        processtrxn = true;
                        goto retry;
                    }

                    /*
                     * Wait for more WAL to arrive, or timeout to be reached
                     */
                    WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
                    processtrxn = false;
                    ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
                } else {
                    /* Walreceiver is not in progress. */
                    if (t_thrd.xlog_cxt.readFile >= 0) {
                        close(t_thrd.xlog_cxt.readFile);
                        t_thrd.xlog_cxt.readFile = -1;
                    }

                    if (randAccess) {
                        t_thrd.xlog_cxt.curFileTLI = 0;
                    }

                    if (SS_DORADO_MAIN_STANDBY_NODE && CheckForFailoverTrigger()) {
                        goto triggered;
                    }

                    if (t_thrd.startup_cxt.shutdown_requested) {
                        ereport(LOG, (errmsg("startup shutdown")));
                        proc_exit(0);
                    }

                    /*
                     * Try to restore the file from archive, or read an
                     * existing file from pg_xlog.
                     */
                    sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;

                    /*
                     * Before we sleep, re-scan for possible new timelines
                     * if we were requested to recover to the latest
                     * timeline.
                     */
                    if (t_thrd.xlog_cxt.recoveryTargetIsLatest) {
                        if (rescanLatestTimeLine()) {
                            continue;
                        }
                    }

                    /* First time local recovery is seen as done: record and log it. */
                    if (!xlogctl->IsRecoveryDone) {
                        g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
                        g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
                        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
                                      errmsg("XLogPageRead IsRecoveryDone is set true, "
                                             "ReadRecPtr:%X/%X, EndRecPtr:%X/%X, "
                                             "receivedUpto:%X/%X, CtlInfo_insertHead:%X/%X.",
                                             (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
                                             (uint32)(t_thrd.xlog_cxt.ReadRecPtr),
                                             (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
                                             (uint32)(t_thrd.xlog_cxt.EndRecPtr),
                                             (uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
                                             (uint32)(t_thrd.xlog_cxt.receivedUpto),
                                             (uint32)(g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead >> 32),
                                             (uint32)(g_instance.xlog_cxt.ssReplicationXLogCtl->insertHead))));
                        parallel_recovery::redo_dump_all_stats();
                    }

                    /*
                     * signal postmaster to update local redo end point to gaussdb state file.
                     */
                    ProcTxnWorkLoad(true);
                    if (!xlogctl->IsRecoveryDone) {
                        SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE);
                        if (SS_PERFORMING_SWITCHOVER) {
                            g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
                        }
                    }

                    SpinLockAcquire(&xlogctl->info_lck);
                    xlogctl->IsRecoveryDone = true;
                    SpinLockRelease(&xlogctl->info_lck);

                    /*
                     * If every source we are about to try is already recorded
                     * in failedSources, log that and clear failedSources so
                     * the sources become eligible again.
                     * Original note: failedSources of 0x011 or
                     * 0x111(XLOG_FROM_STREAM | XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG) can enter.
                     * no xlog: due to read xlog is null > 10 times, set t_thrd.xlog_cxt.failedSource = readsource = 0x011
                     * if enter, it indicates that pg_xlog or archive failed, so switch to stream.
                     */
                    if ((!(sources & ~t_thrd.xlog_cxt.failedSources))) {
                        ereport(LOG, (errmsg("current source=%u, last read with failedSource=%u", sources,
                                             t_thrd.xlog_cxt.failedSources)));
                        t_thrd.xlog_cxt.failedSources = 0;
                    }

                    /*
                     * If primary_conninfo is set, launch walreceiver to
                     * try to stream the missing WAL, before retrying to
                     * restore from archive/pg_xlog.
                     *
                     * If fetching_ckpt is TRUE, RecPtr points to the
                     * initial checkpoint location. In that case, we use
                     * RedoStartLSN as the streaming start position
                     * instead of RecPtr, so that when we later jump
                     * backwards to start redo at RedoStartLSN, we will
                     * have the logs streamed already.
                     *
                     * NOTE(review): this function has no fetching_ckpt
                     * variable and always streams from targetRecPtr; the
                     * paragraph above looks inherited from a sibling reader.
                     */
                    load_server_mode();

                    ProcTxnWorkLoad(false);
                    if (!SS_IN_REFORM) {
                        volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
                        rename_recovery_conf_for_roach();

                        ereport(LOG, (errmsg("request xlog stream from dorado copy at %X/%X.",
                                             (uint32)(targetRecPtr >> 32),
                                             (uint32)targetRecPtr)));
                        /* Restart the walreceiver against the shared storage target. */
                        ShutdownWalRcv();
                        SpinLockAcquire(&walrcv->mutex);
                        walrcv->receivedUpto = 0;
                        SpinLockRelease(&walrcv->mutex);

                        RequestXLogStreaming(&targetRecPtr, 0, REPCONNTARGET_SHARED_STORAGE, 0);
                        continue;
                    }

                    /*
                     * In startup phase enter, failedSource = 0, sources = 011
                     */
                    sources &= ~t_thrd.xlog_cxt.failedSources;
                    t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
                    if (t_thrd.xlog_cxt.readFile >=0) {
                        break;
                    }

                    ereport(LOG, (errmsg("do not find any more files. source=%u failedSource=%u", sources,
                                         t_thrd.xlog_cxt.failedSources)));

                    /*
                     * Nope, not found in archive and/or pg_xlog. set t_thrd.xlog_cxt.failedSources = 0x111,
                     * which indicate that XLOG_FROM_STREAM | XLOG_FROM_ARCHIVE | XLOG_FROM_ARCHIVE.
                     */
                    t_thrd.xlog_cxt.failedSources |= sources;
                }

                /*
                 * This possibly-long loop needs to handle interrupts of
                 * startup process.
                 */
                RedoInterruptCallBack();
            }
        } else {
            /* Not a standby startup pass: open the segment once, without waiting. */
            if (t_thrd.xlog_cxt.readFile < 0) {
                uint32 sources;

                if (randAccess) {
                    t_thrd.xlog_cxt.curFileTLI = 0;
                }

                sources = XLOG_FROM_PG_XLOG;
                if (t_thrd.xlog_cxt.InArchiveRecovery) {
                    sources |= XLOG_FROM_ARCHIVE;
                }

                t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
                if (t_thrd.xlog_cxt.readFile < 0) {
                    return -1;
                }
            }
        }
    }

    /*
     * At this point, we have the right segment open and if we're streaming we
     * know the requested record is in it.
     */
    Assert(t_thrd.xlog_cxt.readFile != -1);

    /* Always read a whole page, regardless of how much was requested. */
    t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
    t_thrd.xlog_cxt.readOff = targetPageOff;

    if (xlogreader->preReadBuf == NULL) {
        actualBytes = (uint32)pread(t_thrd.xlog_cxt.readFile, readBuf, t_thrd.xlog_cxt.readLen, t_thrd.xlog_cxt.readOff);
    } else {
        actualBytes = (uint32)SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, readBuf, t_thrd.xlog_cxt.readLen,
            t_thrd.xlog_cxt.readFile);
    }

    /* Anything other than a full page, including a failed read, is an error. */
    if (actualBytes != t_thrd.xlog_cxt.readLen) {
        ereport(LOG, (errmsg("%s read failed, change xlog file.", SS_XLOGDIR)));
        goto next_record_is_invalid;
    }

    Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
    Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
    Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);

    *readTLI = t_thrd.xlog_cxt.curFileTLI;

    return t_thrd.xlog_cxt.readLen;

next_record_is_invalid:
    /* Remember which source failed and drop all per-file read state. */
    t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;

    if (t_thrd.xlog_cxt.readFile >= 0) {
        close(t_thrd.xlog_cxt.readFile);
    }
    t_thrd.xlog_cxt.readFile = -1;
    t_thrd.xlog_cxt.readLen = 0;
    t_thrd.xlog_cxt.readSource = 0;

    return -1;

triggered:
    /* Failover was requested: drop the read state and flag the trigger. */
    if (t_thrd.xlog_cxt.readFile >= 0) {
        close(t_thrd.xlog_cxt.readFile);
    }
    t_thrd.xlog_cxt.readFile = -1;
    t_thrd.xlog_cxt.readLen = 0;
    t_thrd.xlog_cxt.readSource = 0;
    t_thrd.xlog_cxt.recoveryTriggered = true;

    return -1;
}
static int SSStreamReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
XLogPageReadPrivate *readprivate = (XLogPageReadPrivate*)xlogreader->private_data;
bool fetching_ckpt = readprivate->fetching_ckpt;
int emode = readprivate->emode;
XLogRecPtr RecPtr = targetPagePtr;
uint32 targetPageOff;
bool processtrxn = false;
bool randAccess = readprivate->randAccess;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo replayedSegNo;
uint32 actualBytes;
targetPageOff = targetPagePtr % XLogSegSize;
if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested &&
(t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched)) {
if (get_real_recovery_parallelism() > 1) {
XLByteToSeg(GetXLogReplayRecPtr(NULL), replayedSegNo);
} else {
replayedSegNo = t_thrd.xlog_cxt.readSegNo;
}
if (XLogCheckpointNeeded(replayedSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(replayedSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readSource = 0;
}
XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
XLByteAdvance(RecPtr, reqLen);
retry:
if (t_thrd.xlog_cxt.readFile < 0 ||
(t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM && XLByteLT(t_thrd.xlog_cxt.receivedUpto, RecPtr))) {
if (t_thrd.xlog_cxt.startup_processing) {
for (;;) {
* Need to check here also for the case where consistency level is
* already reached without replaying any record i.e. just after reading
* of checkpoint data it has reached to minRecoveryPoint. Also
* whenever we are going to loop in the data receive from master
* node its bettwe we check if consistency level has reached. So
* instead of keeping in all places before ReadRecord, we can keep
* here in centralised location.
*/
ProcTxnWorkLoad(false);
CheckRecoveryConsistency();
if (WalRcvInProgress()) {
XLogRecPtr expectedRecPtr = RecPtr;
bool havedata = false;
if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) {
ProcTxnWorkLoad(true);
ereport(LOG, (errmsg("read from stream failed, request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
continue;
}
* Walreceiver is active, so see if new data has arrived.
*
* We only advance XLogReceiptTime when we obtain fresh
* WAL from walreceiver and observe that we had already
* processed everything before the most recent "chunk"
* that it flushed to disk. In steady state where we are
* keeping up with the incoming data, XLogReceiptTime will
* be updated on each cycle. When we are behind,
* XLogReceiptTime will not advance, so the grace time
* alloted to conflicting queries will decrease.
*/
if (RecPtr % XLogSegSize == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
} else if (RecPtr % XLOG_BLCKSZ == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
}
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
} else {
XLogRecPtr latestChunkStart;
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
if (!XLByteLT(RecPtr, latestChunkStart)) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
}
} else {
havedata = false;
}
}
if (havedata) {
* Great, streamed far enough. Open the file if it's
* not open already. Use XLOG_FROM_STREAM so that
* source info is set correctly and XLogReceiptTime
* isn't changed.
*/
if (t_thrd.xlog_cxt.readFile < 0) {
t_thrd.xlog_cxt.readFile = XLogFileRead(t_thrd.xlog_cxt.readSegNo, PANIC,
t_thrd.xlog_cxt.recoveryTargetTLI, XLOG_FROM_STREAM,
false);
Assert(t_thrd.xlog_cxt.readFile >= 0);
} else {
t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
}
break;
}
t_thrd.xlog_cxt.RedoDone = IsRedoDonePromoting();
pg_memory_barrier();
if (WalRcvIsDone() && (CheckForSwitchoverTrigger() || CheckForFailoverTrigger())) {
goto retry;
}
if (!processtrxn) {
ProcTxnWorkLoad(true);
processtrxn = true;
goto retry;
}
* Wait for more WAL to arrive, or timeout to be reached
*/
WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
processtrxn = false;
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
} else {
* Until walreceiver manages to reconnect, poll the
* archive.
*/
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
* Try to restore the file from archive, or read an
* existing file from pg_xlog.
*/
uint32 sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
ereport(DEBUG5, (errmsg("failedSources: %u", t_thrd.xlog_cxt.failedSources)));
if (!(sources & ~t_thrd.xlog_cxt.failedSources)) {
* We've exhausted all options for retrieving the
* file. Retry.
*/
t_thrd.xlog_cxt.failedSources = 0;
* Before we sleep, re-scan for possible new timelines
* if we were requested to recover to the latest
* timeline.
*/
if (t_thrd.xlog_cxt.recoveryTargetIsLatest) {
if (rescanLatestTimeLine()) {
continue;
}
}
if (t_thrd.startup_cxt.shutdown_requested) {
ereport(LOG, (errmsg("startup shutdown")));
proc_exit(0);
}
if (!xlogctl->IsRecoveryDone) {
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead IsRecoveryDone is set true,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
parallel_recovery::redo_dump_all_stats();
}
* signal postmaster to update local redo end
* point to gaussdb state file.
*/
ProcTxnWorkLoad(true);
if (!xlogctl->IsRecoveryDone) {
SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE);
if (SS_PERFORMING_SWITCHOVER) {
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
}
}
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
static uint64 printFrequency = 0;
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
if ((printFrequency & 0xFF) == 0) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead set redo finish status,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
}
printFrequency++;
pg_usleep(50000L);
* If primary_conninfo is set, launch walreceiver to
* try to stream the missing WAL, before retrying to
* restore from archive/pg_xlog.
*
* If fetching_ckpt is TRUE, RecPtr points to the
* initial checkpoint location. In that case, we use
* RedoStartLSN as the streaming start position
* instead of RecPtr, so that when we later jump
* backwards to start redo at RedoStartLSN, we will
* have the logs streamed already.
*/
load_server_mode();
if (CheckForFailoverTrigger()) {
goto triggered;
}
ProcTxnWorkLoad(false);
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
CheckMaxPageFlushLSN(targetRecPtr);
rename_recovery_conf_for_roach();
ereport(LOG, (errmsg("request xlog stream at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr,
t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY,
u_sess->attr.attr_storage.PrimarySlotName);
continue;
}
sources &= ~t_thrd.xlog_cxt.failedSources;
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
if (t_thrd.xlog_cxt.readFile >= 0) {
break;
}
ereport(LOG, (errmsg("do not find any more files.sources=%u failedSources=%u", sources,
t_thrd.xlog_cxt.failedSources)));
* Nope, not found in archive and/or pg_xlog.
*/
t_thrd.xlog_cxt.failedSources |= sources;
* Check to see if the trigger file exists. Note that we
* do this only after failure, so when you create the
* trigger file, we still finish replaying as much as we
* can from archive and pg_xlog before failover.
*/
if (CheckForFailoverTrigger() || CheckForSwitchoverTrigger()) {
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogRecPtr EndRecPtrTemp = t_thrd.xlog_cxt.EndRecPtr;
XLByteAdvance(EndRecPtrTemp, SizeOfXLogRecord);
if (XLByteLT(EndRecPtrTemp, receivedUpto) && !FORCE_FINISH_ENABLED &&
t_thrd.xlog_cxt.currentRetryTimes++ < g_retryTimes) {
ereport(WARNING, (errmsg("there are some received xlog have not been redo "
"the tail of last redo lsn:%X/%X, received lsn:%X/%X, retry %d times",
(uint32)(EndRecPtrTemp >> 32), (uint32)EndRecPtrTemp,
(uint32)(receivedUpto >> 32), (uint32)receivedUpto,
t_thrd.xlog_cxt.currentRetryTimes)));
return -1;
}
ereport(LOG,
(errmsg("read record failed when promoting, current lsn (%X/%X), received lsn(%X/%X),"
"sources[%u], failedSources[%u], readSource[%u], readFile[%d], readId[%u],"
"readSeg[%u], readOff[%u], readLen[%u]",
(uint32)(RecPtr >> 32), (uint32)RecPtr,
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto, sources, t_thrd.xlog_cxt.failedSources,
t_thrd.xlog_cxt.readSource, t_thrd.xlog_cxt.readFile,
(uint32)(t_thrd.xlog_cxt.readSegNo >> 32), (uint32)t_thrd.xlog_cxt.readSegNo,
t_thrd.xlog_cxt.readOff, t_thrd.xlog_cxt.readLen)));
goto triggered;
}
}
* This possibly-long loop needs to handle interrupts of
* startup process.
*/
RedoInterruptCallBack();
}
} else {
if (t_thrd.xlog_cxt.readFile < 0) {
uint32 sources;
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
sources = XLOG_FROM_PG_XLOG;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
sources |= XLOG_FROM_ARCHIVE;
}
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
if (t_thrd.xlog_cxt.readFile < 0) {
return -1;
}
}
}
}
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM) {
if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
} else {
t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff;
}
} else {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
}
t_thrd.xlog_cxt.readOff = targetPageOff;
actualBytes = (uint32)pread(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ, t_thrd.xlog_cxt.readOff);
if (actualBytes != XLOG_BLCKSZ) {
ereport(LOG, (errmsg("%s read failed", SS_XLOGDIR)));
goto next_record_is_invalid;
}
Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);
*readTLI = t_thrd.xlog_cxt.curFileTLI;
return t_thrd.xlog_cxt.readLen;
next_record_is_invalid:
t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
return -1;
triggered:
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
t_thrd.xlog_cxt.recoveryTriggered = true;
return -1;
}
/*
 * SSXLogPageRead
 *
 * Forward a page-read request to one of three readers and hand back that
 * reader's return value unchanged:
 *   - SSDoradoReadXLog  when SS_DORADO_CLUSTER is true,
 *   - SSStreamReadXLog  when SS_STREAM_MAIN_STANDBY_NODE is true,
 *   - SSReadXLog        otherwise.
 * All arguments are passed through to the selected reader as-is.
 */
int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
    XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
    if (SS_DORADO_CLUSTER) {
        return SSDoradoReadXLog(xlogreader, targetPagePtr, reqLen, targetRecPtr, readBuf, readTLI, xlog_path);
    }

    if (SS_STREAM_MAIN_STANDBY_NODE) {
        return SSStreamReadXLog(xlogreader, targetPagePtr, reqLen, targetRecPtr, readBuf, readTLI, xlog_path);
    }

    return SSReadXLog(xlogreader, targetPagePtr, reqLen, targetRecPtr, readBuf, readTLI, xlog_path);
}
/*
 * UwalXLogPageRead
 *
 * Read WAL starting at targetPagePtr into readBuf through GsUwalRead.
 *
 * The request length is XLOG_BLCKSZ unless targetRecPtr is non-zero and lies
 * strictly inside the page that starts at targetPagePtr, in which case only
 * the bytes up to targetRecPtr are requested.
 *
 * Returns the number of bytes requested on success, or -1 when either
 *   - the read source is XLOG_FROM_STREAM during standby startup processing
 *     (and this is not a dummy standby) but WalRcvInProgress() is false, or
 *   - GsUwalRead reports a non-zero status.
 */
int UwalXLogPageRead(XLogRecPtr targetPagePtr, char *readBuf, XLogRecPtr targetRecPtr = 0)
{
    uint64_t bytesToRead = XLOG_BLCKSZ;

    if (targetRecPtr > 0) {
        uint64_t distance = targetRecPtr - targetPagePtr;
        if (distance > 0 && distance < XLOG_BLCKSZ) {
            bytesToRead = distance;
        }
    }

    /* Evaluated left to right; WalRcvInProgress() is consulted only when all four conditions hold. */
    bool streamingStandbyStartup = t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM &&
                                   t_thrd.xlog_cxt.StandbyMode &&
                                   t_thrd.xlog_cxt.startup_processing && !dummyStandbyMode;
    if (streamingStandbyStartup && !WalRcvInProgress()) {
        return -1;
    }

    int rc = GsUwalRead(&t_thrd.xlog_cxt.uwalInfo.id, targetPagePtr, readBuf, bytesToRead);
    if (rc != 0) {
        return -1;
    }

    return bytesToRead;
}
/*
 * XLogFlushUwal
 *
 * Call XLogWriteUwal(writeRqstPtr) inside a critical section, then issue a
 * memory barrier and notify the consumers of the flush position:
 * WAL senders, the shared-storage xlog copy backend, and threads waiting on
 * walFlushWaitLock.
 */
static void XLogFlushUwal(XLogRecPtr writeRqstPtr)
{
    START_CRIT_SECTION();
    XLogWriteUwal(writeRqstPtr);
    END_CRIT_SECTION();

    pg_memory_barrier();

    WalSndWakeupProcessRequests();
    SendShareStorageXLogCopyBackendWakeupRequest();

    /*
     * Wake up waiting commit threads to check on the latest LSN flushed.
     * The value published is the shared XLogCtl->LogwrtResult.Flush.
     */
    (void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.flushResult,
                                 t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult.Flush);

    /*
     * Notify commit agents to wake up and check latest flushResult
     */
    WakeupWalSemaphore(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
}
/*
 * UwalAcquire
 *
 * Report whether t_thrd.xlog_cxt.uwalInfo.info.dataSize is positive.
 *
 * If it is not positive on entry, GsUwalQueryByUser is called first, using
 * recoveryTargetTLI during startup processing and ThisTimeLineID otherwise.
 * A non-zero result from that query raises PANIC. If dataSize is positive
 * after the query, GsUwalRenewFileRenamePtr() is called before returning.
 */
static bool UwalAcquire()
{
    if (t_thrd.xlog_cxt.uwalInfo.info.dataSize > 0) {
        return true;
    }

    TimeLineID tli = t_thrd.xlog_cxt.ThisTimeLineID;
    if (t_thrd.xlog_cxt.startup_processing) {
        tli = t_thrd.xlog_cxt.recoveryTargetTLI;
    }

    if (GsUwalQueryByUser(tli, false) != 0) {
        ereport(PANIC, (errcode_for_file_access(), errmsg("uwal query by user failed")));
    }

    if (t_thrd.xlog_cxt.uwalInfo.info.dataSize > 0) {
        GsUwalRenewFileRenamePtr();
    }

    /* Re-read dataSize rather than caching it across the renew call. */
    return t_thrd.xlog_cxt.uwalInfo.info.dataSize > 0;
}
/*
 * SetUwalReadInfo
 *
 * Record the read offset and readable length for the page at targetPagePtr
 * in t_thrd.xlog_cxt.
 *
 * readOff is set to targetPageOff. readLen is XLOG_BLCKSZ unless
 * writeUwalOffSet falls in the same XLOG_BLCKSZ-sized page as targetPagePtr;
 * in that case readLen is (writeUwalOffSet % XLogSegSize) - targetPageOff.
 *
 * NOTE(review): presumably targetPageOff is targetPagePtr's offset within its
 * segment - confirm against the caller.
 */
static void SetUwalReadInfo(XLogRecPtr targetPagePtr, XLogRecPtr writeUwalOffSet, uint32 targetPageOff)
{
    bool writePosInTargetPage = (targetPagePtr / XLOG_BLCKSZ) == (writeUwalOffSet / XLOG_BLCKSZ);

    t_thrd.xlog_cxt.readOff = targetPageOff;

    if (writePosInTargetPage) {
        t_thrd.xlog_cxt.readLen = writeUwalOffSet % XLogSegSize - targetPageOff;
    } else {
        t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
    }
}
/*
 * XlogArchUwal
 *
 * Convert archRqstPtr to a segment number with XLByteToSeg and, when
 * XLogArchivingActive() is true, pass that segment number to
 * XLogArchiveNotifySeg. Does nothing further when archiving is inactive.
 */
void XlogArchUwal(XLogRecPtr archRqstPtr)
{
    XLogSegNo segNo;

    XLByteToSeg(archRqstPtr, segNo);

    if (!XLogArchivingActive()) {
        return;
    }

    XLogArchiveNotifySeg(segNo);
}