*
* buf_internals.h
* Internal definitions for buffer manager and the buffer replacement
* strategy.
*
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/buf/buf_internals.h
*
* -------------------------------------------------------------------------
*/
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
#include "storage/buf/buf.h"
#include "storage/buf/bufmgr.h"
#include "storage/latch.h"
#include "storage/lock/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"
#include "utils/atomic.h"
#include "access/xlogdefs.h"
* Buffer state is a single 32-bit variable where following data is combined.
*
* - 18 bits refcount
* - 4 bits usage count
* - 10 bits of flags
*
* Combining these values allows to perform some operations without locking
* the buffer header, by modifying them together with a CAS loop.
*
* The definition of buffer state components is below.
*/
#define BUF_REFCOUNT_ONE 1LU
#define BUF_REFCOUNT_ONE_16 (1LU << 16)
#define BUF_REFCOUNT_ONE_32 (1LU << 32)
#define BUF_REFCOUNT_MASK ((1LU << 17) - 1)
#define BUF_USAGECOUNT_MASK 0x003C000000000000LU
#define BUF_USAGECOUNT_ONE (1LU << 18 << 32)
#define BUF_USAGECOUNT_SHIFT (18 + 32)
#define BUF_FLAG_MASK 0xFFC0000000000000LU
#define BUF_STATE_GET_USAGECOUNT(state) (((state)&BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
#define BUF_STATE_GET_REFCOUNT(state) (((state)&0xFFFFFFFF))
* Flags for buffer descriptors
*
* Note: TAG_VALID essentially means that there is a buffer hashtable
* entry associated with the buffer's tag.
*/
#define BM_IN_MIGRATE (1LU << 16 << 32)
#define BM_IS_TMP_BUF (1LU << 21 << 32)
#define BM_IS_META (1LU << 17 << 32)
#define BM_LOCKED (1LU << 22 << 32)
#define BM_DIRTY (1LU << 23 << 32)
#define BM_VALID (1LU << 24 << 32)
#define BM_TAG_VALID (1LU << 25 << 32)
#define BM_IO_IN_PROGRESS (1LU << 26 << 32)
#define BM_IO_ERROR (1LU << 27 << 32)
#define BM_JUST_DIRTIED (1LU << 28 << 32)
#define BM_PIN_COUNT_WAITER (1LU << 29 << 32)
#define BM_CHECKPOINT_NEEDED (1LU << 30 << 32)
#define BM_PERMANENT \
(1LU << 31 << 32)
* unlogged, or init fork) ) */
* The maximum allowed value of usage_count represents a tradeoff between
* accuracy and speed of the clock-sweep buffer management algorithm. A
* large value (comparable to NBuffers) would approximate LRU semantics.
* But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
* clock sweeps to find a free buffer, so in practice we don't want the
* value to be very large.
*/
#define BM_MAX_USAGE_COUNT 15
* Buffer tag identifies which disk block the buffer contains.
*
* Note: the BufferTag data must be sufficient to determine where to write the
* block, without reference to pg_class or pg_tablespace entries. It's
* possible that the backend flushing the buffer doesn't even believe the
* relation is visible yet (its xact may have started before the xact that
* created the rel). The storage manager must be able to cope anyway.
*
* Note: if there's any pad bytes in the struct, INIT_BUFFERTAG will have
* to be fixed to zero them, since this struct is used as a hash key.
*/
typedef struct buftag {
RelFileNode rnode;
ForkNumber forkNum;
BlockNumber blockNum;
} BufferTag;
typedef struct buftagnocompress {
RelFileNodeV2 rnode;
ForkNumber forkNum;
BlockNumber blockNum;
} BufferTagSecondVer;
typedef struct buftagnohbkt {
RelFileNodeOld rnode;
ForkNumber forkNum;
BlockNumber blockNum;
} BufferTagFirstVer;
typedef struct {
BufferTag key;
int id;
} BufferLookupEnt;
#define CLEAR_BUFFERTAG(a) \
((a).rnode.spcNode = InvalidOid, \
(a).rnode.dbNode = InvalidOid, \
(a).rnode.relNode = InvalidOid, \
(a).rnode.bucketNode = -1,\
(a).rnode.opt = DefaultFileNodeOpt, \
(a).forkNum = InvalidForkNumber, \
(a).blockNum = InvalidBlockNumber)
#define INIT_BUFFERTAG(a, xx_rnode, xx_forkNum, xx_blockNum) \
((a).rnode = (xx_rnode), (a).forkNum = (xx_forkNum), (a).blockNum = (xx_blockNum))
#define BUFFERTAGS_EQUAL(a, b) \
(RelFileNodeEquals((a).rnode, (b).rnode) && (a).blockNum == (b).blockNum && (a).forkNum == (b).forkNum)
#define BUFFERTAGS_PTR_EQUAL(a, b) \
(RelFileNodeEquals((a)->rnode, (b)->rnode) && (a)->blockNum == (b)->blockNum && (a)->forkNum == (b)->forkNum)
#define BUFFERTAGS_PTR_SET(a, b) \
((a)->rnode.spcNode = (b)->rnode.spcNode, \
(a)->rnode.dbNode = (b)->rnode.dbNode, \
(a)->rnode.relNode = (b)->rnode.relNode, \
(a)->rnode.bucketNode = (b)->rnode.bucketNode,\
(a)->rnode.opt = (b)->rnode.opt, \
(a)->forkNum = (b)->forkNum, \
(a)->blockNum = (b)->blockNum)
extern char* BufferTagToString(const BufferTag* buftag, char* resBuffer=NULL, int len=-1);
* The shared buffer mapping table is partitioned to reduce contention.
* To determine which partition lock a given tag requires, compute the tag's
* hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
* NB: NUM_BUFFER_PARTITIONS must be a power of 2!
*/
#define BufTableHashPartition(hashcode) ((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
(&t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstBufMappingLock + \
BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
(&t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstBufMappingLock + (i)].lock)
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
* the tag, state or wait_backend_pid fields. In general, buffer header lock
* is a spinlock which is combined with flags, refcount and usagecount into
* single atomic variable. This layout allow us to do some operations in a
* single atomic operation, without actually acquiring and releasing spinlock;
* for instance, increase or decrease refcount. buf_id field never changes
* after initialization, so does not need locking. The LWLock can take care
* of itself. The buffer header lock is *not* used to control access to the
* data in the buffer!
*
* It's assumed that nobody changes the state field while buffer header lock
* is held. Thus buffer header lock holder can do complex updates of the
* state variable in single write, simultaneously with lock release (cleaning
* BM_LOCKED flag). On the other hand, updating of state without holding
* buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
* is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
*
* An exception is that if we have the buffer pinned, its tag can't change
* underneath us, so we can examine the tag without locking the buffer header.
* Also, in places we do one-time reads of the flags without bothering to
* lock the buffer header; this is generally for situations where we don't
* expect the flag bit being tested to be changing.
*
* We can't physically remove items from a disk page if another backend has
* the buffer pinned. Hence, a backend may need to wait for all other pins
* to go away. This is signaled by storing its own PID into
* wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER. At present,
* there can be only one such waiter per buffer.
*
* We use this same struct for local buffer headers, but the lock fields
* are not used and not all of the flag bits are useful either.
*/
typedef struct BufferDescExtra {
uint8 seg_fileno;
BlockNumber seg_blockno;
pg_atomic_uint64 rec_lsn;
volatile uint64 dirty_queue_loc;
bool encrypt;
volatile uint64 lsn_on_disk;
volatile bool aio_in_progress;
} BufferDescExtra;
typedef struct BufferDesc {
BufferTag tag;
pg_atomic_uint64 state;
int buf_id;
ThreadId wait_backend_pid;
LWLock* io_in_progress_lock;
LWLock* content_lock;
BufferDescExtra *extra;
#ifdef USE_ASSERT_CHECKING
volatile uint64 lsn_dirty;
#endif
} BufferDesc;
* Concurrent access to buffer headers has proven to be more efficient if
* they're cache line aligned. So we force the start of the BufferDescriptors
* array to be on a cache line boundary and force the elements to be cache
* line sized.
*
* XXX: As this is primarily matters in highly concurrent workloads which
* probably all are 64bit these days, and the space wastage would be a bit
* more noticeable on 32bit systems, we don't force the stride to be cache
* line sized on those. If somebody does actual performance testing, we can
* reevaluate.
*
* Note that local buffer descriptors aren't forced to be aligned - as there's
* no concurrent access to those it's unlikely to be beneficial.
*
* We use 64bit as the cache line size here, because that's the most common
* size. Making it bigger would be a waste of memory. Even if running on a
* platform with either 32 or 128 byte line sizes, it's good to align to
* boundaries and avoid false sharing.
*/
#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
typedef union BufferDescPadded {
BufferDesc bufferdesc;
char pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;
#define GetBufferDescriptor(id) (&t_thrd.storage_cxt.BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) ((BufferDesc *)&u_sess->storage_cxt.LocalBufferDescriptors[(id)].bufferdesc)
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
#define BufferGetBufferDescriptor(buffer) \
(AssertMacro(BufferIsValid(buffer)), BufferIsLocal(buffer) ? \
(BufferDesc *)&u_sess->storage_cxt.LocalBufferDescriptors[-(buffer)-1].bufferdesc : \
&t_thrd.storage_cxt.BufferDescriptors[(buffer)-1].bufferdesc)
#define BufferDescriptorGetContentLock(bdesc) (((bdesc)->content_lock))
* Functions for acquiring/releasing a shared buffer header's spinlock. Do
* not apply these to local buffers!
*/
extern uint64 LockBufHdr(BufferDesc* desc);
#ifdef ENABLE_THREAD_CHECK
extern "C" {
void AnnotateHappensBefore(const char *f, int l, uintptr_t addr);
}
#define TsAnnotateHappensBefore(addr) AnnotateHappensBefore(__FILE__, __LINE__, (uintptr_t)addr)
#else
#define TsAnnotateHappensBefore(addr)
#endif
#define UnlockBufHdr(desc, s) \
do { \
\
TsAnnotateHappensBefore(&desc->state); \
pg_write_barrier(); \
pg_atomic_write_u32((((volatile uint32 *)&(desc)->state) + 1), ( ( (s) & (~BM_LOCKED) ) >> 32) ); \
} while (0)
#define FIX_SEG_BUFFER_TAG(node, tag, rel_node, block_num) \
do { \
if (IsSegmentFileNode(node)) { \
tag.rnode.relnode = rel_node; \
tag.blocknum = block_num; \
tag.rnode.bucketnode = SegmentBktId; \
} \
} while (0)
#define FIX_BUFFER_DESC(buf, pblk) \
do { \
Assert(PhyBlockIsValid(*pblk)); \
buf->seg_fileno = pblk->rel_node; \
buf->seg_blockno = pblk->block; \
buf->seg_lsn = pblk->lsn; \
} while (0)
extern bool retryLockBufHdr(BufferDesc* desc, uint64* buf_state);
* The PendingWriteback & WritebackContext structure are used to keep
* information about pending flush requests to be issued to the OS.
*/
typedef struct PendingWriteback {
BufferTag tag;
} PendingWriteback;
typedef struct WritebackContext {
int* max_pending;
int nr_pending;
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;
* Structure to sort buffers per file on checkpoints.
*
* This structure is allocated per buffer in shared memory, so it should be
* kept as small as possible.
*/
typedef struct CkptSortItem {
Oid tsId;
Oid relNode;
int2 bucketNode;
ForkNumber forkNum;
BlockNumber blockNum;
int buf_id;
uint8 seg_fileno;
BlockNumber seg_blockno;
} CkptSortItem;
* Internal routines: only called by bufmgr
*/
extern void WritebackContextInit(WritebackContext* context, int* max_pending);
extern void IssuePendingWritebacks(WritebackContext* context);
extern void ScheduleBufferTagForWriteback(WritebackContext* context, BufferTag* tag);
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state);
extern void StrategyFreeBuffer(volatile BufferDesc* buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc* buf);
extern int StrategySyncStart(uint32* complete_passes, uint32* num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern BufferDesc *SSTryGetBuffer(uint64 times, uint64 *buf_state);
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag* tagPtr);
extern int BufTableLookup(BufferTag* tagPtr, uint32 hashcode);
extern int BufTableInsert(BufferTag* tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag* tagPtr, uint32 hashcode);
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum);
extern BufferDesc* LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool* foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(const RelFileNode& rnode, ForkNumber forkNum, BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(const RelFileNode& rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);
extern void update_wait_lockid(LWLock* lock);
extern char* PageDataEncryptForBuffer(Page page, BufferDesc *bufdesc, bool is_segbuf = false);
extern void FlushBuffer(void* buf, SMgrRelation reln, ReadBufferMethod flushmethod = WITH_NORMAL_CACHE, bool skipFsync = false);
extern void LocalBufferFlushAllBuffer();
#endif