mirror of
https://github.com/zebrajr/postgres.git
synced 2025-12-06 12:20:15 +01:00
In some edge cases valgrind flags issues with the memory referenced by IOs. All of the cases addressed in this change are false positives. Most of the false positives are caused by UnpinBuffer[NoOwner] marking buffer data as inaccessible. This happens even though the AIO subsystem still holds a pin. That's good, there shouldn't be accesses to the buffer outside of AIO related code until it is pinned by "user" code again. But it requires some explicit work - if the buffer is not pinned by the current backend, we need to explicitly mark the buffer data accessible/inaccessible while executing completion callbacks. That however causes a cascading issue in IO workers: After the completion callbacks for a buffer are executed, the page is marked as inaccessible. If subsequently the same worker is executing IO targeting the same buffer, we would get an error, as the memory is still marked inaccessible. To avoid that, we need to explicitly mark the memory as accessible in IO workers. Another issue is that IO executed in workers or via io_uring will not mark memory as DEFINED. In the case of workers that is because valgrind does not track memory definedness across processes. For io_uring that is because valgrind does not understand io_uring, and therefore its IOs never mark memory as defined, whether the completions are processed in the defining process or in another context. It's not entirely clear how to best solve that. The current user of AIO is not affected, as it explicitly marks buffers as DEFINED & NOACCESS anyway. Defer solving this issue until we have a user with different needs. Per buildfarm animal skink. Reviewed-by: Noah Misch <noah@leadboat.com> Co-authored-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/3pd4322mogfmdd5nln3zphdwhtmq3rzdldqjwb2sfqzcgs22lf@ok2gletdaoe6
431 lines
12 KiB
C
431 lines
12 KiB
C
/*-------------------------------------------------------------------------
 *
 * aio_internal.h
 *	  AIO related declarations that should only be used by the AIO subsystem
 *	  internally.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/aio_internal.h
 *
 *-------------------------------------------------------------------------
 */
|
|
#ifndef AIO_INTERNAL_H
|
|
#define AIO_INTERNAL_H
|
|
|
|
|
|
#include "lib/ilist.h"
|
|
#include "port/pg_iovec.h"
|
|
#include "storage/aio.h"
|
|
#include "storage/condition_variable.h"
|
|
|
|
|
|
/*
 * Upper bound on how many IOs may be staged in a single batch before they
 * have to be submitted.
 */
#define PGAIO_SUBMIT_BATCH_SIZE 32
|
|
|
|
|
|
|
|
/*
 * Handle state machine. Barring the exceptions noted below, a handle moves
 * linearly through these states.
 *
 * All transitions should be made via pgaio_io_update_state().
 *
 * The externally visible functions that start IO (e.g. FileStartReadV(), via
 * pgaio_io_start_readv()) advance a handle from PGAIO_HS_HANDED_OUT to at
 * least PGAIO_HS_STAGED and at most PGAIO_HS_COMPLETED_LOCAL (at which point
 * the handle is reused).
 */
typedef enum PgAioHandleState
{
	/* handle is unused */
	PGAIO_HS_IDLE = 0,

	/*
	 * Handle was returned by pgaio_io_acquire(). Moves on to DEFINED once
	 * pgaio_io_start_*() is called, or back to IDLE via pgaio_io_release().
	 */
	PGAIO_HS_HANDED_OUT,

	/*
	 * pgaio_io_start_*() was called; the handle now carries all information
	 * required to execute the IO, but it has not been staged yet.
	 */
	PGAIO_HS_DEFINED,

	/*
	 * The stage() callbacks have run and the handle is ready for submission.
	 * Unless batchmode is active (see pgaio_enter_batchmode()), submission
	 * happens immediately afterwards.
	 */
	PGAIO_HS_STAGED,

	/* handle has been handed to the IO method for execution */
	PGAIO_HS_SUBMITTED,

	/* the IO itself finished, but its result has not been processed yet */
	PGAIO_HS_COMPLETED_IO,

	/*
	 * IO completed and the shared completion callbacks have been called.
	 *
	 * If completion is observed in the issuing backend, local callbacks run
	 * immediately; otherwise the handle remains in COMPLETED_SHARED until the
	 * issuing backend waits for the IO to complete.
	 */
	PGAIO_HS_COMPLETED_SHARED,

	/*
	 * IO completed and the local completion callbacks have been called.
	 *
	 * After this the handle is made reusable and returns to IDLE.
	 */
	PGAIO_HS_COMPLETED_LOCAL,
} PgAioHandleState;
|
|
|
|
|
|
struct ResourceOwnerData;

/* typedef is in aio_types.h */
struct PgAioHandle
{
	/* all state updates should go through pgaio_io_update_state() */
	PgAioHandleState state:8;

	/* what are we operating on */
	PgAioTargetID target:8;

	/* which IO operation */
	PgAioOp		op:8;

	/* bitfield of PgAioHandleFlags */
	uint8		flags;

	/* number of in-use entries in callbacks[] / callbacks_data[] */
	uint8		num_callbacks;

	/* using the proper type here would use more space */
	uint8		callbacks[PGAIO_HANDLE_MAX_CALLBACKS];

	/* data forwarded to each callback */
	uint8		callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS];

	/*
	 * Length of data associated with handle using
	 * pgaio_io_set_handle_data_*().
	 */
	uint8		handle_data_len;

	/* XXX: could be optimized out with some pointer math */
	int32		owner_procno;

	/* raw result of the IO operation */
	int32		result;

	/*
	 * In which list the handle is registered, depends on the state:
	 * - IDLE, in per-backend list
	 * - HANDED_OUT - not in a list
	 * - DEFINED - not in a list
	 * - STAGED - in per-backend staged array
	 * - SUBMITTED - in issuer's in_flight list
	 * - COMPLETED_IO - in issuer's in_flight list
	 * - COMPLETED_SHARED - in issuer's in_flight list
	 */
	dlist_node	node;

	/* resource owner the handle is associated with, if any */
	struct ResourceOwnerData *resowner;
	dlist_node	resowner_node;

	/* incremented every time the IO handle is reused */
	uint64		generation;

	/*
	 * To wait for the IO to complete other backends can wait on this CV. Note
	 * that, if in SUBMITTED state, a waiter first needs to check if it needs
	 * to do work via IoMethodOps->wait_one().
	 */
	ConditionVariable cv;

	/* result of shared callback, passed to issuer callback */
	PgAioResult distilled_result;

	/*
	 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
	 *
	 * At the moment there's no need to differentiate between the two, but
	 * that won't necessarily stay that way.
	 */
	uint32		iovec_off;

	/*
	 * If not NULL, this memory location will be updated with information
	 * about the IOs completion iff the issuing backend learns about the IOs
	 * completion.
	 */
	PgAioReturn *report_return;

	/* Data necessary for the IO to be performed */
	PgAioOpData op_data;

	/*
	 * Data necessary to identify the object undergoing IO to higher-level
	 * code. Needs to be sufficient to allow another backend to reopen the
	 * file.
	 */
	PgAioTargetData target_data;
};
|
|
|
|
|
|
/* Per-backend AIO state, stored in PgAioCtl->backend_state. */
typedef struct PgAioBackend
{
	/* index into PgAioCtl->io_handles */
	uint32		io_handle_off;

	/* IO Handles that currently are not used */
	dclist_head idle_ios;

	/*
	 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
	 * without having been either defined (by actually associating it with IO)
	 * or released (with pgaio_io_release()). This restriction is necessary to
	 * guarantee that we always can acquire an IO. ->handed_out_io is used to
	 * enforce that rule.
	 */
	PgAioHandle *handed_out_io;

	/* Are we currently in batchmode? See pgaio_enter_batchmode(). */
	bool		in_batchmode;

	/*
	 * IOs that are defined, but not yet submitted.
	 */
	uint16		num_staged_ios;
	PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];

	/*
	 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
	 * in-flight anymore, but have been waited-for and completed by another
	 * backend. Once this backend sees such an IO it'll be reclaimed.
	 *
	 * The list is ordered by submission time, with more recently submitted
	 * IOs being appended at the end.
	 */
	dclist_head in_flight_ios;
} PgAioBackend;
|
|
|
|
|
|
/* Shared-memory control structure for the AIO subsystem. */
typedef struct PgAioCtl
{
	/* number of elements in backend_state */
	int			backend_state_count;
	PgAioBackend *backend_state;

	/*
	 * Array of iovec structs. Each iovec is owned by a specific backend. The
	 * allocation is in PgAioCtl to allow the maximum number of iovecs for
	 * individual IOs to be configurable with PGC_POSTMASTER GUC.
	 */
	uint32		iovec_count;
	struct iovec *iovecs;

	/*
	 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
	 * need to get Buffer IDs during completion to be able to change the
	 * BufferDesc state accordingly. This space can be used to store e.g.
	 * Buffer IDs. Note that the actual iovec might be shorter than this,
	 * because we combine neighboring pages into one larger iovec entry.
	 */
	uint64	   *handle_data;

	/* number of elements in io_handles */
	uint32		io_handle_count;
	PgAioHandle *io_handles;
} PgAioCtl;
|
|
|
|
|
|
|
|
/*
 * Callbacks used to implement an IO method.
 */
typedef struct IoMethodOps
{
	/* properties */

	/*
	 * If an FD is about to be closed, do we need to wait for all in-flight
	 * IOs referencing that FD?
	 */
	bool		wait_on_fd_before_close;


	/* global initialization */

	/*
	 * Amount of additional shared memory to reserve for the io_method. Called
	 * just like a normal ipci.c style *Size() function. Optional.
	 */
	size_t		(*shmem_size) (void);

	/*
	 * Initialize shared memory. First time is true if AIO's shared memory was
	 * just initialized, false otherwise. Optional.
	 */
	void		(*shmem_init) (bool first_time);

	/*
	 * Per-backend initialization. Optional.
	 */
	void		(*init_backend) (void);


	/* handling of IOs */

	/* optional */
	bool		(*needs_synchronous_execution) (PgAioHandle *ioh);

	/*
	 * Start executing passed in IOs.
	 *
	 * Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
	 * returns, other backends might have advanced the state further.)
	 *
	 * Will not be called if ->needs_synchronous_execution() returned true.
	 *
	 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
	 *
	 * Always called in a critical section.
	 */
	int			(*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);

	/* ---
	 * Wait for the IO to complete. Optional.
	 *
	 * On return, state shall be one of
	 * - PGAIO_HS_COMPLETED_IO
	 * - PGAIO_HS_COMPLETED_SHARED
	 * - PGAIO_HS_COMPLETED_LOCAL
	 *
	 * The callback must not block if the handle is already in one of those
	 * states, or has been reused (see pgaio_io_was_recycled()). If, on
	 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
	 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
	 * method.
	 *
	 * If not provided, it needs to be guaranteed that the IO method calls
	 * pgaio_io_process_completion() without further interaction by the
	 * issuing backend.
	 * ---
	 */
	void		(*wait_one) (PgAioHandle *ioh,
							 uint64 ref_generation);
} IoMethodOps;
|
|
|
|
|
|
/* aio.c */
|
|
extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
|
|
extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
|
|
extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
|
|
extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
|
|
extern bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
|
|
const char *pgaio_result_status_string(PgAioResultStatus rs);
|
|
extern void pgaio_shutdown(int code, Datum arg);
|
|
|
|
/* aio_callback.c */
|
|
extern void pgaio_io_call_stage(PgAioHandle *ioh);
|
|
extern void pgaio_io_call_complete_shared(PgAioHandle *ioh);
|
|
extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);
|
|
|
|
/* aio_io.c */
|
|
extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
|
|
extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
|
|
extern int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);
|
|
|
|
/* aio_target.c */
|
|
extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
|
|
extern void pgaio_io_reopen(PgAioHandle *ioh);
|
|
extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);
|
|
|
|
|
|
/*
 * The AIO subsystem supports fairly verbose debug logging, toggled at build
 * time. A build-time switch exists because:
 * a) the verbosity can make debugging things on higher levels hard
 * b) even when elevel checks allow the logging to be skipped, it still causes
 *    a measurable slowdown
 *
 * XXX: This likely should eventually be disabled by default, at least in
 * non-assert builds.
 */
#define PGAIO_VERBOSE	1
|
|
|
|
/*
 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is defined.
 *
 * This intentionally still compiles the code, guarded by a constant if (0),
 * if verbose logging is disabled, to make it less likely that debug logging
 * is silently broken.
 *
 * The current definition requires passing at least one argument after msg
 * (it is forwarded to errmsg_internal() via __VA_ARGS__).
 */
#define pgaio_debug(elevel, msg, ...) \
	do { \
		if (PGAIO_VERBOSE) \
			ereport(elevel, \
					errhidestmt(true), errhidecontext(true), \
					errmsg_internal(msg, \
									__VA_ARGS__)); \
	} while(0)
|
|
|
|
/*
 * Simple ereport() wrapper that prefixes msg with the IO handle's id, op,
 * target and state. Note that the definition requires passing at least one
 * argument after msg.
 */
#define pgaio_debug_io(elevel, ioh, msg, ...) \
	pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
				pgaio_io_get_id(ioh), \
				pgaio_io_get_op_name(ioh), \
				pgaio_io_get_target_name(ioh), \
				__VA_ARGS__)
|
|
|
|
|
|
#ifdef USE_INJECTION_POINTS

/* run the named injection point, making ioh available to it (see aio.c) */
extern void pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point);

/* just for use in tests, from within injection points */
extern PgAioHandle *pgaio_inj_io_get(void);

#else

/* without injection point support, the call becomes a no-op */
#define pgaio_io_call_inj(ioh, injection_point) (void) 0

/*
 * no fallback for pgaio_inj_io_get, all code using injection points better be
 * guarded by USE_INJECTION_POINTS.
 */

#endif
|
|
|
|
|
|
/* Declarations for the tables of function pointers exposed by each IO method. */
extern PGDLLIMPORT const IoMethodOps pgaio_sync_ops;
extern PGDLLIMPORT const IoMethodOps pgaio_worker_ops;
#ifdef IOMETHOD_IO_URING_ENABLED
extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
#endif

/* ops table in effect; presumably selected by the io_method GUC — see aio.c */
extern PGDLLIMPORT const IoMethodOps *pgaio_method_ops;

/* the shared AIO control structure (see PgAioCtl above) */
extern PGDLLIMPORT PgAioCtl *pgaio_ctl;

/* this backend's entry in PgAioCtl->backend_state */
extern PGDLLIMPORT PgAioBackend *pgaio_my_backend;
|
|
|
|
|
|
|
|
#endif /* AIO_INTERNAL_H */
|