postgres/src/backend/nodes/queryjumblefuncs.c
David Rowley ad9a23bc4f Optimize Query jumble
f31aad9b0 adjusted query jumbling so it no longer ignores NULL nodes
during the jumble.  This added some overhead.  Here we tune a few
things to make jumbling faster again.  This makes jumbling perform
similar or even slightly faster than prior to that change.

Author: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/CAApHDvreP04nhTKuYsPw0F-YN+4nr4f=L72SPeFb81jfv+2c7w@mail.gmail.com
2025-03-27 18:34:34 +13:00

724 lines
19 KiB
C

/*-------------------------------------------------------------------------
*
* queryjumblefuncs.c
* Query normalization and fingerprinting.
*
* Normalization is a process whereby similar queries, typically differing only
* in their constants (though the exact rules are somewhat more subtle than
* that) are recognized as equivalent, and are tracked as a single entry. This
* is particularly useful for non-prepared queries.
*
* Normalization is implemented by fingerprinting queries, selectively
* serializing those fields of each query tree's nodes that are judged to be
* essential to the query. This is referred to as a query jumble. This is
* distinct from a regular serialization in that various extraneous
* information is ignored as irrelevant or not essential to the query, such
* as the collations of Vars and, most notably, the values of constants.
*
* This jumble is acquired at the end of parse analysis of each query, and
* a 64-bit hash of it is stored into the query's Query.queryId field.
* The server then copies this value around, making it available in plan
* tree(s) generated from the query. The executor can then use this value
* to blame query costs on the proper queryId.
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/nodes/queryjumblefuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/transam.h"
#include "catalog/pg_proc.h"
#include "common/hashfn.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "nodes/queryjumble.h"
#include "utils/lsyscache.h"
#include "parser/scansup.h"
#define JUMBLE_SIZE 1024 /* query serialization buffer size */
/* GUC parameters */
int compute_query_id = COMPUTE_QUERY_ID_AUTO;
/* Whether to merge constants in a list when computing query_id */
bool query_id_squash_values = false;
/*
* True when compute_query_id is ON or AUTO, and a module requests them.
*
* Note that IsQueryIdEnabled() should be used instead of checking
* query_id_enabled or compute_query_id directly when we want to know
* whether query identifiers are computed in the core or not.
*/
bool query_id_enabled = false;
static JumbleState *InitJumble(void);
static uint64 DoJumble(JumbleState *jstate, Node *node);
static void AppendJumble(JumbleState *jstate,
const unsigned char *item, Size size);
static void FlushPendingNulls(JumbleState *jstate);
static void RecordConstLocation(JumbleState *jstate,
int location, bool merged);
static void _jumbleNode(JumbleState *jstate, Node *node);
static void _jumbleElements(JumbleState *jstate, List *elements);
static void _jumbleA_Const(JumbleState *jstate, Node *node);
static void _jumbleList(JumbleState *jstate, Node *node);
static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node);
static void _jumbleRangeTblEntry_eref(JumbleState *jstate,
RangeTblEntry *rte,
Alias *expr);
/*
* Given a possibly multi-statement source string, confine our attention to the
* relevant part of the string.
*/
const char *
CleanQuerytext(const char *query, int *location, int *len)
{
int query_location = *location;
int query_len = *len;
/* First apply starting offset, unless it's -1 (unknown). */
if (query_location >= 0)
{
Assert(query_location <= strlen(query));
query += query_location;
/* Length of 0 (or -1) means "rest of string" */
if (query_len <= 0)
query_len = strlen(query);
else
Assert(query_len <= strlen(query));
}
else
{
/* If query location is unknown, distrust query_len as well */
query_location = 0;
query_len = strlen(query);
}
/*
* Discard leading and trailing whitespace, too. Use scanner_isspace()
* not libc's isspace(), because we want to match the lexer's behavior.
*
* Note: the parser now strips leading comments and whitespace from the
* reported stmt_location, so this first loop will only iterate in the
* unusual case that the location didn't propagate to here. But the
* statement length will extend to the end-of-string or terminating
* semicolon, so the second loop often does something useful.
*/
while (query_len > 0 && scanner_isspace(query[0]))
query++, query_location++, query_len--;
while (query_len > 0 && scanner_isspace(query[query_len - 1]))
query_len--;
*location = query_location;
*len = query_len;
return query;
}
/*
* JumbleQuery
* Recursively process the given Query producing a 64-bit hash value by
* hashing the relevant fields and record that value in the Query's queryId
* field. Return the JumbleState object used for jumbling the query.
*/
JumbleState *
JumbleQuery(Query *query)
{
JumbleState *jstate;
Assert(IsQueryIdEnabled());
jstate = InitJumble();
query->queryId = DoJumble(jstate, (Node *) query);
/*
* If we are unlucky enough to get a hash of zero, use 1 instead for
* normal statements and 2 for utility queries.
*/
if (query->queryId == UINT64CONST(0))
{
if (query->utilityStmt)
query->queryId = UINT64CONST(2);
else
query->queryId = UINT64CONST(1);
}
return jstate;
}
/*
* Enables query identifier computation.
*
* Third-party plugins can use this function to inform core that they require
* a query identifier to be computed.
*/
void
EnableQueryId(void)
{
if (compute_query_id != COMPUTE_QUERY_ID_OFF)
query_id_enabled = true;
}
/*
* InitJumble
* Allocate a JumbleState object and make it ready to jumble.
*/
static JumbleState *
InitJumble(void)
{
JumbleState *jstate;
jstate = (JumbleState *) palloc(sizeof(JumbleState));
/* Set up workspace for query jumbling */
jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
jstate->jumble_len = 0;
jstate->clocations_buf_size = 32;
jstate->clocations = (LocationLen *) palloc(jstate->clocations_buf_size *
sizeof(LocationLen));
jstate->clocations_count = 0;
jstate->highest_extern_param_id = 0;
jstate->pending_nulls = 0;
#ifdef USE_ASSERT_CHECKING
jstate->total_jumble_len = 0;
#endif
return jstate;
}
/*
* DoJumble
* Jumble the given Node using the given JumbleState and return the resulting
* jumble hash.
*/
static uint64
DoJumble(JumbleState *jstate, Node *node)
{
/* Jumble the given node */
_jumbleNode(jstate, node);
/* Flush any pending NULLs before doing the final hash */
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
/* Process the jumble buffer and produce the hash value */
return DatumGetUInt64(hash_any_extended(jstate->jumble,
jstate->jumble_len,
0));
}
/*
* AppendJumbleInternal: Internal function for appending to the jumble buffer
*
* Note: Callers must ensure that size > 0.
*/
static pg_attribute_always_inline void
AppendJumbleInternal(JumbleState *jstate, const unsigned char *item,
Size size)
{
unsigned char *jumble = jstate->jumble;
Size jumble_len = jstate->jumble_len;
/* Ensure the caller didn't mess up */
Assert(size > 0);
/*
* Fast path for when there's enough space left in the buffer. This is
* worthwhile as means the memcpy can be inlined into very efficient code
* when 'size' is a compile-time constant.
*/
if (likely(size <= JUMBLE_SIZE - jumble_len))
{
memcpy(jumble + jumble_len, item, size);
jstate->jumble_len += size;
#ifdef USE_ASSERT_CHECKING
jstate->total_jumble_len += size;
#endif
return;
}
/*
* Whenever the jumble buffer is full, we hash the current contents and
* reset the buffer to contain just that hash value, thus relying on the
* hash to summarize everything so far.
*/
do
{
Size part_size;
if (unlikely(jumble_len >= JUMBLE_SIZE))
{
uint64 start_hash;
start_hash = DatumGetUInt64(hash_any_extended(jumble,
JUMBLE_SIZE, 0));
memcpy(jumble, &start_hash, sizeof(start_hash));
jumble_len = sizeof(start_hash);
}
part_size = Min(size, JUMBLE_SIZE - jumble_len);
memcpy(jumble + jumble_len, item, part_size);
jumble_len += part_size;
item += part_size;
size -= part_size;
#ifdef USE_ASSERT_CHECKING
jstate->total_jumble_len += part_size;
#endif
} while (size > 0);
jstate->jumble_len = jumble_len;
}
/*
* AppendJumble
* Add 'size' bytes of the given jumble 'value' to the jumble state
*/
static pg_noinline void
AppendJumble(JumbleState *jstate, const unsigned char *value, Size size)
{
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
AppendJumbleInternal(jstate, value, size);
}
/*
* AppendJumbleNull
* For jumbling NULL pointers
*/
static pg_attribute_always_inline void
AppendJumbleNull(JumbleState *jstate)
{
jstate->pending_nulls++;
}
/*
* AppendJumble8
* Add the first byte from the given 'value' pointer to the jumble state
*/
static pg_noinline void
AppendJumble8(JumbleState *jstate, const unsigned char *value)
{
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
AppendJumbleInternal(jstate, value, 1);
}
/*
* AppendJumble16
* Add the first 2 bytes from the given 'value' pointer to the jumble
* state.
*/
static pg_noinline void
AppendJumble16(JumbleState *jstate, const unsigned char *value)
{
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
AppendJumbleInternal(jstate, value, 2);
}
/*
* AppendJumble32
* Add the first 4 bytes from the given 'value' pointer to the jumble
* state.
*/
static pg_noinline void
AppendJumble32(JumbleState *jstate, const unsigned char *value)
{
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
AppendJumbleInternal(jstate, value, 4);
}
/*
* AppendJumble64
* Add the first 8 bytes from the given 'value' pointer to the jumble
* state.
*/
static pg_noinline void
AppendJumble64(JumbleState *jstate, const unsigned char *value)
{
if (jstate->pending_nulls > 0)
FlushPendingNulls(jstate);
AppendJumbleInternal(jstate, value, 8);
}
/*
* FlushPendingNulls
* Incorporate the pending_null value into the jumble buffer.
*
* Note: Callers must ensure that there's at least 1 pending NULL.
*/
static pg_attribute_always_inline void
FlushPendingNulls(JumbleState *jstate)
{
Assert(jstate->pending_nulls > 0);
AppendJumbleInternal(jstate,
(const unsigned char *) &jstate->pending_nulls, 4);
jstate->pending_nulls = 0;
}
/*
* Record location of constant within query string of query tree that is
* currently being walked.
*
* 'squashed' signals that the constant represents the first or the last
* element in a series of merged constants, and everything but the first/last
* element contributes nothing to the jumble hash.
*/
static void
RecordConstLocation(JumbleState *jstate, int location, bool squashed)
{
/* -1 indicates unknown or undefined location */
if (location >= 0)
{
/* enlarge array if needed */
if (jstate->clocations_count >= jstate->clocations_buf_size)
{
jstate->clocations_buf_size *= 2;
jstate->clocations = (LocationLen *)
repalloc(jstate->clocations,
jstate->clocations_buf_size *
sizeof(LocationLen));
}
jstate->clocations[jstate->clocations_count].location = location;
/* initialize lengths to -1 to simplify third-party module usage */
jstate->clocations[jstate->clocations_count].squashed = squashed;
jstate->clocations[jstate->clocations_count].length = -1;
jstate->clocations_count++;
}
}
/*
* Subroutine for _jumbleElements: Verify a few simple cases where we can
* deduce that the expression is a constant:
*
* - Ignore a possible wrapping RelabelType and CoerceViaIO.
* - If it's a FuncExpr, check that the function is an implicit
* cast and its arguments are Const.
* - Otherwise test if the expression is a simple Const.
*/
static bool
IsSquashableConst(Node *element)
{
if (IsA(element, RelabelType))
element = (Node *) ((RelabelType *) element)->arg;
if (IsA(element, CoerceViaIO))
element = (Node *) ((CoerceViaIO *) element)->arg;
if (IsA(element, FuncExpr))
{
FuncExpr *func = (FuncExpr *) element;
ListCell *temp;
if (func->funcformat != COERCE_IMPLICIT_CAST &&
func->funcformat != COERCE_EXPLICIT_CAST)
return false;
if (func->funcid > FirstGenbkiObjectId)
return false;
foreach(temp, func->args)
{
Node *arg = lfirst(temp);
if (!IsA(arg, Const)) /* XXX we could recurse here instead */
return false;
}
return true;
}
if (!IsA(element, Const))
return false;
return true;
}
/*
* Subroutine for _jumbleElements: Verify whether the provided list
* can be squashed, meaning it contains only constant expressions.
*
* Return value indicates if squashing is possible.
*
* Note that this function searches only for explicit Const nodes with
* possibly very simple decorations on top, and does not try to simplify
* expressions.
*/
static bool
IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr)
{
ListCell *temp;
/*
* If squashing is disabled, or the list is too short, we don't try to
* squash it.
*/
if (!query_id_squash_values || list_length(elements) < 2)
return false;
foreach(temp, elements)
{
if (!IsSquashableConst(lfirst(temp)))
return false;
}
*firstExpr = linitial(elements);
*lastExpr = llast(elements);
return true;
}
#define JUMBLE_NODE(item) \
_jumbleNode(jstate, (Node *) expr->item)
#define JUMBLE_ELEMENTS(list) \
_jumbleElements(jstate, (List *) expr->list)
#define JUMBLE_LOCATION(location) \
RecordConstLocation(jstate, expr->location, false)
#define JUMBLE_FIELD(item) \
do { \
if (sizeof(expr->item) == 8) \
AppendJumble64(jstate, (const unsigned char *) &(expr->item)); \
else if (sizeof(expr->item) == 4) \
AppendJumble32(jstate, (const unsigned char *) &(expr->item)); \
else if (sizeof(expr->item) == 2) \
AppendJumble16(jstate, (const unsigned char *) &(expr->item)); \
else if (sizeof(expr->item) == 1) \
AppendJumble8(jstate, (const unsigned char *) &(expr->item)); \
else \
AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item)); \
} while (0)
#define JUMBLE_STRING(str) \
do { \
if (expr->str) \
AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
else \
AppendJumbleNull(jstate); \
} while(0)
/* Function name used for the node field attribute custom_query_jumble. */
#define JUMBLE_CUSTOM(nodetype, item) \
_jumble##nodetype##_##item(jstate, expr, expr->item)
#include "queryjumblefuncs.funcs.c"
/*
* When query_id_squash_values is enabled, we jumble lists of constant
* elements as one individual item regardless of how many elements are
* in the list. This means different queries jumble to the same query_id,
* if the only difference is the number of elements in the list.
*
* If query_id_squash_values is disabled or the list is not "simple
* enough", we jumble each element normally.
*/
static void
_jumbleElements(JumbleState *jstate, List *elements)
{
Node *first,
*last;
if (IsSquashableConstList(elements, &first, &last))
{
/*
* If this list of elements is squashable, keep track of the location
* of its first and last elements. When reading back the locations
* array, we'll see two consecutive locations with ->squashed set to
* true, indicating the location of initial and final elements of this
* list.
*
* For the limited set of cases we support now (implicit coerce via
* FuncExpr, Const) it's fine to use exprLocation of the 'last'
* expression, but if more complex composite expressions are to be
* supported (e.g., OpExpr or FuncExpr as an explicit call), more
* sophisticated tracking will be needed.
*/
RecordConstLocation(jstate, exprLocation(first), true);
RecordConstLocation(jstate, exprLocation(last), true);
}
else
{
_jumbleNode(jstate, (Node *) elements);
}
}
static void
_jumbleNode(JumbleState *jstate, Node *node)
{
Node *expr = node;
#ifdef USE_ASSERT_CHECKING
Size prev_jumble_len = jstate->total_jumble_len;
#endif
if (expr == NULL)
{
AppendJumbleNull(jstate);
return;
}
/* Guard against stack overflow due to overly complex expressions */
check_stack_depth();
/*
* We always emit the node's NodeTag, then any additional fields that are
* considered significant, and then we recurse to any child nodes.
*/
JUMBLE_FIELD(type);
switch (nodeTag(expr))
{
#include "queryjumblefuncs.switch.c"
case T_List:
case T_IntList:
case T_OidList:
case T_XidList:
_jumbleList(jstate, expr);
break;
default:
/* Only a warning, since we can stumble along anyway */
elog(WARNING, "unrecognized node type: %d",
(int) nodeTag(expr));
break;
}
/* Special cases to handle outside the automated code */
switch (nodeTag(expr))
{
case T_Param:
{
Param *p = (Param *) node;
/*
* Update the highest Param id seen, in order to start
* normalization correctly.
*/
if (p->paramkind == PARAM_EXTERN &&
p->paramid > jstate->highest_extern_param_id)
jstate->highest_extern_param_id = p->paramid;
}
break;
default:
break;
}
/* Ensure we added something to the jumble buffer */
Assert(jstate->total_jumble_len > prev_jumble_len);
}
static void
_jumbleList(JumbleState *jstate, Node *node)
{
List *expr = (List *) node;
ListCell *l;
switch (expr->type)
{
case T_List:
foreach(l, expr)
_jumbleNode(jstate, lfirst(l));
break;
case T_IntList:
foreach(l, expr)
AppendJumble32(jstate, (const unsigned char *) &lfirst_int(l));
break;
case T_OidList:
foreach(l, expr)
AppendJumble32(jstate, (const unsigned char *) &lfirst_oid(l));
break;
case T_XidList:
foreach(l, expr)
AppendJumble32(jstate, (const unsigned char *) &lfirst_xid(l));
break;
default:
elog(ERROR, "unrecognized list node type: %d",
(int) expr->type);
return;
}
}
static void
_jumbleA_Const(JumbleState *jstate, Node *node)
{
A_Const *expr = (A_Const *) node;
JUMBLE_FIELD(isnull);
if (!expr->isnull)
{
JUMBLE_FIELD(val.node.type);
switch (nodeTag(&expr->val))
{
case T_Integer:
JUMBLE_FIELD(val.ival.ival);
break;
case T_Float:
JUMBLE_STRING(val.fval.fval);
break;
case T_Boolean:
JUMBLE_FIELD(val.boolval.boolval);
break;
case T_String:
JUMBLE_STRING(val.sval.sval);
break;
case T_BitString:
JUMBLE_STRING(val.bsval.bsval);
break;
default:
elog(ERROR, "unrecognized node type: %d",
(int) nodeTag(&expr->val));
break;
}
}
}
static void
_jumbleVariableSetStmt(JumbleState *jstate, Node *node)
{
VariableSetStmt *expr = (VariableSetStmt *) node;
JUMBLE_FIELD(kind);
JUMBLE_STRING(name);
/*
* Account for the list of arguments in query jumbling only if told by the
* parser.
*/
if (expr->jumble_args)
JUMBLE_NODE(args);
JUMBLE_FIELD(is_local);
JUMBLE_LOCATION(location);
}
/*
* Custom query jumble function for RangeTblEntry.eref.
*/
static void
_jumbleRangeTblEntry_eref(JumbleState *jstate,
RangeTblEntry *rte,
Alias *expr)
{
JUMBLE_FIELD(type);
/*
* This includes only the table name, the list of column names is ignored.
*/
JUMBLE_STRING(aliasname);
}