mirror of
https://github.com/postgres/postgres.git
synced 2026-01-14 02:01:53 +00:00
This fixes a poorly written integer comparison function which was performing subtraction in an attempt to return a negative value when a < b and a positive value when a > b, and 0 when the values were equal. Unfortunately that didn't always work correctly due to two's complement having the INT_MIN 1 further from zero than INT_MAX. This could result in an overflow and cause the comparison function to return an incorrect result, which would result in the binary search failing to find the value being searched for. This could cause poor selectivity estimates when the statistics stored the value of INT_MAX (2147483647) and the value being searched for was large enough to result in the binary search doing a comparison with that INT_MAX value. Author: Chao Li <li.evan.chao@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/CAEoWx2ng1Ot5LoKbVU-Dh---dFTUZWJRH8wv2chBu29fnNDMaQ@mail.gmail.com Backpatch-through: 14
341 lines
8.7 KiB
C
341 lines
8.7 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* _int_selfuncs.c
|
|
* Functions for selectivity estimation of intarray operators
|
|
*
|
|
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* contrib/intarray/_int_selfuncs.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "_int.h"
|
|
#include "access/htup_details.h"
|
|
#include "catalog/pg_operator.h"
|
|
#include "catalog/pg_statistic.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "miscadmin.h"
|
|
#include "utils/fmgrprotos.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/selfuncs.h"
|
|
|
|
PG_FUNCTION_INFO_V1(_int_overlap_sel);
|
|
PG_FUNCTION_INFO_V1(_int_contains_sel);
|
|
PG_FUNCTION_INFO_V1(_int_contained_sel);
|
|
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
|
|
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
|
|
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
|
|
PG_FUNCTION_INFO_V1(_int_matchsel);
|
|
|
|
|
|
static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
|
|
int nmcelems, float4 minfreq);
|
|
static int compare_val_int4(const void *a, const void *b);
|
|
|
|
/*
|
|
* Wrappers around the default array selectivity estimation functions.
|
|
*
|
|
* The default array selectivity estimators for the @>, && and <@ operators
|
|
* work fine for integer arrays. However, if we tried to just use arraycontsel
|
|
* and arraycontjoinsel directly as the cost estimator functions for our
|
|
* operators, they would not work as intended, because they look at the
|
|
* operator's OID. Our operators behave exactly like the built-in anyarray
|
|
* versions, but we must tell the cost estimator functions which built-in
|
|
* operators they correspond to. These wrappers just replace the operator
|
|
* OID with the corresponding built-in operator's OID, and call the built-in
|
|
* function.
|
|
*/
|
|
|
|
Datum
|
|
_int_overlap_sel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3)));
|
|
}
|
|
|
|
Datum
|
|
_int_contains_sel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3)));
|
|
}
|
|
|
|
Datum
|
|
_int_contained_sel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3)));
|
|
}
|
|
|
|
Datum
|
|
_int_overlap_joinsel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3),
|
|
PG_GETARG_DATUM(4)));
|
|
}
|
|
|
|
Datum
|
|
_int_contains_joinsel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3),
|
|
PG_GETARG_DATUM(4)));
|
|
}
|
|
|
|
Datum
|
|
_int_contained_joinsel(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
PG_GETARG_DATUM(0),
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
|
|
PG_GETARG_DATUM(2),
|
|
PG_GETARG_DATUM(3),
|
|
PG_GETARG_DATUM(4)));
|
|
}
|
|
|
|
|
|
/*
|
|
* _int_matchsel -- restriction selectivity function for intarray @@ query_int
|
|
*/
|
|
Datum
|
|
_int_matchsel(PG_FUNCTION_ARGS)
|
|
{
|
|
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
|
|
|
|
List *args = (List *) PG_GETARG_POINTER(2);
|
|
int varRelid = PG_GETARG_INT32(3);
|
|
VariableStatData vardata;
|
|
Node *other;
|
|
bool varonleft;
|
|
Selectivity selec;
|
|
QUERYTYPE *query;
|
|
Datum *mcelems = NULL;
|
|
float4 *mcefreqs = NULL;
|
|
int nmcelems = 0;
|
|
float4 minfreq = 0.0;
|
|
float4 nullfrac = 0.0;
|
|
AttStatsSlot sslot;
|
|
|
|
/*
|
|
* If expression is not "variable @@ something" or "something @@ variable"
|
|
* then punt and return a default estimate.
|
|
*/
|
|
if (!get_restriction_variable(root, args, varRelid,
|
|
&vardata, &other, &varonleft))
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
|
|
/*
|
|
* Variable should be int[]. We don't support cases where variable is
|
|
* query_int.
|
|
*/
|
|
if (vardata.vartype != INT4ARRAYOID)
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
|
|
/*
|
|
* Can't do anything useful if the something is not a constant, either.
|
|
*/
|
|
if (!IsA(other, Const))
|
|
{
|
|
ReleaseVariableStats(vardata);
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
}
|
|
|
|
/*
|
|
* The "@@" operator is strict, so we can cope with NULL right away.
|
|
*/
|
|
if (((Const *) other)->constisnull)
|
|
{
|
|
ReleaseVariableStats(vardata);
|
|
PG_RETURN_FLOAT8(0.0);
|
|
}
|
|
|
|
/* The caller made sure the const is a query, so get it now */
|
|
query = DatumGetQueryTypeP(((Const *) other)->constvalue);
|
|
|
|
/* Empty query matches nothing */
|
|
if (query->size == 0)
|
|
{
|
|
ReleaseVariableStats(vardata);
|
|
PG_RETURN_FLOAT8(0.0);
|
|
}
|
|
|
|
/*
|
|
* Get the statistics for the intarray column.
|
|
*
|
|
* We're interested in the Most-Common-Elements list, and the NULL
|
|
* fraction.
|
|
*/
|
|
if (HeapTupleIsValid(vardata.statsTuple))
|
|
{
|
|
Form_pg_statistic stats;
|
|
|
|
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
|
|
nullfrac = stats->stanullfrac;
|
|
|
|
/*
|
|
* For an int4 array, the default array type analyze function will
|
|
* collect a Most Common Elements list, which is an array of int4s.
|
|
*/
|
|
if (get_attstatsslot(&sslot, vardata.statsTuple,
|
|
STATISTIC_KIND_MCELEM, InvalidOid,
|
|
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
|
|
{
|
|
Assert(sslot.valuetype == INT4OID);
|
|
|
|
/*
|
|
* There should be three more Numbers than Values, because the
|
|
* last three (for intarray) cells are taken for minimal, maximal
|
|
* and nulls frequency. Punt if not.
|
|
*/
|
|
if (sslot.nnumbers == sslot.nvalues + 3)
|
|
{
|
|
/* Grab the minimal MCE frequency. */
|
|
minfreq = sslot.numbers[sslot.nvalues];
|
|
|
|
mcelems = sslot.values;
|
|
mcefreqs = sslot.numbers;
|
|
nmcelems = sslot.nvalues;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
memset(&sslot, 0, sizeof(sslot));
|
|
|
|
/* Process the logical expression in the query, using the stats */
|
|
selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
|
|
mcelems, mcefreqs, nmcelems, minfreq);
|
|
|
|
/* MCE stats count only non-null rows, so adjust for null rows. */
|
|
selec *= (1.0 - nullfrac);
|
|
|
|
free_attstatsslot(&sslot);
|
|
ReleaseVariableStats(vardata);
|
|
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
PG_RETURN_FLOAT8((float8) selec);
|
|
}
|
|
|
|
/*
|
|
* Estimate selectivity of single intquery operator
|
|
*/
|
|
static Selectivity
|
|
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
|
|
int nmcelems, float4 minfreq)
|
|
{
|
|
Selectivity selec;
|
|
|
|
/* since this function recurses, it could be driven to stack overflow */
|
|
check_stack_depth();
|
|
|
|
if (item->type == VAL)
|
|
{
|
|
Datum *searchres;
|
|
|
|
if (mcelems == NULL)
|
|
return (Selectivity) DEFAULT_EQ_SEL;
|
|
|
|
searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
|
|
sizeof(Datum), compare_val_int4);
|
|
if (searchres)
|
|
{
|
|
/*
|
|
* The element is in MCELEM. Return precise selectivity (or at
|
|
* least as precise as ANALYZE could find out).
|
|
*/
|
|
selec = mcefreqs[searchres - mcelems];
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* The element is not in MCELEM. Estimate its frequency as half
|
|
* that of the least-frequent MCE. (We know it cannot be more
|
|
* than minfreq, and it could be a great deal less. Half seems
|
|
* like a good compromise.) For probably-historical reasons,
|
|
* clamp to not more than DEFAULT_EQ_SEL.
|
|
*/
|
|
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
|
|
}
|
|
}
|
|
else if (item->type == OPR)
|
|
{
|
|
/* Current query node is an operator */
|
|
Selectivity s1,
|
|
s2;
|
|
|
|
s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
|
|
minfreq);
|
|
switch (item->val)
|
|
{
|
|
case (int32) '!':
|
|
selec = 1.0 - s1;
|
|
break;
|
|
|
|
case (int32) '&':
|
|
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
|
|
nmcelems, minfreq);
|
|
selec = s1 * s2;
|
|
break;
|
|
|
|
case (int32) '|':
|
|
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
|
|
nmcelems, minfreq);
|
|
selec = s1 + s2 - s1 * s2;
|
|
break;
|
|
|
|
default:
|
|
elog(ERROR, "unrecognized operator: %d", item->val);
|
|
selec = 0; /* keep compiler quiet */
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "unrecognized int query item type: %u", item->type);
|
|
selec = 0; /* keep compiler quiet */
|
|
}
|
|
|
|
/* Clamp intermediate results to stay sane despite roundoff error */
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
return selec;
|
|
}
|
|
|
|
/*
|
|
* Comparison function for binary search in mcelem array.
|
|
*/
|
|
static int
|
|
compare_val_int4(const void *a, const void *b)
|
|
{
|
|
int32 key = *(int32 *) a;
|
|
int32 value = DatumGetInt32(*(const Datum *) b);
|
|
|
|
if (key < value)
|
|
return -1;
|
|
else if (key > value)
|
|
return 1;
|
|
else
|
|
return 0;
|
|
}
|