mirror of
https://github.com/zebrajr/postgres.git
synced 2025-12-07 12:20:31 +01:00
Per flame graph from Jelte Fennema, COPY FROM ... USING BINARY shows input validation taking at least 5% of the profile, so it's worth trying to be more efficient here. With this change, validation of pure ASCII is nearly 40% faster on contemporary Intel hardware. To make this change legible and easier to adapt to additional architectures, use helper functions to abstract the platform details away. Reviewed by Nathan Bossart Discussion: https://www.postgresql.org/message-id/CAFBsxsG%3Dk8t%3DC457FXnoBXb%3D8iA4OaZkbFogFMachWif7mNnww%40mail.gmail.com
252 lines
5.7 KiB
C
252 lines
5.7 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* simd.h
|
|
* Support for platform-specific vector operations.
|
|
*
|
|
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/port/simd.h
|
|
*
|
|
* NOTES
|
|
* - VectorN in this file refers to a register where the element operands
|
|
* are N bits wide. The vector width is platform-specific, so users that care
|
|
* about that will need to inspect "sizeof(VectorN)".
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef SIMD_H
|
|
#define SIMD_H
|
|
|
|
#if defined(__x86_64__) || defined(_M_AMD64)
/*
 * SSE2 belongs to the base specification of the 64-bit x86 ISA, so any
 * compiler targeting this architecture is assumed to accept SSE2 intrinsics.
 *
 * Only emmintrin.h is included, not the all-encompassing immintrin.h, so
 * that nothing beyond SSE2 becomes visible.  MSVC, at least, accepts
 * intrinsics for extensions that were never enabled at compile time.
 */
#include <emmintrin.h>
#define USE_SSE2
typedef __m128i Vector8;

#else
/*
 * With no SIMD instructions at hand, vector operations can in some cases be
 * emulated with bitwise operations on an unsigned integer.
 */
#define USE_NO_SIMD
typedef uint64 Vector8;
#endif
|
|
|
|
|
|
/* load/store operations */
|
|
static inline void vector8_load(Vector8 *v, const uint8 *s);
|
|
|
|
/* assignment operations */
|
|
static inline Vector8 vector8_broadcast(const uint8 c);
|
|
|
|
/* element-wise comparisons to a scalar */
|
|
static inline bool vector8_has(const Vector8 v, const uint8 c);
|
|
static inline bool vector8_has_zero(const Vector8 v);
|
|
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
|
|
static inline bool vector8_is_highbit_set(const Vector8 v);
|
|
|
|
/* arithmetic operations */
|
|
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
|
|
|
|
/* Different semantics for SIMD architectures. */
|
|
#ifndef USE_NO_SIMD
|
|
|
|
/* comparisons between vectors */
|
|
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
|
|
|
|
#endif /* ! USE_NO_SIMD */
|
|
|
|
/*
|
|
* Load a chunk of memory into the given vector.
|
|
*/
|
|
static inline void
|
|
vector8_load(Vector8 *v, const uint8 *s)
|
|
{
|
|
#if defined(USE_SSE2)
|
|
*v = _mm_loadu_si128((const __m128i *) s);
|
|
#else
|
|
memcpy(v, s, sizeof(Vector8));
|
|
#endif
|
|
}
|
|
|
|
|
|
/*
|
|
* Create a vector with all elements set to the same value.
|
|
*/
|
|
static inline Vector8
|
|
vector8_broadcast(const uint8 c)
|
|
{
|
|
#if defined(USE_SSE2)
|
|
return _mm_set1_epi8(c);
|
|
#else
|
|
return ~UINT64CONST(0) / 0xFF * c;
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Return true if any elements in the vector are equal to the given scalar.
|
|
*/
|
|
static inline bool
|
|
vector8_has(const Vector8 v, const uint8 c)
|
|
{
|
|
bool result;
|
|
|
|
/* pre-compute the result for assert checking */
|
|
#ifdef USE_ASSERT_CHECKING
|
|
bool assert_result = false;
|
|
|
|
for (int i = 0; i < sizeof(Vector8); i++)
|
|
{
|
|
if (((const uint8 *) &v)[i] == c)
|
|
{
|
|
assert_result = true;
|
|
break;
|
|
}
|
|
}
|
|
#endif /* USE_ASSERT_CHECKING */
|
|
|
|
#if defined(USE_NO_SIMD)
|
|
/* any bytes in v equal to c will evaluate to zero via XOR */
|
|
result = vector8_has_zero(v ^ vector8_broadcast(c));
|
|
#elif defined(USE_SSE2)
|
|
result = _mm_movemask_epi8(_mm_cmpeq_epi8(v, vector8_broadcast(c)));
|
|
#endif
|
|
|
|
Assert(assert_result == result);
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Convenience function equivalent to vector8_has(v, 0)
|
|
*/
|
|
static inline bool
|
|
vector8_has_zero(const Vector8 v)
|
|
{
|
|
#if defined(USE_NO_SIMD)
|
|
/*
|
|
* We cannot call vector8_has() here, because that would lead to a circular
|
|
* definition.
|
|
*/
|
|
return vector8_has_le(v, 0);
|
|
#elif defined(USE_SSE2)
|
|
return vector8_has(v, 0);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Return true if any elements in the vector are less than or equal to the
|
|
* given scalar.
|
|
*/
|
|
static inline bool
|
|
vector8_has_le(const Vector8 v, const uint8 c)
|
|
{
|
|
bool result = false;
|
|
#if defined(USE_SSE2)
|
|
__m128i sub;
|
|
#endif
|
|
|
|
/* pre-compute the result for assert checking */
|
|
#ifdef USE_ASSERT_CHECKING
|
|
bool assert_result = false;
|
|
|
|
for (int i = 0; i < sizeof(Vector8); i++)
|
|
{
|
|
if (((const uint8 *) &v)[i] <= c)
|
|
{
|
|
assert_result = true;
|
|
break;
|
|
}
|
|
}
|
|
#endif /* USE_ASSERT_CHECKING */
|
|
|
|
#if defined(USE_NO_SIMD)
|
|
|
|
/*
|
|
* To find bytes <= c, we can use bitwise operations to find bytes < c+1,
|
|
* but it only works if c+1 <= 128 and if the highest bit in v is not set.
|
|
* Adapted from
|
|
* https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
|
|
*/
|
|
if ((int64) v >= 0 && c < 0x80)
|
|
result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
|
|
else
|
|
{
|
|
/* one byte at a time */
|
|
for (int i = 0; i < sizeof(Vector8); i++)
|
|
{
|
|
if (((const uint8 *) &v)[i] <= c)
|
|
{
|
|
result = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
#elif defined(USE_SSE2)
|
|
|
|
/*
|
|
* Use saturating subtraction to find bytes <= c, which will present as
|
|
* NUL bytes in 'sub'.
|
|
*/
|
|
sub = _mm_subs_epu8(v, vector8_broadcast(c));
|
|
result = vector8_has_zero(sub);
|
|
#endif
|
|
|
|
Assert(assert_result == result);
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Return true if the high bit of any element is set
|
|
*/
|
|
static inline bool
|
|
vector8_is_highbit_set(const Vector8 v)
|
|
{
|
|
#ifdef USE_SSE2
|
|
return _mm_movemask_epi8(v) != 0;
|
|
#else
|
|
return v & vector8_broadcast(0x80);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Return the bitwise OR of the inputs
|
|
*/
|
|
static inline Vector8
|
|
vector8_or(const Vector8 v1, const Vector8 v2)
|
|
{
|
|
#ifdef USE_SSE2
|
|
return _mm_or_si128(v1, v2);
|
|
#else
|
|
return v1 | v2;
|
|
#endif
|
|
}
|
|
|
|
|
|
/* Different semantics for SIMD architectures. */
|
|
#ifndef USE_NO_SIMD
|
|
|
|
/*
|
|
* Return a vector with all bits set in each lane where the the corresponding
|
|
* lanes in the inputs are equal.
|
|
*/
|
|
static inline Vector8
|
|
vector8_eq(const Vector8 v1, const Vector8 v2)
|
|
{
|
|
#ifdef USE_SSE2
|
|
return _mm_cmpeq_epi8(v1, v2);
|
|
#endif
|
|
}
|
|
|
|
#endif /* ! USE_NO_SIMD */
|
|
|
|
#endif /* SIMD_H */
|