diff --git a/CMakeLists.txt b/CMakeLists.txt
index a40bc609a4..f5fa4e2c4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -739,6 +739,7 @@ if(SDL_ASSEMBLY)
       cmake_pop_check_state()
       if(COMPILER_SUPPORTS_SSE4_1)
         set(HAVE_SSE4_1 TRUE)
+        sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/intrin/SDL_blit_A_sse4_1.c")
       endif()
     endif()
     if(SDL_SSE4_2)
@@ -802,6 +803,7 @@ if(SDL_ASSEMBLY)
           return 0;
         }" COMPILER_SUPPORTS_AVX2)
       cmake_pop_check_state()
+      sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/intrin/SDL_blit_A_*.c")
       if(COMPILER_SUPPORTS_AVX2)
         set(HAVE_AVX2 TRUE)
       endif()
diff --git a/VisualC-GDK/SDL/SDL.vcxproj b/VisualC-GDK/SDL/SDL.vcxproj
index 67e74896cd..6c47cbaf10 100644
--- a/VisualC-GDK/SDL/SDL.vcxproj
+++ b/VisualC-GDK/SDL/SDL.vcxproj
@@ -529,6 +529,8 @@
+
+
@@ -817,6 +819,8 @@
     $(IntDir)$(TargetName)_cpp.pch
     $(IntDir)$(TargetName)_cpp.pch
+
+
@@ -863,4 +867,4 @@
-
+
\ No newline at end of file
diff --git a/VisualC-GDK/SDL/SDL.vcxproj.filters b/VisualC-GDK/SDL/SDL.vcxproj.filters
index 478d401f0c..873aa2e51f 100644
--- a/VisualC-GDK/SDL/SDL.vcxproj.filters
+++ b/VisualC-GDK/SDL/SDL.vcxproj.filters
@@ -458,8 +458,14 @@
+
+      video\intrin
+
+
+      video\intrin
+
-
+
\ No newline at end of file
diff --git a/VisualC/SDL/SDL.vcxproj b/VisualC/SDL/SDL.vcxproj
index 879c607e8d..914895231d 100644
--- a/VisualC/SDL/SDL.vcxproj
+++ b/VisualC/SDL/SDL.vcxproj
@@ -429,6 +429,8 @@
+
+
@@ -669,6 +671,8 @@
+
+
diff --git a/VisualC/SDL/SDL.vcxproj.filters b/VisualC/SDL/SDL.vcxproj.filters
index 93db4f1994..82944ba417 100644
--- a/VisualC/SDL/SDL.vcxproj.filters
+++ b/VisualC/SDL/SDL.vcxproj.filters
@@ -172,6 +172,9 @@
     {f48c2b17-1bee-4fec-a7c8-24cf619abe08}
+
+      {653672cc-90ae-4eba-a256-6479f2c31804}
+
     {00001967ea2801028a046a722a070000}
@@ -868,6 +871,13 @@
+
+      video\intrin
+
+
+      video\intrin
+
+
@@ -1515,6 +1525,13 @@
     stdlib
+
+      video\intrin
+
+
+      video\intrin
+
+
@@ -1549,4 +1566,4 @@
-
+
\ No newline at end of file
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 083e85663b..5e35c9c7bb 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -24,6 +24,16 @@
 #include "SDL_blit.h"
+#ifdef SDL_SSE4_1_INTRINSICS
+#include "intrin/SDL_blit_A_sse4.1.h"
+#endif
+#ifdef SDL_AVX2_INTRINSICS
+#include "intrin/SDL_blit_A_avx2.h"
+#endif
+#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
+#include "SDL3/SDL_cpuinfo.h"
+#endif
+
 /* Functions to perform alpha blended blitting */
 /* N->1 blending with per-surface alpha */
@@ -1296,6 +1306,20 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
     srcbpp = srcfmt->bytes_per_pixel;
     dstbpp = dstfmt->bytes_per_pixel;
+#ifdef SDL_AVX2_INTRINSICS
+    if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
+        BlitNtoNPixelAlpha_AVX2(info);
+        return;
+    }
+#endif
+
+#ifdef SDL_SSE4_1_INTRINSICS
+    if (srcbpp == 4 && dstbpp == 4 && width >= 2 && SDL_HasSSE41()) {
+        BlitNtoNPixelAlpha_SSE4_1(info);
+        return;
+    }
+#endif
+
     while (height--) {
         /* *INDENT-OFF* */ /* clang-format off */
         DUFFS_LOOP4(
@@ -1358,6 +1382,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
             return BlitNtoNPixelAlpha;
         case 4:
+#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS)
+            if (sf->bytes_per_pixel == 4 && df->bytes_per_pixel == 4) {
+                return BlitNtoNPixelAlpha;
+            }
+#endif
             if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
#ifdef SDL_MMX_INTRINSICS
                 if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
@@ -1469,3 +1498,4 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
 }
 #endif /* SDL_HAVE_BLIT_A */
+
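
For reference, the blend that the new intrinsic helpers below vectorize reduces, per 8-bit channel, to dst + (((src - dst) * srcA) >> 8), evaluated modulo 256 exactly as the 16-bit multiply / high-byte shuffle / 8-bit add sequence does. A minimal scalar sketch of that formula (illustrative only, not part of the patch; the name MixRGBA_Scalar is made up, SDL's Uint8/Uint16/Uint32 typedefs are assumed, and both pixels are assumed to already be in ARGB8888 byte order):

/* Scalar equivalent of one MixRGBA_SSE4_1 / MixRGBA_AVX2 pixel, for reference only. */
static Uint32 MixRGBA_Scalar(Uint32 src, Uint32 dst)
{
    Uint8 alpha = (Uint8)(src >> 24); /* the source alpha scales every channel, including A itself */
    Uint32 out = 0;
    int shift;
    for (shift = 0; shift < 32; shift += 8) {
        Uint8 s = (Uint8)(src >> shift);
        Uint8 d = (Uint8)(dst >> shift);
        /* low 16 bits of (s - d) * alpha, then keep the high byte: an approximation of (s - d) * alpha / 255 */
        Uint16 scaled = (Uint16)((s - d) * alpha);
        out |= (Uint32)((Uint8)(d + (scaled >> 8))) << shift;
    }
    return out;
}
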
diff --git a/src/video/intrin/SDL_blit_A_avx2.c b/src/video/intrin/SDL_blit_A_avx2.c
new file mode 100644
index 0000000000..e9ebaa994a
--- /dev/null
+++ b/src/video/intrin/SDL_blit_A_avx2.c
@@ -0,0 +1,126 @@
+#include "SDL_internal.h"
+
+#if SDL_HAVE_BLIT_A
+
+#ifdef SDL_AVX2_INTRINSICS
+
+#include "../SDL_blit.h"
+#include "SDL_blit_A_sse4.1.h"
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("avx2")))
+#endif
+/**
+ * Using the AVX2 instruction set, blend four pixels with per-pixel alpha
+ * @param src A 128-bit vector holding four 32-bit source pixels in ARGB format to blend into dst
+ * @param dst A 128-bit vector holding four 32-bit destination pixels in ARGB format that the source is blended onto
+ * @return A 128-bit wide vector of four alpha-blended pixels in ARGB format
+ */
+__m128i MixRGBA_AVX2(__m128i src, __m128i dst) {
+    __m256i src_color = _mm256_cvtepu8_epi16(src);
+    __m256i dst_color = _mm256_cvtepu8_epi16(dst);
+    const __m256i SHUFFLE_ALPHA = _mm256_set_epi8(
+        -1, 30, -1, 30, -1, 30, -1, 30,
+        -1, 22, -1, 22, -1, 22, -1, 22,
+        -1, 14, -1, 14, -1, 14, -1, 14,
+        -1, 6, -1, 6, -1, 6, -1, 6);
+    __m256i alpha = _mm256_shuffle_epi8(src_color, SHUFFLE_ALPHA);
+    __m256i sub = _mm256_sub_epi16(src_color, dst_color);
+    __m256i mul = _mm256_mullo_epi16(sub, alpha);
+    /**
+     * An 8-bit shuffle can only move bytes within each 128-bit lane of a 256-bit AVX2 register, so the first
+     * shuffle packs the result bytes into the low 64 bits of each lane. The 64-bit permute then gathers those two
+     * quadwords into the low 128 bits, in the correct order for transport back to the surface.
+     */
+    const __m256i SHUFFLE_REDUCE = _mm256_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1,
+        31, 29, 27, 25, 23, 21, 19, 17,
+        -1, -1, -1, -1, -1, -1, -1, -1,
+        15, 13, 11, 9, 7, 5, 3, 1);
+    __m256i reduced = _mm256_shuffle_epi8(mul, SHUFFLE_REDUCE);
+    __m256i packed = _mm256_permute4x64_epi64(reduced, _MM_SHUFFLE(3, 1, 2, 0));
+    __m128i mix = _mm256_castsi256_si128(packed);
+    return _mm_add_epi8(mix, dst);
+}
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("avx2")))
+#endif
+void BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
+{
+    int width = info->dst_w;
+    int height = info->dst_h;
+    Uint8 *src = info->src;
+    int srcskip = info->src_skip;
+    Uint8 *dst = info->dst;
+    int dstskip = info->dst_skip;
+    SDL_PixelFormat *srcfmt = info->src_fmt;
+
+    int chunks = width / 4;
+    Uint8 *buf = SDL_malloc(sizeof(Uint8) * chunks * 16);
+
+    while (height--) {
+        /* Convert 4-wide chunks of source pixels, which may be in the wrong format, into the ARGB buffer */
+        for (int i = 0; i < chunks; i += 1) {
+            __m128i c_src = convertPixelFormatsx4(_mm_loadu_si128((__m128i*) (src + i * 16)), srcfmt);
+            _mm_storeu_si128((__m128i*)(buf + i * 16), c_src);
+        }
+
+        /* Alpha-blend 4-wide chunks from the converted buffer into the destination */
+        for (int i = 0; i < chunks; i += 1) {
+            __m128i c_src = _mm_loadu_si128((__m128i*) (buf + i * 16));
+            __m128i c_dst = _mm_loadu_si128((__m128i*) (dst + i * 16));
+            __m128i c_mix = MixRGBA_AVX2(c_src, c_dst);
+            _mm_storeu_si128((__m128i*) (dst + i * 16), c_mix);
+        }
+
+        /* Handle remaining pixels when width is not a multiple of 4 */
+        if (width % 4 != 0) {
+            int remaining_pixels = width % 4;
+            int offset = width - remaining_pixels;
+            if (remaining_pixels >= 2) {
+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
+                __m128i c_src = _mm_loadu_si64(src_ptr);
+                c_src = convertPixelFormatsx4(c_src, srcfmt);
+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+                _mm_storeu_si64(dst_ptr, c_mix);
+                remaining_pixels -= 2;
+                offset += 2;
+            }
+            if (remaining_pixels == 1) {
+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
+                Uint32 pixel = convertPixelFormat(*src_ptr, srcfmt);
+                /* Old GCC has bad or no _mm_loadu_si32 */
+                #if defined(__GNUC__) && (__GNUC__ < 11)
+                __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
+                __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
+                #else
+                __m128i c_src = _mm_loadu_si32(&pixel);
+                __m128i c_dst = _mm_loadu_si32(dst_ptr);
+                #endif
+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
+                /* Old GCC has bad or no _mm_storeu_si32 */
+                #if defined(__GNUC__) && (__GNUC__ < 11)
+                *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
+                #else
+                _mm_storeu_si32(dst_ptr, mixed_pixel);
+                #endif
+            }
+        }
+
+        src += 4 * width;
+        dst += 4 * width;
+
+        src += srcskip;
+        dst += dstskip;
+    }
+    SDL_free(buf);
+
+}
+
+#endif
+
+#endif
diff --git a/src/video/intrin/SDL_blit_A_avx2.h b/src/video/intrin/SDL_blit_A_avx2.h
new file mode 100644
index 0000000000..c3fc7b1117
--- /dev/null
+++ b/src/video/intrin/SDL_blit_A_avx2.h
@@ -0,0 +1,7 @@
+#ifndef SDL_SDL_BLIT_A_AVX2_H
+#define SDL_SDL_BLIT_A_AVX2_H
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("avx2")))
+#endif
+void BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info);
+#endif //SDL_SDL_BLIT_A_AVX2_H
diff --git a/src/video/intrin/SDL_blit_A_sse4.1.h b/src/video/intrin/SDL_blit_A_sse4.1.h
new file mode 100644
index 0000000000..47be0dd582
--- /dev/null
+++ b/src/video/intrin/SDL_blit_A_sse4.1.h
@@ -0,0 +1,24 @@
+#ifndef SDL_SDL_BLIT_A_SSE4_1_H
+#define SDL_SDL_BLIT_A_SSE4_1_H
+
+#ifdef SDL_SSE4_1_INTRINSICS
+Uint32 convertPixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat);
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+__m128i convertPixelFormatsx4(__m128i colors, const SDL_PixelFormat* srcFormat);
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+__m128i MixRGBA_SSE4_1(__m128i src, __m128i dst);
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+void BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info);
+
+#endif
+
+#endif //SDL_SDL_BLIT_A_SSE4_1_H
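
The format conversion declared above (convertPixelFormat / convertPixelFormatsx4, defined in the next file) rearranges whole bytes into ARGB order by using each channel shift divided by 8 as a byte index. A worked example, assuming a hypothetical 32-bit source layout with Ashift=24, Bshift=16, Gshift=8, Rshift=0 and a matching SDL_PixelFormat pointer named fmt (both made up for illustration):

/* Byte indices for that layout: A -> 24/8 = 3, R -> 0/8 = 0, G -> 8/8 = 1, B -> 16/8 = 2,
   so each 4-byte pixel is reordered as src[3], src[0], src[1], src[2] into A, R, G, B.
   convertPixelFormatsx4 repeats the same four indices for four pixels, offset by 0, 4, 8 and 12. */
Uint32 color = 0x80FF8040;                      /* A=0x80, B=0xFF, G=0x80, R=0x40 in the assumed layout */
Uint32 argb  = convertPixelFormat(color, fmt);  /* 0x804080FF: A=0x80, R=0x40, G=0x80, B=0xFF */
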
diff --git a/src/video/intrin/SDL_blit_A_sse4_1.c b/src/video/intrin/SDL_blit_A_sse4_1.c
new file mode 100644
index 0000000000..2e7d913916
--- /dev/null
+++ b/src/video/intrin/SDL_blit_A_sse4_1.c
@@ -0,0 +1,146 @@
+#include "SDL_internal.h"
+
+#if SDL_HAVE_BLIT_A
+
+#ifdef SDL_SSE4_1_INTRINSICS
+
+#include "../SDL_blit.h"
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+/**
+ * Using the SSE4.1 instruction set, blend two pixels with per-pixel alpha
+ * @param src A 128-bit vector holding two 32-bit source pixels in ARGB format (in its low 64 bits) to blend into dst
+ * @param dst A 128-bit vector holding two 32-bit destination pixels in ARGB format that the source is blended onto
+ * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format
+ */
+__m128i MixRGBA_SSE4_1(__m128i src, __m128i dst) {
+    __m128i src_color = _mm_cvtepu8_epi16(src);
+    __m128i dst_color = _mm_cvtepu8_epi16(dst);
+    /**
+     * This shuffle folds a shuffle and an _mm_cvtepu8_epi16 into one operation: it broadcasts the 8-bit alpha of
+     * each source pixel into the low byte of every 16-bit lane that belongs to that pixel.
+     */
+    const __m128i SHUFFLE_ALPHA = _mm_set_epi8(
+        -1, 7, -1, 7, -1, 7, -1, 7,
+        -1, 3, -1, 3, -1, 3, -1, 3);
+    __m128i alpha = _mm_shuffle_epi8(src, SHUFFLE_ALPHA);
+    __m128i sub = _mm_sub_epi16(src_color, dst_color);
+    __m128i mul = _mm_mullo_epi16(sub, alpha);
+    const __m128i SHUFFLE_REDUCE = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1,
+        15, 13, 11, 9, 7, 5, 3, 1);
+    __m128i reduced = _mm_shuffle_epi8(mul, SHUFFLE_REDUCE);
+
+    return _mm_add_epi8(reduced, dst);
+}
+
+Uint32 convertPixelFormat(Uint32 color, const SDL_PixelFormat* srcFormat) {
+    Uint8 a = (color >> srcFormat->Ashift) & 0xFF;
+    Uint8 r = (color >> srcFormat->Rshift) & 0xFF;
+    Uint8 g = (color >> srcFormat->Gshift) & 0xFF;
+    Uint8 b = (color >> srcFormat->Bshift) & 0xFF;
+
+    return (a << 24) | (r << 16) | (g << 8) | b;
+}
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+/*
+ * This helper function converts arbitrary pixel format data into ARGB form with a 4 pixel-wide shuffle
+ */
+__m128i convertPixelFormatsx4(__m128i colors, const SDL_PixelFormat* srcFormat) {
+    // Create the shuffle mask that maps the source SDL_PixelFormat to ARGB
+    __m128i srcShuffleMask = _mm_set_epi8(
+        srcFormat->Ashift / 8 + 12, srcFormat->Rshift / 8 + 12, srcFormat->Gshift / 8 + 12, srcFormat->Bshift / 8 + 12,
+        srcFormat->Ashift / 8 + 8, srcFormat->Rshift / 8 + 8, srcFormat->Gshift / 8 + 8, srcFormat->Bshift / 8 + 8,
+        srcFormat->Ashift / 8 + 4, srcFormat->Rshift / 8 + 4, srcFormat->Gshift / 8 + 4, srcFormat->Bshift / 8 + 4,
+        srcFormat->Ashift / 8, srcFormat->Rshift / 8, srcFormat->Gshift / 8, srcFormat->Bshift / 8
+    );
+
+    // Shuffle the colors
+    return _mm_shuffle_epi8(colors, srcShuffleMask);
+}
+
+#if !defined(_MSC_VER) || (defined(_MSC_VER) && defined(__clang__))
+__attribute__((target("sse4.1")))
+#endif
+void BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) {
+    int width = info->dst_w;
+    int height = info->dst_h;
+    Uint8 *src = info->src;
+    int srcskip = info->src_skip;
+    Uint8 *dst = info->dst;
+    int dstskip = info->dst_skip;
+    SDL_PixelFormat *srcfmt = info->src_fmt;
+
+    int chunks = width / 4;
+    Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8));
+
+    while (height--) {
+        /* Convert 4-wide chunks of source pixels, which may be in the wrong format, into the ARGB buffer */
+        for (int i = 0; i < chunks; i += 1) {
+            __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16));
+            _mm_storeu_si128((__m128i*)(buffer + i * 16), convertPixelFormatsx4(colors, srcfmt));
+        }
+
+        /* Alpha-blend 2-wide chunks from the buffer into the destination */
+        for (int i = 0; i < chunks * 2; i += 1) {
+            __m128i c_src = _mm_loadu_si64((buffer + (i * 8)));
+            __m128i c_dst = _mm_loadu_si64((dst + i * 8));
+            __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+            _mm_storeu_si64(dst + i * 8, c_mix);
+        }
+
+        /* Handle remaining pixels when width is not a multiple of 4 */
+        if (width % 4 != 0) {
+            int remaining_pixels = width % 4;
+            int offset = width - remaining_pixels;
+            if (remaining_pixels >= 2) {
+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
+                __m128i c_src = _mm_loadu_si64(src_ptr);
+                c_src = convertPixelFormatsx4(c_src, srcfmt);
+                __m128i c_dst = _mm_loadu_si64(dst_ptr);
+                __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst);
+                _mm_storeu_si64(dst_ptr, c_mix);
+                remaining_pixels -= 2;
+                offset += 2;
+            }
+            if (remaining_pixels == 1) {
+                Uint32 *src_ptr = ((Uint32*)(src + (offset * 4)));
+                Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4)));
+                Uint32 pixel = convertPixelFormat(*src_ptr, srcfmt);
+                /* Old GCC has bad or no _mm_loadu_si32 */
+                #if defined(__GNUC__) && (__GNUC__ < 11)
+                __m128i c_src = _mm_set_epi32(0, 0, 0, pixel);
+                __m128i c_dst = _mm_set_epi32(0, 0, 0, *dst_ptr);
+                #else
+                __m128i c_src = _mm_loadu_si32(&pixel);
+                __m128i c_dst = _mm_loadu_si32(dst_ptr);
+                #endif
+                __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst);
+                /* Old GCC has bad or no _mm_storeu_si32 */
+                #if defined(__GNUC__) && (__GNUC__ < 11)
+                *dst_ptr = _mm_extract_epi32(mixed_pixel, 0);
+                #else
+                _mm_storeu_si32(dst_ptr, mixed_pixel);
+                #endif
+            }
+        }
+
+        src += 4 * width;
+        dst += 4 * width;
+
+        src += srcskip;
+        dst += dstskip;
+    }
+
+    SDL_free(buffer);
+}
+
+#endif
+
+#endif
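
Both new blitters share the tail handling shown above: each row is split into 4-pixel chunks, then an optional 2-pixel chunk, then an optional single pixel. A small standalone sketch of that partition (illustrative only; describe_row is not an SDL function):

#include <stdio.h>

/* How BlitNtoNPixelAlpha_SSE4_1 / _AVX2 partition one row of `width` pixels. */
static void describe_row(int width)
{
    int chunks = width / 4;            /* 16-byte (4-pixel) iterations          */
    int remaining = width % 4;
    int offset = width - remaining;    /* first pixel index handled by the tail */
    printf("width=%d: %d four-pixel chunk(s)", width, chunks);
    if (remaining >= 2) {
        printf(", 2-pixel tail at %d", offset);
        offset += 2;
        remaining -= 2;
    }
    if (remaining == 1) {
        printf(", single pixel at %d", offset);
    }
    printf("\n");
}

int main(void)
{
    describe_row(7); /* prints: width=7: 1 four-pixel chunk(s), 2-pixel tail at 4, single pixel at 6 */
    return 0;
}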