diff --git a/VisualC-GDK/SDL/SDL.vcxproj b/VisualC-GDK/SDL/SDL.vcxproj index 28c678bb52..28fdc639ff 100644 --- a/VisualC-GDK/SDL/SDL.vcxproj +++ b/VisualC-GDK/SDL/SDL.vcxproj @@ -551,8 +551,6 @@ - - @@ -824,8 +822,6 @@ - - diff --git a/VisualC-GDK/SDL/SDL.vcxproj.filters b/VisualC-GDK/SDL/SDL.vcxproj.filters index 06543fe5df..069e3bd14a 100644 --- a/VisualC-GDK/SDL/SDL.vcxproj.filters +++ b/VisualC-GDK/SDL/SDL.vcxproj.filters @@ -458,12 +458,6 @@ - - video - - - video - diff --git a/VisualC-WinRT/SDL-UWP.vcxproj b/VisualC-WinRT/SDL-UWP.vcxproj index 8e11e9ba3f..32966b4fed 100644 --- a/VisualC-WinRT/SDL-UWP.vcxproj +++ b/VisualC-WinRT/SDL-UWP.vcxproj @@ -178,8 +178,6 @@ - - @@ -549,8 +547,6 @@ - - diff --git a/VisualC-WinRT/SDL-UWP.vcxproj.filters b/VisualC-WinRT/SDL-UWP.vcxproj.filters index 21d0415627..46d67b1459 100644 --- a/VisualC-WinRT/SDL-UWP.vcxproj.filters +++ b/VisualC-WinRT/SDL-UWP.vcxproj.filters @@ -952,12 +952,6 @@ Header Files - - Source Files - - - Source Files - @@ -978,11 +972,5 @@ Source Files - - Source Files - - - Source Files - \ No newline at end of file diff --git a/VisualC/SDL/SDL.vcxproj b/VisualC/SDL/SDL.vcxproj index f015e5a59f..879c607e8d 100644 --- a/VisualC/SDL/SDL.vcxproj +++ b/VisualC/SDL/SDL.vcxproj @@ -457,8 +457,6 @@ - - @@ -682,8 +680,6 @@ - - diff --git a/VisualC/SDL/SDL.vcxproj.filters b/VisualC/SDL/SDL.vcxproj.filters index 5d10b7e74f..faa6a9d555 100644 --- a/VisualC/SDL/SDL.vcxproj.filters +++ b/VisualC/SDL/SDL.vcxproj.filters @@ -871,12 +871,6 @@ - - video - - - video - @@ -1525,12 +1519,6 @@ stdlib - - video - - - video - diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h index 387db2f2ca..5facdd5159 100644 --- a/src/video/SDL_blit.h +++ b/src/video/SDL_blit.h @@ -526,27 +526,65 @@ extern SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface); ALPHA_BLEND_CHANNEL(sG, dG, A); \ ALPHA_BLEND_CHANNEL(sB, dB, A); \ } while (0) -/* Blend two 32-bit pixels with the same format */ -#define ALPHA_BLEND_RGBA_4(src, dst, ashift) \ - do { \ - Uint32 srcA = (src >> ashift) & 0xFF; \ - src |= ((Uint32)0xFF) << ashift; \ - \ - Uint32 srcRB = src & 0x00FF00FF; \ - Uint32 dstRB = dst & 0x00FF00FF; \ - \ - Uint32 srcGA = (src >> 8) & 0x00FF00FF; \ - Uint32 dstGA = (dst >> 8) & 0x00FF00FF; \ - \ - Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; \ - resRB += 0x00010001; \ - resRB += (resRB >> 8) & 0x00FF00FF; \ - resRB = (resRB >> 8) & 0x00FF00FF; \ - Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; \ - resGA += 0x00010001; \ - resGA += (resGA >> 8) & 0x00FF00FF; \ - resGA &= 0xFF00FF00; \ - dst = resRB | resGA; \ + +/* Blend two 8888 pixels with the same format */ +/* Calculates dst = ((src * factor) + (dst * (255 - factor))) / 255 */ +/* FIXME: SDL_SIZE_MAX might not be an integer literal */ +#if defined(SIZE_MAX) && (SIZE_MAX == 0xffffffffffffffff) +#define FACTOR_BLEND_8888(src, dst, factor) \ + do { \ + Uint64 src64 = src; \ + src64 = (src64 | (src64 << 24)) & 0x00FF00FF00FF00FF; \ + \ + Uint64 dst64 = dst; \ + dst64 = (dst64 | (dst64 << 24)) & 0x00FF00FF00FF00FF; \ + \ + dst64 = ((src64 - dst64) * factor) + (dst64 << 8) - dst64; \ + dst64 += 0x0001000100010001; \ + dst64 += (dst64 >> 8) & 0x00FF00FF00FF00FF; \ + dst64 &= 0xFF00FF00FF00FF00; \ + \ + dst = (Uint32)((dst64 >> 8) | (dst64 >> 32)); \ + } while (0) +#else +#define FACTOR_BLEND_8888(src, dst, factor) \ + do { \ + Uint32 src02 = src & 0x00FF00FF; \ + Uint32 dst02 = dst & 0x00FF00FF; \ + \ + Uint32 src13 = (src >> 8) & 0x00FF00FF; \ + Uint32 dst13 = (dst >> 8) & 0x00FF00FF; \ + \ + Uint32 res02 = ((src02 - dst02) * factor) + (dst02 << 8) - dst02; \ + res02 += 0x00010001; \ + res02 += (res02 >> 8) & 0x00FF00FF; \ + res02 = (res02 >> 8) & 0x00FF00FF; \ + \ + Uint32 res13 = ((src13 - dst13) * factor) + (dst13 << 8) - dst13; \ + res13 += 0x00010001; \ + res13 += (res13 >> 8) & 0x00FF00FF; \ + res13 &= 0xFF00FF00; \ + dst = res02 | res13; \ + } while (0) +#endif + +/* Alpha blend two 8888 pixels with the same formats. */ +#define ALPHA_BLEND_8888(src, dst, fmt) \ + do { \ + Uint32 srcA = (src >> fmt->Ashift) & 0xFF; \ + Uint32 tmp = src | fmt->Amask; \ + FACTOR_BLEND_8888(tmp, dst, srcA); \ + } while (0) + +/* Alpha blend two 8888 pixels with differing formats. */ +#define ALPHA_BLEND_SWIZZLE_8888(src, dst, srcfmt, dstfmt) \ + do { \ + Uint32 srcA = (src >> srcfmt->Ashift) & 0xFF; \ + Uint32 tmp = (((src >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | \ + (((src >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | \ + (((src >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | \ + dstfmt->Amask; \ + FACTOR_BLEND_8888(tmp, dst, srcA); \ } while (0) /* Blend the RGBA values of two pixels */ #define ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA) \ diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 4dd67e341b..51f1076547 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -24,16 +24,6 @@ #include "SDL_blit.h" -#ifdef SDL_SSE4_1_INTRINSICS -#include "SDL_blit_A_sse4_1.h" -#endif -#ifdef SDL_AVX2_INTRINSICS -#include "SDL_blit_A_avx2.h" -#endif -#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS) -#include "SDL3/SDL_cpuinfo.h" -#endif - /* Functions to perform alpha blended blitting */ /* N->1 blending with per-surface alpha */ @@ -327,98 +317,6 @@ static void SDL_TARGETING("mmx") BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) } } -/* fast ARGB888->(A)RGB888 blending with pixel alpha */ -static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) -{ - int width = info->dst_w; - int height = info->dst_h; - Uint32 *srcp = (Uint32 *)info->src; - int srcskip = info->src_skip >> 2; - Uint32 *dstp = (Uint32 *)info->dst; - int dstskip = info->dst_skip >> 2; - SDL_PixelFormat *sf = info->src_fmt; - Uint32 amask = sf->Amask; - Uint32 ashift = sf->Ashift; - Uint64 multmask, multmask2; - - __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2, mm_one_alpha; - - mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ - if (amask == 0xFF000000) { /* 1 in the alpha channel -> mm_one_alpha */ - mm_one_alpha = _mm_set_pi16(1, 0, 0, 0); - } else if (amask == 0x00FF0000) { - mm_one_alpha = _mm_set_pi16(0, 1, 0, 0); - } else if (amask == 0x0000FF00) { - mm_one_alpha = _mm_set_pi16(0, 0, 1, 0); - } else { - mm_one_alpha = _mm_set_pi16(0, 0, 0, 1); - } - - multmask = 0x00FF; - multmask <<= ((Uint64)ashift * 2); - multmask2 = 0x00FF00FF00FF00FFULL; - - while (height--) { - /* *INDENT-OFF* */ /* clang-format off */ - DUFFS_LOOP4({ - Uint32 alpha = *srcp & amask; - if (alpha == 0) { - /* do nothing */ - } else if (alpha == amask) { - *dstp = *srcp; - } else { - src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */ - src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ - - dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */ - dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ - - mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ - mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ - mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ - mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */ - mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */ - mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */ - - /* - Alpha blending is: - dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA)) - dstA = srcA + (dstA * (1-srcA)) * - - Here, 'src1' is: - srcRGB * srcA - srcA - And 'dst1' is: - dstRGB * (1-srcA) - dstA * (1-srcA) - so that *dstp is 'src1 + dst1' - - src1 is computed using mullo_pi16: (X * mask) >> 8, but is approximate for srcA ((srcA * 255) >> 8). - - need to a 1 to get an exact result: (srcA * 256) >> 8 == srcA - */ - mm_alpha = _mm_add_pi16(mm_alpha, mm_one_alpha); - - /* blend */ - src1 = _mm_mullo_pi16(src1, mm_alpha); - src1 = _mm_srli_pi16(src1, 8); - dst1 = _mm_mullo_pi16(dst1, mm_alpha2); - dst1 = _mm_srli_pi16(dst1, 8); - dst1 = _mm_add_pi16(src1, dst1); - dst1 = _mm_packs_pu16(dst1, mm_zero); - - *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ - } - ++srcp; - ++dstp; - }, width); - /* *INDENT-ON* */ /* clang-format on */ - srcp += srcskip; - dstp += dstskip; - } - _mm_empty(); -} - #endif /* SDL_MMX_INTRINSICS */ /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ @@ -1121,6 +1019,247 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) } } +/* Fast 32-bit RGBA->RGBA blending with pixel alpha */ +static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint8 *src = info->src; + int srcskip = info->src_skip; + Uint8 *dst = info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + + while (height--) { + int i = 0; + + for (; i < width; ++i) { + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; + ALPHA_BLEND_8888(src32, dst32, srcfmt); + *(Uint32 *)dst = dst32; + src += 4; + dst += 4; + } + + src += srcskip; + dst += dstskip; + } +} + +/* Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling */ +static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint8 *src = info->src; + int srcskip = info->src_skip; + Uint8 *dst = info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + + while (height--) { + int i = 0; + + for (; i < width; ++i) { + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; + ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); + *(Uint32 *)dst = dst32; + src += 4; + dst += 4; + } + + src += srcskip; + dst += dstskip; + } +} + +#ifdef SDL_SSE4_1_INTRINSICS + +static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint8 *src = info->src; + int srcskip = info->src_skip; + Uint8 *dst = info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + + // The byte offsets for the start of each pixel + const __m128i mask_offsets = _mm_set_epi8( + 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + + const __m128i convert_mask = _mm_add_epi32( + _mm_set1_epi32( + ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | + ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | + ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), + mask_offsets); + + const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); + const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask); + + while (height--) { + int i = 0; + + for (; i + 4 <= width; i += 4) { + // Load 4 src pixels + __m128i src128 = _mm_loadu_si128((__m128i *)src); + + // Load 4 dst pixels + __m128i dst128 = _mm_loadu_si128((__m128i *)dst); + + // Extract the alpha from each pixel and splat it into all the channels + __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask); + + // Convert to dst format + src128 = _mm_shuffle_epi8(src128, convert_mask); + + // Set the alpha channels of src to 255 + src128 = _mm_or_si128(src128, alpha_fill_mask); + + __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128()); + __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128()); + + __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128()); + __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128()); + + __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128()); + __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128()); + + // dst = ((src - dst) * srcA) + ((dst << 8) - dst) + dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo), + _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); + dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), + _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); + + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); + dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); + + // dst += dst >> 8 + dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); + + // Blend the pixels together and save the result + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi)); + + src += 16; + dst += 16; + } + + for (; i < width; ++i) { + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; + ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); + *(Uint32 *)dst = dst32; + src += 4; + dst += 4; + } + + src += srcskip; + dst += dstskip; + } +} + +#endif + +#ifdef SDL_AVX2_INTRINSICS + +static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint8 *src = info->src; + int srcskip = info->src_skip; + Uint8 *dst = info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + + // The byte offsets for the start of each pixel + const __m256i mask_offsets = _mm256_set_epi8( + 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + + const __m256i convert_mask = _mm256_add_epi32( + _mm256_set1_epi32( + ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | + ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | + ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), + mask_offsets); + + const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); + const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask); + + while (height--) { + int i = 0; + + for (; i + 8 <= width; i += 8) { + // Load 8 src pixels + __m256i src256 = _mm256_loadu_si256((__m256i *)src); + + // Load 8 dst pixels + __m256i dst256 = _mm256_loadu_si256((__m256i *)dst); + + // Extract the alpha from each pixel and splat it into all the channels + __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask); + + // Convert to dst format + src256 = _mm256_shuffle_epi8(src256, convert_mask); + + // Set the alpha channels of src to 255 + src256 = _mm256_or_si256(src256, alpha_fill_mask); + + __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256()); + __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256()); + + __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256()); + __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256()); + + __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256()); + __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256()); + + // dst = ((src - dst) * srcA) + ((dst << 8) - dst) + dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo), + _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo)); + dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi), + _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi)); + + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1)); + dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1)); + + // dst += dst >> 8 + dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8); + + // Blend the pixels together and save the result + _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi)); + + src += 32; + dst += 32; + } + + for (; i < width; ++i) { + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; + ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); + *(Uint32 *)dst = dst32; + src += 4; + dst += 4; + } + + src += srcskip; + dst += dstskip; + } +} + +#endif + /* General (slow) N->N blending with pixel alpha */ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) { @@ -1134,7 +1273,6 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) SDL_PixelFormat *dstfmt = info->dst_fmt; int srcbpp; int dstbpp; - SDL_bool freeFormat; Uint32 Pixel; unsigned sR, sG, sB, sA; unsigned dR, dG, dB, dA; @@ -1142,62 +1280,23 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) /* Set up some basic variables */ srcbpp = srcfmt->bytes_per_pixel; dstbpp = dstfmt->bytes_per_pixel; - freeFormat = SDL_FALSE; - -#ifdef SDL_AVX2_INTRINSICS - if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) { - BlitNtoNPixelAlpha_AVX2(info); - return; - } -#endif - -#ifdef SDL_SSE4_1_INTRINSICS - // TODO: Re-enable - if (srcbpp == 4 && dstbpp == 4 && width >= 2 && SDL_HasSSE41()) { - BlitNtoNPixelAlpha_SSE4_1(info); - return; - } -#endif - /* Handle case where bad input sent */ - if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) { - dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888); - freeFormat = SDL_TRUE; - } while (height--) { - /* if (srcbpp == 4 && dstbpp == 4 && dstfmt->Ashift == 24 && dstfmt->Rshift == 16 && dstfmt->Gshift == 8 && - dstfmt->Bshift == 0) { - DUFFS_LOOP4( - { - PIXEL_TO_ARGB_PIXEL(*(Uint32 *) src, srcfmt, Pixel); - Uint32 blended = *(Uint32 *) dst; - ALPHA_BLEND_RGBA_4(Pixel, blended); - *(Uint32*)dst = blended; - src += srcbpp; - dst += dstbpp; - }, - width); - } else { */ - /* *INDENT-OFF* */ /* clang-format off */ - DUFFS_LOOP4( - { - DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); - if (sA) { - DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); - ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); - ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); - } - src += srcbpp; - dst += dstbpp; - }, - width); - /* *INDENT-ON* */ /* clang-format on */ - src += srcskip; - dst += dstskip; - // } - } - if (freeFormat) { - SDL_DestroyPixelFormat(dstfmt); + DUFFS_LOOP4( + { + DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); + if (sA) { + DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); + ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); + ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); + } + src += srcbpp; + dst += dstbpp; + }, + width); + /* *INDENT-ON* */ /* clang-format on */ + src += srcskip; + dst += dstskip; } } @@ -1229,19 +1328,23 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) return BlitNtoNPixelAlpha; case 4: -#if defined(SDL_SSE4_1_INTRINSICS) || defined(SDL_AVX2_INTRINSICS) - if (sf->bytes_per_pixel == 4 && df->bytes_per_pixel == 4 && (SDL_HasSSE41() || SDL_HasAVX2())) { - return BlitNtoNPixelAlpha; - } -#endif - if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) { -#ifdef SDL_MMX_INTRINSICS - if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && sf->Ashift % 8 == 0 && sf->Aloss == 0) { - if (SDL_HasMMX()) { - return BlitRGBtoRGBPixelAlphaMMX; - } + if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask && + SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) { +#ifdef SDL_AVX2_INTRINSICS + if (SDL_HasAVX2()) { + return Blit8888to8888PixelAlphaSwizzleAVX2; + } +#endif +#ifdef SDL_SSE4_1_INTRINSICS + if (SDL_HasSSE41()) { + return Blit8888to8888PixelAlphaSwizzleSSE41; + } +#endif + if (sf->format == df->format) { + return Blit8888to8888PixelAlpha; + } else { + return Blit8888to8888PixelAlphaSwizzle; } -#endif /* SDL_MMX_INTRINSICS */ } return BlitNtoNPixelAlpha; diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c deleted file mode 100644 index 4421aed0e2..0000000000 --- a/src/video/SDL_blit_A_avx2.c +++ /dev/null @@ -1,124 +0,0 @@ -#include "SDL_internal.h" - -#if SDL_HAVE_BLIT_A - -#ifdef SDL_AVX2_INTRINSICS - -#include "SDL_blit.h" - -void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) -{ - int width = info->dst_w; - int height = info->dst_h; - Uint8 *src = info->src; - int srcskip = info->src_skip; - Uint8 *dst = info->dst; - int dstskip = info->dst_skip; - SDL_PixelFormat *srcfmt = info->src_fmt; - SDL_PixelFormat *dstfmt = info->dst_fmt; - - // The byte offsets for the start of each pixel - const __m256i mask_offsets = _mm256_set_epi8( - 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - - const __m256i convert_mask = _mm256_add_epi32( - _mm256_set1_epi32( - ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | - ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | - ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), - mask_offsets); - - const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); - const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask); - - while (height--) { - int i = 0; - - for (; i + 8 <= width; i += 8) { - // Load 8 src pixels - __m256i src256 = _mm256_loadu_si256((__m256i *)src); - - // Load 8 dst pixels - __m256i dst256 = _mm256_loadu_si256((__m256i *)dst); - - // Extract the alpha from each pixel and splat it into all the channels - __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask); - - // Convert to dst format - src256 = _mm256_shuffle_epi8(src256, convert_mask); - - // Set the alpha channels of src to 255 - src256 = _mm256_or_si256(src256, alpha_fill_mask); - - __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256()); - __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256()); - - __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256()); - __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256()); - - __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256()); - __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256()); - - // dst = ((src - dst) * srcA) + ((dst << 8) - dst) - dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo), - _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo)); - dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi), - _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi)); - - // dst += 0x1U (use 0x80 to round instead of floor) - dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1)); - dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1)); - - // dst += dst >> 8 - dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8); - dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8); - - // Blend the pixels together and save the result - _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi)); - - src += 32; - dst += 32; - } - - for (; i < width; ++i) { - Uint32 src32 = *(Uint32 *)src; - Uint32 dst32 = *(Uint32 *)dst; - - Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF; - - src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | - (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | - (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | - dstfmt->Amask; - - Uint32 srcRB = src32 & 0x00FF00FF; - Uint32 dstRB = dst32 & 0x00FF00FF; - - Uint32 srcGA = (src32 >> 8) & 0x00FF00FF; - Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF; - - Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; - resRB += 0x00010001; - resRB += (resRB >> 8) & 0x00FF00FF; - resRB = (resRB >> 8) & 0x00FF00FF; - - Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; - resGA += 0x00010001; - resGA += (resGA >> 8) & 0x00FF00FF; - resGA &= 0xFF00FF00; - dst32 = resRB | resGA; - - *(Uint32 *)dst = dst32; - - src += 4; - dst += 4; - } - - src += srcskip; - dst += dstskip; - } -} - -#endif - -#endif diff --git a/src/video/SDL_blit_A_avx2.h b/src/video/SDL_blit_A_avx2.h deleted file mode 100644 index 61eab95424..0000000000 --- a/src/video/SDL_blit_A_avx2.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef SDL_SDL_BLIT_A_AVX2_H -#define SDL_SDL_BLIT_A_AVX2_H -void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info); -#endif //SDL_SDL_BLIT_A_AVX2_H diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c deleted file mode 100644 index 425f5f0281..0000000000 --- a/src/video/SDL_blit_A_sse4_1.c +++ /dev/null @@ -1,124 +0,0 @@ -#include "SDL_internal.h" - -#if SDL_HAVE_BLIT_A - -#ifdef SDL_SSE4_1_INTRINSICS - -#include "SDL_blit.h" - -void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info) -{ - int width = info->dst_w; - int height = info->dst_h; - Uint8 *src = info->src; - int srcskip = info->src_skip; - Uint8 *dst = info->dst; - int dstskip = info->dst_skip; - SDL_PixelFormat *srcfmt = info->src_fmt; - SDL_PixelFormat *dstfmt = info->dst_fmt; - - // The byte offsets for the start of each pixel - const __m128i mask_offsets = _mm_set_epi8( - 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - - const __m128i convert_mask = _mm_add_epi32( - _mm_set1_epi32( - ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | - ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | - ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), - mask_offsets); - - const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); - const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask); - - while (height--) { - int i = 0; - - for (; i + 4 <= width; i += 4) { - // Load 4 src pixels - __m128i src128 = _mm_loadu_si128((__m128i *)src); - - // Load 4 dst pixels - __m128i dst128 = _mm_loadu_si128((__m128i *)dst); - - // Extract the alpha from each pixel and splat it into all the channels - __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask); - - // Convert to dst format - src128 = _mm_shuffle_epi8(src128, convert_mask); - - // Set the alpha channels of src to 255 - src128 = _mm_or_si128(src128, alpha_fill_mask); - - __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128()); - __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128()); - - __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128()); - __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128()); - - __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128()); - __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128()); - - // dst = ((src - dst) * srcA) + ((dst << 8) - dst) - dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo), - _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); - dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), - _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); - - // dst += 0x1U (use 0x80 to round instead of floor) - dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); - dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); - - // dst += dst >> 8 - dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); - dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); - - // Blend the pixels together and save the result - _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi)); - - src += 16; - dst += 16; - } - - for (; i < width; ++i) { - Uint32 src32 = *(Uint32 *)src; - Uint32 dst32 = *(Uint32 *)dst; - - Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF; - - src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | - (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | - (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | - dstfmt->Amask; - - Uint32 srcRB = src32 & 0x00FF00FF; - Uint32 dstRB = dst32 & 0x00FF00FF; - - Uint32 srcGA = (src32 >> 8) & 0x00FF00FF; - Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF; - - Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; - resRB += 0x00010001; - resRB += (resRB >> 8) & 0x00FF00FF; - resRB = (resRB >> 8) & 0x00FF00FF; - - Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; - resGA += 0x00010001; - resGA += (resGA >> 8) & 0x00FF00FF; - resGA &= 0xFF00FF00; - dst32 = resRB | resGA; - - *(Uint32 *)dst = dst32; - - src += 4; - dst += 4; - } - - src += srcskip; - dst += dstskip; - } -} - -#endif - -#endif diff --git a/src/video/SDL_blit_A_sse4_1.h b/src/video/SDL_blit_A_sse4_1.h deleted file mode 100644 index 56c5907cbe..0000000000 --- a/src/video/SDL_blit_A_sse4_1.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef SDL_SDL_BLIT_A_SSE4_1_H -#define SDL_SDL_BLIT_A_SSE4_1_H -void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info); -#endif //SDL_SDL_BLIT_A_SSE4_1_H