mirror of https://github.com/OpenRCT2/OpenRCT2.git
Move SSE4.1 code to its own file, detect SSE4.1 in runtime
This commit is contained in:
parent
c04e720f9f
commit
645b36169d
|
@ -0,0 +1,73 @@
|
|||
#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
|
||||
/*****************************************************************************
|
||||
* OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
|
||||
*
|
||||
* OpenRCT2 is the work of many authors, a full list can be found in contributors.md
|
||||
* For more information, visit https://github.com/OpenRCT2/OpenRCT2
|
||||
*
|
||||
* OpenRCT2 is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* A full copy of the GNU General Public License can be found in licence.txt
|
||||
*****************************************************************************/
|
||||
#pragma endregion
|
||||
|
||||
#include "../common.h"
|
||||
#include "../core/Guard.hpp"
|
||||
#include "drawing.h"
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
void mask_sse4_1(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap)
|
||||
{
|
||||
if (width == 32)
|
||||
{
|
||||
const __m128i zero128 = {};
|
||||
for (sint32 yy = 0; yy < height; yy++)
|
||||
{
|
||||
sint32 colourStep = yy * (colourWrap + 32);
|
||||
sint32 maskStep = yy * (maskWrap + 32);
|
||||
sint32 dstStep = yy * (dstWrap + 32);
|
||||
|
||||
// first half
|
||||
const __m128i colour1 = _mm_lddqu_si128((const __m128i *)(colourSrc + colourStep));
|
||||
const __m128i mask1 = _mm_lddqu_si128((const __m128i *)(maskSrc + maskStep));
|
||||
const __m128i dest1 = _mm_lddqu_si128((const __m128i *)(dst + dstStep));
|
||||
const __m128i mc1 = _mm_and_si128(colour1, mask1);
|
||||
const __m128i saturate1 = _mm_cmpeq_epi8(mc1, zero128);
|
||||
// _mm_blendv_epi8 is SSE4.1
|
||||
const __m128i blended1 = _mm_blendv_epi8(mc1, dest1, saturate1);
|
||||
|
||||
// second half
|
||||
const __m128i colour2 = _mm_lddqu_si128((const __m128i *)(colourSrc + 16 + colourStep));
|
||||
const __m128i mask2 = _mm_lddqu_si128((const __m128i *)(maskSrc + 16 + maskStep));
|
||||
const __m128i dest2 = _mm_lddqu_si128((const __m128i *)(dst + 16 + dstStep));
|
||||
const __m128i mc2 = _mm_and_si128(colour2, mask2);
|
||||
const __m128i saturate2 = _mm_cmpeq_epi8(mc2, zero128);
|
||||
// _mm_blendv_epi8 is SSE4.1
|
||||
const __m128i blended2 = _mm_blendv_epi8(mc2, dest2, saturate2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst + dstStep), blended1);
|
||||
_mm_storeu_si128((__m128i *)(dst + 16 + dstStep), blended2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
mask_scalar(width, height, maskSrc, colourSrc, dst, maskWrap, colourWrap, dstWrap);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void mask_sse4_1(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap)
|
||||
{
|
||||
openrct2_assert(false, "SSE 4.1 function called on a CPU that doesn't support SSE 4.1");
|
||||
}
|
||||
|
||||
#endif // __SSE4_1__
|
|
@ -16,10 +16,6 @@
|
|||
|
||||
#include <memory>
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include "../common.h"
|
||||
#include "../config/Config.h"
|
||||
#include "../Context.h"
|
||||
|
@ -158,8 +154,8 @@ static void read_and_convert_gxdat(IStream * stream, size_t count, bool is_rctc,
|
|||
Memory::Free(g1Elements32);
|
||||
}
|
||||
|
||||
static void mask_scalar(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap)
|
||||
void mask_scalar(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap)
|
||||
{
|
||||
for (sint32 yy = 0; yy < height; yy++)
|
||||
{
|
||||
|
@ -181,41 +177,6 @@ static void mask_scalar(sint32 width, sint32 height, const uint8 * RESTRICT mask
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
static void mask_sse4_1(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap)
|
||||
{
|
||||
const __m128i zero128 = {};
|
||||
for (sint32 yy = 0; yy < height; yy++)
|
||||
{
|
||||
sint32 colourStep = yy * (colourWrap + 32);
|
||||
sint32 maskStep = yy * (maskWrap + 32);
|
||||
sint32 dstStep = yy * (dstWrap + 32);
|
||||
|
||||
// first half
|
||||
const __m128i colour1 = _mm_lddqu_si128((const __m128i *)(colourSrc + colourStep));
|
||||
const __m128i mask1 = _mm_lddqu_si128((const __m128i *)(maskSrc + maskStep));
|
||||
const __m128i dest1 = _mm_lddqu_si128((const __m128i *)(dst + dstStep));
|
||||
const __m128i mc1 = _mm_and_si128(colour1, mask1);
|
||||
const __m128i saturate1 = _mm_cmpeq_epi8(mc1, zero128);
|
||||
// _mm_blendv_epi8 is SSE4.1
|
||||
const __m128i blended1 = _mm_blendv_epi8(mc1, dest1, saturate1);
|
||||
|
||||
// second half
|
||||
const __m128i colour2 = _mm_lddqu_si128((const __m128i *)(colourSrc + 16 + colourStep));
|
||||
const __m128i mask2 = _mm_lddqu_si128((const __m128i *)(maskSrc + 16 + maskStep));
|
||||
const __m128i dest2 = _mm_lddqu_si128((const __m128i *)(dst + 16 + dstStep));
|
||||
const __m128i mc2 = _mm_and_si128(colour2, mask2);
|
||||
const __m128i saturate2 = _mm_cmpeq_epi8(mc2, zero128);
|
||||
// _mm_blendv_epi8 is SSE4.1
|
||||
const __m128i blended2 = _mm_blendv_epi8(mc2, dest2, saturate2);
|
||||
|
||||
_mm_storeu_si128((__m128i *)(dst + dstStep), blended1);
|
||||
_mm_storeu_si128((__m128i *)(dst + 16 + dstStep), blended2);
|
||||
}
|
||||
}
|
||||
#endif // __SSE4_1__
|
||||
|
||||
extern "C"
|
||||
{
|
||||
static void * _g1Buffer = nullptr;
|
||||
|
@ -789,15 +750,7 @@ extern "C"
|
|||
sint32 colourWrap = imgColour->width - width;
|
||||
sint32 dstWrap = ((dpi->width + dpi->pitch) - width);
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
if (width == 32)
|
||||
{
|
||||
mask_sse4_1(width, height, maskSrc, colourSrc, dst, maskWrap, colourWrap, dstWrap);
|
||||
}
|
||||
else
|
||||
#endif // __SSE4_1__
|
||||
// fallback scalar code
|
||||
mask_scalar(width, height, maskSrc, colourSrc, dst, maskWrap, colourWrap, dstWrap);
|
||||
mask_fn(width, height, maskSrc, colourSrc, dst, maskWrap, colourWrap, dstWrap);
|
||||
}
|
||||
|
||||
const rct_g1_element * gfx_get_g1_element(sint32 image_id)
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "../object.h"
|
||||
#include "../OpenRCT2.h"
|
||||
#include "../platform/platform.h"
|
||||
#include "../util/Util.h"
|
||||
#include "../world/water.h"
|
||||
#include "drawing.h"
|
||||
|
||||
|
@ -470,6 +471,23 @@ const translucent_window_palette TranslucentWindowPalettes[COLOUR_COUNT] = {
|
|||
{PALETTE_TRANSLUCENT_LIGHT_PINK, PALETTE_TRANSLUCENT_LIGHT_PINK_HIGHLIGHT, PALETTE_TRANSLUCENT_LIGHT_PINK_SHADOW},
|
||||
};
|
||||
|
||||
void (*mask_fn)(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap) = NULL;
|
||||
|
||||
void mask_init()
|
||||
{
|
||||
if (sse41_available())
|
||||
{
|
||||
log_verbose("registering SSE4.1 mask function");
|
||||
mask_fn = mask_sse4_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose("registering scalar mask function");
|
||||
mask_fn = mask_scalar;
|
||||
}
|
||||
}
|
||||
|
||||
void gfx_draw_pixel(rct_drawpixelinfo *dpi, sint32 x, sint32 y, sint32 colour)
|
||||
{
|
||||
gfx_fill_rect(dpi, x, y, x, y, colour);
|
||||
|
|
|
@ -358,6 +358,15 @@ sint32 scrolling_text_setup(paint_session * session, rct_string_id stringId, uin
|
|||
|
||||
rct_size16 FASTCALL gfx_get_sprite_size(uint32 image_id);
|
||||
|
||||
void mask_sse4_1(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap);
|
||||
void mask_scalar(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap);
|
||||
void mask_init();
|
||||
|
||||
extern void (*mask_fn)(sint32 width, sint32 height, const uint8 * RESTRICT maskSrc, const uint8 * RESTRICT colourSrc,
|
||||
uint8 * RESTRICT dst, sint32 maskWrap, sint32 colourWrap, sint32 dstWrap);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -222,6 +222,7 @@ void core_init()
|
|||
|
||||
platform_ticks_init();
|
||||
bitcount_init();
|
||||
mask_init();
|
||||
|
||||
#if defined(__APPLE__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200)
|
||||
kern_return_t ret = mach_timebase_info(&_mach_base_info);
|
||||
|
|
|
@ -204,6 +204,19 @@ static bool cpuid_x86(uint32 * cpuid_outdata, sint32 eax)
|
|||
}
|
||||
#endif // OPENRCT2_X86
|
||||
|
||||
bool sse41_available()
|
||||
{
|
||||
#ifdef OPENRCT2_X86
|
||||
// SSE4.1 support is declared as the 19th bit of ECX with CPUID(EAX = 1).
|
||||
uint32 regs[4] = { 0 };
|
||||
if (cpuid_x86(regs, 1))
|
||||
{
|
||||
return (regs[2] & (1 << 19));
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool bitcount_popcnt_available()
|
||||
{
|
||||
#ifdef OPENRCT2_X86
|
||||
|
|
|
@ -41,6 +41,8 @@ void path_end_with_separator(utf8 *path, size_t size);
|
|||
bool readentirefile(const utf8 *path, void **outBuffer, size_t *outLength);
|
||||
bool writeentirefile(const utf8 * path, const void * buffer, size_t length);
|
||||
|
||||
bool sse41_available();
|
||||
|
||||
sint32 bitscanforward(sint32 source);
|
||||
void bitcount_init();
|
||||
sint32 bitcount(uint32 source);
|
||||
|
|
Loading…
Reference in New Issue