Optimise SawyerChunkReader for MSVC debug builds

- Change std::copy_n and std::fill_n back to std::memcpy and std::memset. They do not have the overhead of checks.
- Change std::malloc to HeapAlloc, as 16 MiB allocations are very slow because the debug heap initialises all of the memory to a fill pattern (0xCD).
This commit is contained in:
Ted John 2018-05-27 16:30:19 +01:00
parent 3d98e1ad1d
commit c28a42d877
2 changed files with 67 additions and 30 deletions

View File

@ -1,4 +1,4 @@
#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
/*****************************************************************************
* OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
*
@ -14,12 +14,17 @@
*****************************************************************************/
#pragma endregion
#include <algorithm>
#include "../core/IStream.hpp"
#include "../core/Math.hpp"
#include "../core/Memory.hpp"
#include "SawyerChunkReader.h"
// malloc is very slow for large allocations in MSVC debug builds as it allocates
// memory on a special debug heap and then fills all of the memory with a debug
// pattern (0xCD for freshly allocated heap memory).
#if defined(_WIN32) && defined(DEBUG)
#define __USE_HEAP_ALLOC__
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
// Allow chunks to be uncompressed to a maximum of 16 MiB
constexpr size_t MAX_UNCOMPRESSED_CHUNK_SIZE = 16 * 1024 * 1024;
@ -74,22 +79,10 @@ std::shared_ptr<SawyerChunk> SawyerChunkReader::ReadChunk()
throw SawyerChunkException(EXCEPTION_MSG_CORRUPT_CHUNK_SIZE);
}
// Allow 16MiB for chunk data
size_t bufferSize = MAX_UNCOMPRESSED_CHUNK_SIZE;
uint8 * buffer = Memory::Allocate<uint8>(bufferSize);
if (buffer == nullptr)
{
throw std::runtime_error("Unable to allocate buffer.");
}
size_t uncompressedLength = DecodeChunk(buffer, bufferSize, compressedData.get(), header);
auto buffer = (uint8 *)AllocateLargeTempBuffer();
size_t uncompressedLength = DecodeChunk(buffer, MAX_UNCOMPRESSED_CHUNK_SIZE, compressedData.get(), header);
Guard::Assert(uncompressedLength != 0, "Encountered zero-sized chunk!");
buffer = Memory::Reallocate(buffer, uncompressedLength);
if (buffer == nullptr)
{
throw std::runtime_error("Unable to reallocate buffer.");
}
buffer = (uint8 *)FinaliseLargeTempBuffer(buffer, uncompressedLength);
return std::make_shared<SawyerChunk>((SAWYER_ENCODING)header.encoding, buffer, uncompressedLength);
}
default:
@ -111,16 +104,16 @@ void SawyerChunkReader::ReadChunk(void * dst, size_t length)
auto chunkLength = chunk->GetLength();
if (chunkLength > length)
{
std::copy_n(chunkData, length, (uint8 *)dst);
std::memcpy(dst, chunkData, length);
}
else
{
std::copy_n(chunkData, chunkLength, (uint8 *)dst);
std::memcpy(dst, chunkData, chunkLength);
auto remainingLength = length - chunkLength;
if (remainingLength > 0)
{
auto offset = (uint8 *)dst + chunkLength;
std::fill_n(offset, remainingLength, 0);
std::memset(offset, 0, remainingLength);
}
}
}
@ -135,7 +128,7 @@ size_t SawyerChunkReader::DecodeChunk(void * dst, size_t dstCapacity, const void
{
throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
}
std::copy_n((const uint8 *)src, header.length, (uint8 *)dst);
std::memcpy(dst, src, header.length);
resultLength = header.length;
break;
case CHUNK_ENCODING_RLE:
@ -155,10 +148,11 @@ size_t SawyerChunkReader::DecodeChunk(void * dst, size_t dstCapacity, const void
size_t SawyerChunkReader::DecodeChunkRLERepeat(void * dst, size_t dstCapacity, const void * src, size_t srcLength)
{
auto immBufferLength = MAX_UNCOMPRESSED_CHUNK_SIZE;
auto immBuffer = std::make_unique<uint8[]>(immBufferLength);
auto immLength = DecodeChunkRLE(immBuffer.get(), immBufferLength, src, srcLength);
return DecodeChunkRepeat(dst, dstCapacity, immBuffer.get(), immLength);
auto immBuffer = AllocateLargeTempBuffer();
auto immLength = DecodeChunkRLE(immBuffer, MAX_UNCOMPRESSED_CHUNK_SIZE, src, srcLength);
auto size = DecodeChunkRepeat(dst, dstCapacity, immBuffer, immLength);
FreeLargeTempBuffer(immBuffer);
return size;
}
size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const void * src, size_t srcLength)
@ -183,7 +177,7 @@ size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const v
throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
}
std::fill_n(dst8, count, src8[i]);
std::memset(dst8, src8[i], count);
dst8 += count;
}
else
@ -197,7 +191,7 @@ size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const v
throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
}
std::copy_n(src8 + i + 1, rleCodeByte + 1, dst8);
std::memcpy(dst8, src8 + i + 1, rleCodeByte + 1);
dst8 += rleCodeByte + 1;
i += rleCodeByte + 1;
}
@ -226,7 +220,7 @@ size_t SawyerChunkReader::DecodeChunkRepeat(void * dst, size_t dstCapacity, cons
throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
}
std::copy_n(copySrc, count, dst8);
std::memcpy(dst8, copySrc, count);
dst8 += count;
}
}
@ -250,3 +244,42 @@ size_t SawyerChunkReader::DecodeChunkRotate(void * dst, size_t dstCapacity, cons
}
return srcLength;
}
/**
 * Allocates a buffer of MAX_UNCOMPRESSED_CHUNK_SIZE bytes.
 *
 * When __USE_HEAP_ALLOC__ is defined the memory comes from the process heap
 * via HeapAlloc, otherwise from std::malloc. The buffer must be handed back
 * through FreeLargeTempBuffer or FinaliseLargeTempBuffer, which pick the
 * matching release call.
 *
 * @return A non-null pointer to the new buffer.
 * @throws std::runtime_error if the allocation fails.
 */
void * SawyerChunkReader::AllocateLargeTempBuffer()
{
#ifdef __USE_HEAP_ALLOC__
    void * const scratch = HeapAlloc(GetProcessHeap(), 0, MAX_UNCOMPRESSED_CHUNK_SIZE);
#else
    void * const scratch = std::malloc(MAX_UNCOMPRESSED_CHUNK_SIZE);
#endif
    if (scratch != nullptr)
    {
        return scratch;
    }
    throw std::runtime_error("Unable to allocate large temporary buffer.");
}
/**
 * Turns a buffer obtained from AllocateLargeTempBuffer into a block sized for
 * len bytes that was allocated with std::malloc / std::realloc.
 *
 * The first len bytes of the input are preserved. The input buffer is always
 * consumed: it is released (or resized in place) on success and released on
 * failure, so the caller must not use or free it afterwards.
 *
 * @param buffer A buffer previously returned by AllocateLargeTempBuffer.
 * @param len    Number of bytes of buffer to keep.
 * @return A non-null pointer to the resized data.
 * @throws std::runtime_error if the final block cannot be allocated.
 */
void * SawyerChunkReader::FinaliseLargeTempBuffer(void * buffer, size_t len)
{
    // A zero-byte request makes a null result from std::malloc / std::realloc
    // ambiguous (it need not signal failure, and realloc may already have
    // released the block), so always request at least one byte.
    const size_t allocLength = (len == 0) ? 1 : len;

#ifdef __USE_HEAP_ALLOC__
    auto finalBuffer = std::malloc(allocLength);
    if (finalBuffer != nullptr)
    {
        // Only copy once the destination is known to be valid.
        std::memcpy(finalBuffer, buffer, len);
    }
    // The temporary heap block is no longer needed whether or not malloc succeeded.
    HeapFree(GetProcessHeap(), 0, buffer);
#else
    auto finalBuffer = (uint8 *)std::realloc(buffer, allocLength);
    if (finalBuffer == nullptr)
    {
        // A failed realloc leaves the original block allocated; release it
        // here so it does not leak when the exception below is thrown.
        std::free(buffer);
    }
#endif
    if (finalBuffer == nullptr)
    {
        throw std::runtime_error("Unable to allocate final buffer.");
    }
    return finalBuffer;
}
/**
 * Releases a buffer obtained from AllocateLargeTempBuffer.
 *
 * Uses std::free by default, or HeapFree on the process heap when
 * __USE_HEAP_ALLOC__ is defined, mirroring the allocation call.
 *
 * @param buffer The buffer to release.
 */
void SawyerChunkReader::FreeLargeTempBuffer(void * buffer)
{
#ifndef __USE_HEAP_ALLOC__
    std::free(buffer);
#else
    HeapFree(GetProcessHeap(), 0, buffer);
#endif
}

View File

@ -75,4 +75,8 @@ private:
static size_t DecodeChunkRLE(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
static size_t DecodeChunkRepeat(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
static size_t DecodeChunkRotate(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
static void * AllocateLargeTempBuffer();
static void * FinaliseLargeTempBuffer(void * buffer, size_t len);
static void FreeLargeTempBuffer(void * buffer);
};