leopard/tests/benchmark.cpp

790 lines
23 KiB
C++

/*
Copyright (c) 2017 Christopher A. Taylor. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Leopard-RS nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "../LeopardCommon.h"
#include "../LeopardFF8.h"
#include "../LeopardFF16.h"
#include "../leopard.h"
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
using namespace std;
//#define TEST_DATA_ALL_SAME
//#define TEST_LOSE_FIRST_K_PACKETS
//------------------------------------------------------------------------------
// Windows
// Include <windows.h> on Windows builds with a reduced set of declarations.
#ifdef _WIN32
// Ask <windows.h> to leave out its rarely used APIs.
#define WIN32_LEAN_AND_MEAN
// Temporarily define _WINSOCKAPI_ unless someone else already did.
// NOTE(review): presumably this keeps <windows.h> from pulling in the legacy
// winsock.h header - confirm.
#ifndef _WINSOCKAPI_
#define DID_DEFINE_WINSOCKAPI
#define _WINSOCKAPI_
#endif
// Keep <windows.h> from defining min()/max() macros.
#ifndef NOMINMAX
#define NOMINMAX
#endif
// Default the targeted Windows version when the build did not choose one.
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 /* Windows 7+ */
#endif
#include <windows.h>
#endif
// Undo the temporary _WINSOCKAPI_ definition if it was made above.
#ifdef DID_DEFINE_WINSOCKAPI
#undef _WINSOCKAPI_
#undef DID_DEFINE_WINSOCKAPI
#endif
//------------------------------------------------------------------------------
// Threads
// Adjusts the scheduling priority of the caller before the benchmarks run.
//
// Windows: requests THREAD_PRIORITY_ABOVE_NORMAL for the current thread.
// Other platforms: calls nice(2).
//
// Returns true when the underlying call did not report failure.
//
// NOTE(review): on POSIX a positive nice() increment lowers the scheduling
// priority, which is the opposite direction of the Windows branch - confirm
// whether that is intended.
static bool SetCurrentThreadPriority()
{
#ifdef _WIN32
    const BOOL changed = ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL);
    return changed != 0;
#else
    const int niceResult = nice(2);
    return niceResult != -1;
#endif
}
//------------------------------------------------------------------------------
// Timing
// Returns a timestamp in microseconds taken from a monotonic clock.
//
// The value is only meaningful as a difference between two calls: the epoch
// is whatever std::chrono::steady_clock uses on the platform.  A monotonic
// clock is used so that wall-clock adjustments made while a benchmark is
// running cannot produce negative (wrapped) intervals.
static uint64_t GetTimeUsec()
{
    const std::chrono::steady_clock::duration sinceEpoch =
        std::chrono::steady_clock::now().time_since_epoch();
    return (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch).count();
}
//------------------------------------------------------------------------------
// PCG PRNG
// From http://www.pcg-random.org/
// Small pseudo-random generator with 64 bits of state that produces 32-bit
// outputs.  Each step advances the state with a multiply-and-add and derives
// the output from the previous state with an xor-shift followed by a rotate.
class PCGRandom
{
public:
    // Re-initializes the generator.
    // y selects the stream (it is forced odd and stored in Inc).
    // x is mixed into the state after the first step.
    inline void Seed(uint64_t y, uint64_t x = 0)
    {
        Inc = (y << 1u) | 1u;
        State = 0;
        Next();
        State += x;
        Next();
    }

    // Advances the generator one step and returns the next 32-bit value.
    inline uint32_t Next()
    {
        const uint64_t previous = State;
        State = previous * UINT64_C(6364136223846793005) + Inc;

        // Output function: xor-shift the old state, then rotate right by its top 5 bits
        const uint32_t mixed = (uint32_t)(((previous >> 18) ^ previous) >> 27);
        const uint32_t rotation = (uint32_t)(previous >> 59);
        const uint32_t lowPart = mixed >> rotation;
        const uint32_t highPart = mixed << ((0u - rotation) & 31u);
        return lowPart | highPart;
    }

    uint64_t State = 0;
    uint64_t Inc = 0;
};
//------------------------------------------------------------------------------
// Self-Checking Packet
// Fills `packet` with `bytes` bytes of random data that CheckPacket() can
// later validate without any outside information.
//
// Two layouts are used:
//  * Short packets (fewer than 16 bytes, or every packet when
//    TEST_DATA_ALL_SAME is defined): one random byte repeated `bytes` times.
//    At least 2 bytes are expected (asserted in debug builds).
//  * Otherwise: bytes [0,4) hold a rolling checksum of the payload, bytes
//    [4,8) hold the packet length, and bytes [8,bytes) hold random payload.
//    Both header fields are stored in native byte order.
//
// prng   - source of random bytes; advanced by this call.
// packet - destination buffer with room for `bytes` bytes.
// bytes  - packet size; nothing is written when it is 0.
static void WriteRandomSelfCheckingPacket(PCGRandom& prng, void* packet, unsigned bytes)
{
    uint8_t* buffer = (uint8_t*)packet;

#ifdef TEST_DATA_ALL_SAME
    const bool fillWithSingleValue = true;
#else
    const bool fillWithSingleValue = (bytes < 16);
#endif

    if (fillWithSingleValue)
    {
        LEO_DEBUG_ASSERT(bytes >= 2);

        // Never touch a zero-length buffer
        if (bytes == 0)
            return;

        const uint8_t value = (uint8_t)prng.Next();
        for (unsigned i = 0; i < bytes; ++i)
        {
            buffer[i] = value;
        }
        return;
    }

    // memcpy is used for the header fields so that the stores are valid
    // regardless of the buffer's alignment and do not rely on type punning
    const uint32_t length = bytes;
    std::memcpy(buffer + 4, &length, sizeof(length));

    uint32_t crc = bytes;
    for (unsigned i = 8; i < bytes; ++i)
    {
        const uint8_t v = (uint8_t)prng.Next();
        buffer[i] = v;

        // Rotate left by 3 and add the new byte
        crc = (crc << 3) | (crc >> (32 - 3));
        crc += v;
    }

    std::memcpy(buffer, &crc, sizeof(crc));
}
// Validates a packet using the layouts described below and returns true when
// it is intact.
//
// Two layouts are recognized:
//  * Short packets (fewer than 16 bytes, or every packet when
//    TEST_DATA_ALL_SAME is defined): every byte must equal the first byte.
//    Packets shorter than 2 bytes are always rejected.
//  * Otherwise: bytes [4,8) must hold `bytes`, and bytes [0,4) must hold the
//    rolling checksum of bytes [8,bytes).  Both fields are read in native
//    byte order.
//
// packet - buffer holding `bytes` readable bytes.
// bytes  - packet size.
static bool CheckPacket(const void* packet, unsigned bytes)
{
    const uint8_t* buffer = (const uint8_t*)packet;

#ifdef TEST_DATA_ALL_SAME
    const bool expectSingleValue = true;
#else
    const bool expectSingleValue = (bytes < 16);
#endif

    if (expectSingleValue)
    {
        if (bytes < 2)
            return false;

        const uint8_t expected = buffer[0];
        for (unsigned i = 1; i < bytes; ++i)
        {
            if (buffer[i] != expected)
                return false;
        }
        return true;
    }

    // memcpy is used for the header fields so that the loads are valid
    // regardless of the buffer's alignment and do not rely on type punning
    uint32_t storedLength = 0;
    std::memcpy(&storedLength, buffer + 4, sizeof(storedLength));
    if (storedLength != bytes)
        return false;

    uint32_t crc = bytes;
    for (unsigned i = 8; i < bytes; ++i)
    {
        // Rotate left by 3 and add the next payload byte
        crc = (crc << 3) | (crc >> (32 - 3));
        crc += buffer[i];
    }

    uint32_t storedCRC = 0;
    std::memcpy(&storedCRC, buffer, sizeof(storedCRC));
    return storedCRC == crc;
}
//------------------------------------------------------------------------------
// FunctionTimer
// Accumulates the wall time spent between paired BeginCall()/EndCall()
// invocations and prints a summary.  Timestamps come from GetTimeUsec().
class FunctionTimer
{
public:
    // name - label printed by Print()
    FunctionTimer(const std::string& name)
        : FunctionName(name)
    {
    }

    // Marks the start of a timed region.  Must not be nested.
    void BeginCall()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = GetTimeUsec();
    }

    // Marks the end of the region started by BeginCall() and adds its
    // duration to the running totals.
    void EndCall()
    {
        LEO_DEBUG_ASSERT(t0 != 0);
        const uint64_t t1 = GetTimeUsec();
        ++Invokations;
        TotalUsec += t1 - t0;
        t0 = 0;
    }

    // Clears the accumulated statistics.  Must not be called mid-region.
    void Reset()
    {
        LEO_DEBUG_ASSERT(t0 == 0);
        t0 = 0;
        Invokations = 0;
        TotalUsec = 0;
    }

    // Prints calls per trial, average microseconds per call, and
    // microseconds per trial.
    // trials - number of trials the totals were accumulated over.
    // A zero divisor yields 0 for the affected figure instead of nan/inf.
    void Print(unsigned trials) const
    {
        const float callsPerTrial = (trials != 0) ? Invokations / (float)trials : 0.f;
        const double usecPerCall = (Invokations != 0) ? TotalUsec / (double)Invokations : 0.;
        const float usecPerTrial = (trials != 0) ? TotalUsec / (float)trials : 0.f;

        cout << FunctionName << " called " << callsPerTrial << " times per trial. " << usecPerCall << " usec avg. " << usecPerTrial << " usec for each of " << trials << " trials" << endl;
    }

    uint64_t t0 = 0;          // Start timestamp of the open region, or 0 when idle
    uint64_t Invokations = 0; // Number of completed BeginCall()/EndCall() pairs
    uint64_t TotalUsec = 0;   // Sum of all measured durations in microseconds
    std::string FunctionName; // Label used in the printed summary
};
//------------------------------------------------------------------------------
// Utility: Deck Shuffling function
/*
    Given a PRNG, generate a deck of cards in a random order.
    The deck will contain elements with values between 0 and count - 1.

    prng  - random source; advanced by this call.
    deck  - output array with room for `count` entries.
    count - number of entries to produce.  Nothing is written and the PRNG
            is left untouched when it is 0.  Values are stored as uint16_t,
            so they are truncated if count exceeds 65536.

    Each new value ii is written into a randomly chosen earlier slot jj, and
    the value previously in slot jj moves to slot ii.  One 32-bit PRNG output
    is split into four 8-bit picks (count <= 256) or two 16-bit picks.
    The unrolled layout determines exactly how PRNG outputs are consumed, so
    it must be preserved to keep results reproducible for a given seed.
*/
static void ShuffleDeck16(PCGRandom &prng, uint16_t * LEO_RESTRICT deck, uint32_t count)
{
    // An empty deck would otherwise write deck[0] out of bounds and make
    // (count - ii) wrap around, so the loops below would never terminate
    if (count == 0)
        return;

    deck[0] = 0;

    // If we can unroll 4 times,
    if (count <= 256)
    {
        for (uint32_t ii = 1;;)
        {
            uint32_t jj, rv = prng.Next();

            // 8-bit unroll
            switch (count - ii)
            {
            default:
                jj = (uint8_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                jj = (uint8_t)(rv >> 8) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                jj = (uint8_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                jj = (uint8_t)(rv >> 24) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                break;

            case 3:
                jj = (uint8_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                // fall through
            case 2:
                jj = (uint8_t)(rv >> 8) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                // fall through
            case 1:
                jj = (uint8_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                // fall through
            case 0:
                return;
            }
        }
    }
    else
    {
        // For each deck entry,
        for (uint32_t ii = 1;;)
        {
            uint32_t jj, rv = prng.Next();

            // 16-bit unroll
            switch (count - ii)
            {
            default:
                jj = (uint16_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                jj = (uint16_t)(rv >> 16) % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                ++ii;
                break;

            case 1:
                jj = (uint16_t)rv % ii;
                deck[ii] = deck[jj];
                deck[jj] = (uint16_t)ii;
                // fall through
            case 0:
                return;
            }
        }
    }
}
//------------------------------------------------------------------------------
// SIMD-Safe Aligned Memory Allocations
// Alignment, in bytes, used for every buffer handed to the codec.
static const unsigned kAlignmentBytes = LEO_ALIGN_BYTES;

// Rounds `offset` up to the next multiple of kAlignmentBytes.
// The mask arithmetic only works when kAlignmentBytes is a power of two.
LEO_FORCE_INLINE unsigned NextAlignedOffset(unsigned offset)
{
    const unsigned alignmentMask = kAlignmentBytes - 1;
    const unsigned bumped = offset + alignmentMask;
    return bumped & ~alignmentMask;
}
// Allocates `size` zero-initialized bytes aligned to kAlignmentBytes.
//
// The block is over-allocated by kAlignmentBytes; the returned pointer is
// moved forward to the next aligned address, and the byte just before it
// records the original misalignment so SIMDSafeFree() can recover the
// pointer that calloc() returned.
//
// Returns nullptr when the request cannot be satisfied.
// The result must be released with SIMDSafeFree().
static LEO_FORCE_INLINE uint8_t* SIMDSafeAllocate(size_t size)
{
    // Refuse requests where adding the alignment padding would wrap around
    // size_t and silently allocate a block that is too small
    if (size > (size_t)-1 - kAlignmentBytes)
        return nullptr;

    uint8_t* data = (uint8_t*)calloc(1, kAlignmentBytes + size);
    if (!data)
        return nullptr;

    // Always advances by 1..kAlignmentBytes bytes, so data[-1] is inside the block
    const unsigned offset = (unsigned)((uintptr_t)data % kAlignmentBytes);
    data += kAlignmentBytes - offset;
    data[-1] = (uint8_t)offset;
    return data;
}
// Releases a buffer obtained from SIMDSafeAllocate().  Null is ignored.
//
// The byte immediately before `ptr` holds the misalignment recorded at
// allocation time; it is used to step back to the address calloc() returned.
// If that byte is not a valid offset, the pointer is not freed.
static LEO_FORCE_INLINE void SIMDSafeFree(void* ptr)
{
    if (ptr == nullptr)
        return;

    uint8_t* aligned = (uint8_t*)ptr;
    const unsigned storedOffset = aligned[-1];

    if (storedOffset < kAlignmentBytes)
    {
        free(aligned - (kAlignmentBytes - storedOffset));
        return;
    }

    LEO_DEBUG_BREAK; // Should never happen
}
//------------------------------------------------------------------------------
// Tests
// Inputs for BasicTest().  The defaults depend on whether the 16-bit field
// (LEO_HAS_FF16) is compiled in.
struct TestParameters
{
#ifndef LEO_HAS_FF16
    unsigned original_count{100};  // must stay below 65536
    unsigned recovery_count{10};   // must stay below 65536 - original_count
#else
    unsigned original_count{1000}; // must stay below 65536
    unsigned recovery_count{100};  // must stay below 65536 - original_count
#endif
    unsigned buffer_bytes{64000};  // must be a multiple of 64 bytes
    unsigned loss_count{10};       // a fraction of original_count
    unsigned seed{0};              // seed handed to the PRNG
    bool multithreaded{true};      // selects LeopardFlags_Multithreaded
};
static void BasicTest(const TestParameters& params)
{
static const unsigned kTrials = 10;
std::vector<uint8_t*> original_data(params.original_count);
const unsigned encode_work_count = leo_encode_work_count(params.original_count, params.recovery_count);
const unsigned decode_work_count = leo_decode_work_count(params.original_count, params.recovery_count);
std::vector<uint8_t*> encode_work_data(encode_work_count);
std::vector<uint8_t*> decode_work_data(decode_work_count);
FunctionTimer t_mem_alloc("memory_allocation");
FunctionTimer t_leo_encode("leo_encode");
FunctionTimer t_leo_decode("leo_decode");
FunctionTimer t_mem_free("memory_free");
const uint64_t total_bytes = (uint64_t)params.buffer_bytes * params.original_count;
for (unsigned trial = 0; trial < kTrials; ++trial)
{
// Allocate memory:
t_mem_alloc.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
original_data[i] = SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
encode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
decode_work_data[i] = SIMDSafeAllocate(params.buffer_bytes);
t_mem_alloc.EndCall();
// Generate data:
PCGRandom prng;
prng.Seed(params.seed, trial);
for (unsigned i = 0; i < params.original_count; ++i)
WriteRandomSelfCheckingPacket(prng, original_data[i], params.buffer_bytes);
// Encode:
t_leo_encode.BeginCall();
LeopardResult encodeResult = leo_encode(
params.buffer_bytes,
params.original_count,
params.recovery_count,
encode_work_count,
(void**)&original_data[0],
(void**)&encode_work_data[0], // recovery data written here
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults
);
t_leo_encode.EndCall();
if (encodeResult != Leopard_Success)
{
cout << "Error: Leopard encode failed with result=" << encodeResult << endl;
LEO_DEBUG_BREAK;
return;
}
// Lose random original data:
std::vector<uint16_t> original_losses(params.original_count);
ShuffleDeck16(prng, &original_losses[0], params.original_count);
for (unsigned i = 0, count = params.loss_count; i < count; ++i)
{
const unsigned loss_index = original_losses[i];
SIMDSafeFree(original_data[loss_index]);
original_data[loss_index] = nullptr;
}
// Lose random recovery data:
const unsigned recovery_loss_count = params.recovery_count - params.loss_count;
std::vector<uint16_t> recovery_losses(params.recovery_count);
ShuffleDeck16(prng, &recovery_losses[0], params.recovery_count);
for (unsigned i = 0, count = recovery_loss_count; i < count; ++i)
{
const unsigned loss_index = recovery_losses[i];
SIMDSafeFree(encode_work_data[loss_index]);
encode_work_data[loss_index] = nullptr;
}
// Decode:
t_leo_decode.BeginCall();
LeopardResult decodeResult = leo_decode(
params.buffer_bytes,
params.original_count,
params.recovery_count,
decode_work_count,
(void**)&original_data[0],
(void**)&encode_work_data[0],
(void**)&decode_work_data[0],
params.multithreaded ? LeopardFlags_Multithreaded : LeopardFlags_Defaults);
t_leo_decode.EndCall();
if (decodeResult != Leopard_Success)
{
cout << "Error: Leopard decode failed with result=" << decodeResult << endl;
LEO_DEBUG_BREAK;
return;
}
#if 0
for (unsigned i = 0; i < params.original_count; ++i)
{
if (!original_data[i])
{
cout << "Checking " << i << endl;
if (!CheckPacket(decode_work_data[i], params.buffer_bytes))
{
cout << "Error: Data was corrupted" << endl;
LEO_DEBUG_BREAK;
return;
}
}
}
#endif
// Free memory:
t_mem_free.BeginCall();
for (unsigned i = 0, count = params.original_count; i < count; ++i)
SIMDSafeFree(original_data[i]);
for (unsigned i = 0, count = encode_work_count; i < count; ++i)
SIMDSafeFree(encode_work_data[i]);
for (unsigned i = 0, count = decode_work_count; i < count; ++i)
SIMDSafeFree(decode_work_data[i]);
t_mem_free.EndCall();
}
t_mem_alloc.Print(kTrials);
t_leo_encode.Print(kTrials);
t_leo_decode.Print(kTrials);
t_mem_free.Print(kTrials);
float encode_input_MBPS = total_bytes * kTrials / (float)(t_leo_encode.TotalUsec);
float encode_output_MBPS = params.buffer_bytes * (uint64_t)params.recovery_count * kTrials / (float)(t_leo_encode.TotalUsec);
float decode_input_MBPS = total_bytes * kTrials / (float)(t_leo_decode.TotalUsec);
float decode_output_MBPS = params.buffer_bytes * (uint64_t)params.loss_count * kTrials / (float)(t_leo_decode.TotalUsec);
cout << "Leopard Encoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << encode_input_MBPS << " MB/s, Output=" << encode_output_MBPS << " MB/s" << endl;
cout << "Leopard Decoder(" << total_bytes / 1000000.f << " MB in " << params.original_count << " pieces, " << params.loss_count << " losses): Input=" << decode_input_MBPS << " MB/s, Output=" << decode_output_MBPS << " MB/s" << endl << endl;
}
//------------------------------------------------------------------------------
// Parallel XOR Benchmark
// Demonstrate about 10% performance boost by doing parallel rows for XORs
void ParallelXORBenchmark()
{
FunctionTimer t_1("xor_mem");
FunctionTimer t_4("xor_mem4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 1000;
for (unsigned i = 0; i < iteration_count; ++i)
{
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::xor_mem(
buffers_x[j], buffers_y[j],
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::xor_mem4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
//------------------------------------------------------------------------------
// Parallel Butterfly8 Benchmark
#ifdef LEO_HAS_FF8
// Demonstrate performance boost by doing parallel rows for Butterfly8s
void ParallelButterfly8Benchmark()
{
FunctionTimer t_1("8-bit fft_butterfly");
FunctionTimer t_4("8-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 1000;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff8::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff8::ffe_t m = (leopard::ff8::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff8::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF8
//------------------------------------------------------------------------------
// Parallel Butterfly16 Benchmark
#ifdef LEO_HAS_FF16
// Demonstrate performance boost by doing parallel rows for Butterfly16s
void ParallelButterfly16Benchmark()
{
FunctionTimer t_1("16-bit fft_butterfly");
FunctionTimer t_4("16-bit fft_butterfly4");
static const unsigned buffer_bytes = 4096;
static const unsigned buffer_count = 1024;
uint8_t* buffers_x[buffer_count] = {};
uint8_t* buffers_y[buffer_count] = {};
for (unsigned i = 0; i < buffer_count; ++i)
{
buffers_x[i] = SIMDSafeAllocate(buffer_bytes);
buffers_y[i] = SIMDSafeAllocate(buffer_bytes);
}
static const unsigned iteration_count = 100;
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_1.BeginCall();
for (unsigned j = 0; j < buffer_count; ++j)
leopard::ff16::fft_butterfly(
buffers_x[j], buffers_y[j],
m,
buffer_bytes);
t_1.EndCall();
}
for (unsigned i = 0; i < iteration_count; ++i)
{
leopard::ff16::ffe_t m = (leopard::ff16::ffe_t)(i + 2);
t_4.BeginCall();
for (unsigned j = 0; j < buffer_count; j += 4)
leopard::ff16::fft_butterfly4(
buffers_x[j], buffers_y[j],
buffers_x[j + 1], buffers_y[j + 1],
buffers_x[j + 2], buffers_y[j + 2],
buffers_x[j + 3], buffers_y[j + 3],
m,
buffer_bytes);
t_4.EndCall();
}
for (unsigned i = 0; i < buffer_count; ++i)
{
SIMDSafeFree(buffers_x[i]);
SIMDSafeFree(buffers_y[i]);
}
t_1.Print(iteration_count);
t_4.Print(iteration_count);
}
#endif // LEO_HAS_FF16
//------------------------------------------------------------------------------
// Entrypoint
int main(int argc, char **argv)
{
SetCurrentThreadPriority();
FunctionTimer t_leo_init("leo_init");
t_leo_init.BeginCall();
if (0 != leo_init())
{
cout << "Failed to initialize" << endl;
return -1;
}
t_leo_init.EndCall();
t_leo_init.Print(1);
ParallelXORBenchmark();
#ifdef LEO_HAS_FF8
ParallelButterfly8Benchmark();
#endif // LEO_HAS_FF8
#ifdef LEO_HAS_FF16
ParallelButterfly16Benchmark();
#endif // LEO_HAS_FF16
TestParameters params;
if (argc >= 2)
params.original_count = atoi(argv[1]);
if (argc >= 3)
params.recovery_count = atoi(argv[2]);
if (argc >= 4)
params.buffer_bytes = atoi(argv[3]);
if (argc >= 5)
params.loss_count = atoi(argv[4]);
if (argc >= 6)
params.multithreaded = (atoi(argv[5]) != 0);
cout << "Parameters: [original count=" << params.original_count << "] [recovery count=" << params.recovery_count << "] [buffer bytes=" << params.buffer_bytes << "] [loss count=" << params.loss_count << "] [random seed=" << params.seed << "]" << endl;
BasicTest(params);
getchar();
return 0;
}