Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Property pool hlsl #94

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
Draft
24 changes: 24 additions & 0 deletions 66_PropertyPools/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Build script for the 66_PropertyPools example.
# Pull in the shared `common` module; RES is set by include() to the resolved file path or NOTFOUND.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Project helper macro (defined outside this file); presumably creates the executable target and
# sets EXECUTABLE_NAME, which is consumed below -- TODO confirm against common.cmake.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optionally embed everything under app_resources/ into the executable as builtin resources.
if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute locations of the resource search root and of the generated source/header output directories.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Collect every file below the resource directory (paths relative to it) and register each one for embedding.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Create the builtin-resource target; "nbl::this_example::builtin" is passed as the namespace argument.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Link the generated resource target into the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
20 changes: 20 additions & 0 deletions 66_PropertyPools/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "nbl/builtin/hlsl/cpp_compat.hlsl"

// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x
// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954
// Element type the compute shader loads from the input buffer (a 3-component float vector).
typedef nbl::hlsl::float32_t3 input_t;
Comment on lines +1 to +5

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove files you're not using

// Element type of the output: a scalar float.
typedef nbl::hlsl::float32_t output_t;

// Upper bound on the element count (2^20); not referenced by the other sources visible in this change.
NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxPossibleElementCount = 1 << 20;

// Push constant block consumed by shader.comp.hlsl. This header is also included from main.cpp,
// so the member order and types must not change without updating both sides.
struct PushConstantData
{
// Device address of the first input_t element (read with vk::RawBufferLoad in the shader).
uint64_t inputAddress;
// Device address of the first float32_t result (written with vk::RawBufferStore in the shader).
uint64_t outputAddress;
// Number of elements; invocations with an index at or above this value exit early.
uint32_t dataElementCount;
};

// Number of invocations per workgroup along X; used in the shader's [numthreads] attribute.
NBL_CONSTEXPR uint32_t WorkgroupSize = 256;

// Yes we do have our own re-creation of C++'s STL in HLSL2021 !
#include "nbl/builtin/hlsl/limits.hlsl"
33 changes: 33 additions & 0 deletions 66_PropertyPools/app_resources/shader.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include "common.hlsl"

// just a small test
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"

// Push constants carrying the input/output device addresses and the element count (see PushConstantData).
[[vk::push_constant]] PushConstantData pushConstants;

// does absolutely nothing, a later example will show how it gets used
// The only purpose here is to instantiate a template whose default argument names the JIT-provided
// device capability traits type from the header included above.
template<typename capability_traits=nbl::hlsl::jit::device_capabilities_traits>
void dummyTraitTest() {}

// Compute entry point, one invocation per input element.
// Each invocation loads its own element, then picks `OthersToTest` pseudo-random elements from the same
// input buffer and stores the smallest distance found between itself and any of them.
[numthreads(WorkgroupSize,1,1)]
void main(uint32_t3 ID : SV_DispatchThreadID)
{
	dummyTraitTest();

	// invocations past the end of the data have nothing to do
	const uint32_t elementCount = pushConstants.dataElementCount;
	const uint32_t invocationIx = ID.x;
	if (invocationIx>=elementCount)
		return;

	// our own element, fetched through the buffer device address
	const uint64_t selfAddress = pushConstants.inputAddress+sizeof(input_t)*invocationIx;
	const input_t self = vk::RawBufferLoad<input_t>(selfAddress);

	// per-invocation PRNG state seeded from the element count and the invocation index
	nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(elementCount,invocationIx)^0xdeadbeefu);

	// running minimum, starts at the largest representable float
	float32_t closest = nbl::hlsl::numeric_limits<float32_t>::max;
	const static uint32_t OthersToTest = 15;
	[[unroll(OthersToTest)]]
	for (uint32_t n=0; n<OthersToTest; n++)
	{
		const uint32_t otherIx = rng() % elementCount;
		const uint64_t otherAddress = pushConstants.inputAddress+sizeof(input_t)*otherIx;
		const input_t other = vk::RawBufferLoad<input_t>(otherAddress);
		const float32_t dist = length(other-self);
		closest = min(dist,closest);
	}

	const uint64_t resultAddress = pushConstants.outputAddress+sizeof(float32_t)*invocationIx;
	vk::RawBufferStore<float32_t>(resultAddress,closest);
}
28 changes: 28 additions & 0 deletions 66_PropertyPools/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
205 changes: 205 additions & 0 deletions 66_PropertyPools/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h


#include "nbl/video/surface/CSurfaceVulkan.h"
#include "nbl/video/alloc/SubAllocatedDescriptorSet.h"

#include "../common/BasicMultiQueueApplication.hpp"
#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"

using namespace nbl;
using namespace core;
using namespace system;
using namespace ui;
using namespace asset;
using namespace video;

#include "app_resources/common.hlsl"
#include "nbl/builtin/hlsl/bit.hlsl"

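// In this application we exercise CPropertyPoolHandler: data is uploaded with inline buffer updates, copied between two buffers via a transfer request, and the result is read back and verified on the host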
class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
{
		// Example application exercising CPropertyPoolHandler.
		// Every work-loop iteration records a command buffer which:
		//  1. uploads `m_data` into `m_transferSrcBuffer` with inline buffer updates,
		//  2. asks the property pool handler to transfer it into `m_transferDstBuffer`,
		// then submits it, waits on a timeline semaphore and verifies the destination contents on the host.
		using device_base_t = examples::MonoDeviceApplication;
		using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;

		smart_refctd_ptr<CPropertyPoolHandler> m_propertyPoolHandler;
		// Scratch and address buffers handed to CPropertyPoolHandler::transferProperties
		smart_refctd_ptr<IGPUBuffer> m_scratchBuffer;
		smart_refctd_ptr<IGPUBuffer> m_addressBuffer;
		// Source and destination of the transfer request
		smart_refctd_ptr<IGPUBuffer> m_transferSrcBuffer;
		smart_refctd_ptr<IGPUBuffer> m_transferDstBuffer;
		// Host copy of the values that get uploaded and later compared against the readback
		std::vector<uint16_t> m_data;

		// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
		// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
		smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;

		// A single timeline semaphore is signalled with the iteration number by every submit
		smart_refctd_ptr<ISemaphore> m_timeline;
		uint64_t m_iteration = 0;
		constexpr static inline uint64_t MaxIterations = 200;

		static constexpr uint64_t TransfersAmount = 1024;
		static constexpr uint64_t MaxValuesPerTransfer = 512;

		// Number of 16-bit values uploaded each iteration: every value representable in a uint16_t.
		static constexpr uint32_t ValueCount = uint32_t(1) << 16;
		// The upload is done in 4-byte aligned chunks, so the byte size of the data must be a multiple of 4,
		// and the data has to fit in the source/destination buffers.
		static_assert((ValueCount * sizeof(uint16_t)) % 4 == 0, "uploaded data must be a whole number of DWORDs");
		static_assert(ValueCount <= TransfersAmount * MaxValuesPerTransfer, "uploaded data must fit in the transfer buffers");

		// Number of leading elements compared (and printed) during the host-side verification
		static constexpr uint32_t ElementsToVerify = 1024;
		static_assert(ElementsToVerify <= ValueCount, "cannot verify more elements than were uploaded");

	public:
		// Yay thanks to multiple inheritance we cannot forward ctors anymore
		PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
			system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}

		// Creates the property pool handler, the four buffers, the host data, the command pool cache and the timeline semaphore.
		// Returns false if a base class fails to initialize or any of the objects could not be created.
		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
		{
			// Remember to call the base class initialization!
			// Both bases need the system: the first one gets a copy so the second one does not receive a moved-from pointer.
			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
				return false;
			if (!asset_base_t::onAppInitialized(std::move(system)))
				return false;

			m_propertyPoolHandler = core::make_smart_refctd_ptr<CPropertyPoolHandler>(core::smart_refctd_ptr(m_device));

			// Creates a buffer of at least `size` bytes (rounded up to a multiple of 4) with `flags` plus the usages
			// every buffer in this example needs, binds device-addressable memory to it and names it.
			// `hostVisible` restricts the memory types to the down-streaming ones so the host can map the buffer.
			// Returns nullptr when the buffer or its memory could not be created.
			auto createBuffer = [&](uint64_t size, core::bitflag<asset::IBuffer::E_USAGE_FLAGS> flags, const char* name, bool hostVisible) -> smart_refctd_ptr<IGPUBuffer>
			{
				video::IGPUBuffer::SCreationParams creationParams;
				creationParams.size = ((size + 3) / 4) * 4; // Align
				creationParams.usage = flags
					| asset::IBuffer::EUF_STORAGE_BUFFER_BIT
					| asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT
					| asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF;

				auto buffer = m_device->createBuffer(std::move(creationParams));
				if (!buffer)
					return nullptr;

				nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs();
				if (hostVisible)
					reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
				m_device->allocate(reqs, buffer.get(), nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
				// a buffer without memory bound to it is unusable
				if (!buffer->getBoundMemory().memory)
					return nullptr;
				buffer->setObjectDebugName(name);

				return buffer;
			};

			m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", true);
			m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false);
			m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false);
			m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true);
			if (!m_scratchBuffer || !m_addressBuffer || !m_transferSrcBuffer || !m_transferDstBuffer)
				return false;

			// Fill the host data with 0,1,2,...; a 32-bit counter is needed because a uint16_t one can never reach ValueCount.
			m_data.clear();
			m_data.reserve(ValueCount);
			for (uint32_t i = 0; i < ValueCount; i++)
				m_data.push_back(static_cast<uint16_t>(i));

			// Upper bound on the number of command pools that can be in flight at the same time.
			constexpr auto MaxConcurrency = 64;

			// Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag
			m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);
			if (!m_poolCache)
				return false;

			// In contrast to fences, we just need one semaphore to rule all submits
			m_timeline = m_device->createSemaphore(m_iteration);
			if (!m_timeline)
				return false;

			return true;
		}

		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
		bool keepRunning() override { return m_iteration<MaxIterations; }

		// One iteration: record upload + transfer, submit, wait for completion, verify the destination buffer on the host.
		void workLoopBody() override
		{
			IQueue* const queue = getComputeQueue();

			// Obtain our command pool once one gets recycled
			uint32_t poolIx;
			do
			{
				poolIx = m_poolCache->acquirePool();
			} while (poolIx==ICommandPoolCache::invalid_index);

			smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
			{
				m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
				// the command buffer is re-recorded every iteration, hence one time submit
				cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);

				// COMMAND RECORDING
				// Upload the host data in chunks of at most `maxUpload` bytes; the chunk size is clamped to the
				// bytes remaining so we never read past the end of `m_data`.
				const uint32_t dataSize = static_cast<uint32_t>(sizeof(uint16_t) * m_data.size());
				assert(dataSize % 4 == 0);
				constexpr uint32_t maxUpload = 65536;
				for (uint32_t offset = 0; offset < dataSize; offset += maxUpload)
				{
					const uint32_t remaining = dataSize - offset;
					const uint32_t chunkSize = remaining < maxUpload ? remaining : maxUpload;
					cmdbuf->updateBuffer({ offset, chunkSize, core::smart_refctd_ptr<video::IGPUBuffer>(m_transferSrcBuffer) }, &m_data[offset / sizeof(uint16_t)]);
				}

				// Single request copying the whole uploaded range from the source to the destination buffer.
				// NOTE(review): elementSize/elementCount look like they are expressed in 32-bit words -- confirm against CPropertyPoolHandler.
				CPropertyPoolHandler::TransferRequest transferRequest;
				transferRequest.memblock = asset::SBufferRange<video::IGPUBuffer> { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr<video::IGPUBuffer>(m_transferSrcBuffer) };
				transferRequest.elementSize = 1;
				transferRequest.elementCount = (m_data.size() * sizeof(uint16_t)) / sizeof(uint32_t);
				transferRequest.buffer = asset::SBufferBinding<video::IGPUBuffer> { 0, core::smart_refctd_ptr<video::IGPUBuffer>(m_transferDstBuffer) };
				transferRequest.srcAddressesOffset = IPropertyPool::invalid;
				transferRequest.dstAddressesOffset = IPropertyPool::invalid;

				m_propertyPoolHandler->transferProperties(cmdbuf.get(),
					asset::SBufferBinding<video::IGPUBuffer>{0, core::smart_refctd_ptr(m_scratchBuffer)},
					asset::SBufferBinding<video::IGPUBuffer>{0, core::smart_refctd_ptr(m_addressBuffer)},
					&transferRequest, &transferRequest + 1,
					m_logger.get(), 0, m_data.size()
				);

				auto result = cmdbuf->end();
				assert(result);
			}

			// The submit of this iteration signals the timeline with the incremented iteration number
			m_iteration++;
			const ISemaphore::SWaitInfo futureWait = {.semaphore=m_timeline.get(),.value=m_iteration};
			{
				const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
				{
					.cmdbuf = cmdbuf.get()
				};
				const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
				{
					.semaphore = m_timeline.get(),
					.value = m_iteration,
					.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
				};
				// No semaphore wait is needed: we block on the host until this submit has finished before starting the next one.
				const IQueue::SSubmitInfo submitInfo = {
					.waitSemaphores = {},
					.commandBuffers = {&cmdbufInfo,1},
					.signalSemaphores = {&signalInfo,1}
				};

				queue->startCapture();
				auto statusCode = queue->submit({ &submitInfo,1 });
				queue->endCapture();
				assert(statusCode == IQueue::RESULT::SUCCESS);
			}

			// Hand the command pool back to the cache, to be reset and reused once the submit has signalled.
			// Without this the cache runs dry after `MaxConcurrency` iterations and the acquire loop above spins forever.
			m_poolCache->releasePool(futureWait,poolIx);

			{
				ISemaphore::SWaitInfo infos[1] = {futureWait};
				m_device->blockForSemaphores(infos);

				// Readback: map the destination buffer's memory and check that the copy went through as expected
				auto mem = m_transferDstBuffer->getBoundMemory();
				void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() });
				assert(ptr);

				const uint16_t* const readBack = reinterpret_cast<const uint16_t*>(ptr);
				for (uint32_t i = 0; i < ElementsToVerify; i++)
				{
					const uint16_t actual = readBack[i];
					const uint16_t expected = m_data[i];
					std::printf("%i, ", actual);
					assert(expected == actual);
				}
				std::printf("\n");
				bool success = mem.memory->unmap();
				assert(success);
			}
		}
};

NBL_MAIN_FUNC(PropertyPoolsApp)
50 changes: 50 additions & 0 deletions 66_PropertyPools/pipeline.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
{
public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
devshgraphicsprogramming marked this conversation as resolved.
Show resolved Hide resolved
{
super(_agent, _info)
}

// Preparation stage of the builder: nothing to do for this example, always reports success.
@Override
public boolean prepare(Map axisMapping)
{
return true
}

// Build stage: runs `cmake --build` for this example's target using the build directory and
// configuration selected by the "BUILD_TYPE" and "CONFIGURATION" entries of the axis mapping.
// Always returns true; the outcome of the executed command is not inspected here.
@Override
public boolean build(Map axisMapping)
{
	// axis values picked for this run
	IBuilder.BUILD_TYPE selectedBuildType = axisMapping.get("BUILD_TYPE")
	IBuilder.CONFIGURATION selectedConfiguration = axisMapping.get("CONFIGURATION")

	// names understood by the build tree layout / CMake
	def nameOfConfig = getNameOfConfig(selectedConfiguration)
	def nameOfBuildDirectory = getNameOfBuildDirectory(selectedBuildType)

	def buildCommand = "cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v"
	agent.execute(buildCommand)

	return true
}

// Test stage of the builder: no tests are run for this example, always reports success.
@Override
public boolean test(Map axisMapping)
{
return true
}

// Install stage of the builder: nothing is installed for this example, always reports success.
@Override
public boolean install(Map axisMapping)
{
return true
}
}

// Script-level factory: constructs the builder defined in this file from the given agent and
// build info. The script itself is returned by the trailing `return this`, so a loader can call create().
def create(Agent _agent, _info)
{
return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
}

return this
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL)
add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
endif()