Thanks for using Compiler Explorer
HLSL
CMake
hlsl source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
NSC (release)
Options
Source code
#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl"
#include "nbl/builtin/hlsl/concepts.hlsl"

namespace nbl
{
namespace hlsl
{

// Concept `ScalarizedLoopFunctor<T>`: `T` must
//  - declare a nested type `load_type`,
//  - provide `load(uint32_t)` whose return type is exactly `load_type`,
//  - be callable as `f(load_type, uint16_t)` with return type `void`.
#define NBL_CONCEPT_NAME ScalarizedLoopFunctor
#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
#define NBL_CONCEPT_PARAM_0 (f, T)
#define NBL_CONCEPT_PARAM_1 (ix, uint32_t)
#define NBL_CONCEPT_PARAM_2 (inv, uint16_t)
#define NBL_CONCEPT_PARAM_3 (data, typename T::load_type)
NBL_CONCEPT_BEGIN(4)
#define f NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
#define ix NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
#define inv NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
#define data NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
NBL_CONCEPT_END(
    ((NBL_CONCEPT_REQ_TYPE)(T::load_type))
    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((f.load(ix)), ::nbl::hlsl::is_same_v, typename T::load_type))
    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((f(data,inv)), ::nbl::hlsl::is_same_v, void))
);
#undef f
#undef ix
#undef inv
#undef data
#include <nbl/builtin/hlsl/concepts/__end.hlsl>

// Walks `count` items in batches of `SubgroupSize`: each invocation loads one item of the
// batch, then the functor is invoked once per item slot `j` of the batch so it can fetch
// slot `j`'s item from the invocation that loaded it (e.g. via a subgroup broadcast).
//
// Template parameters:
//  - F                : functor satisfying `ScalarizedLoopFunctor`.
//  - SubgroupSizeLog2 : log2 of the batch width, i.e. pass 5 for 32 and 6 for 64.
//
// NOTE(review): assumes every invocation of the subgroup calls `__call` with the same
// `count` and from uniform control flow, and that `SubgroupSize` matches the real
// subgroup width - confirm against the callers / pipeline setup.
template<typename F, uint16_t SubgroupSizeLog2 NBL_PRIMARY_REQUIRES(ScalarizedLoopFunctor<F>)
struct ScalarizedLoop
{
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(1)<<SubgroupSizeLog2;

    // f     : functor, taken by reference so state it accumulates is visible to the caller.
    // count : number of items to visit; item indices passed to `f.load` are [0,count).
    static void __call(NBL_REF_ARG(F) f, int32_t count)
    {
        const int32_t subgroupInvocation = spirv::SubgroupLocalInvocationId;
        typename F::load_type data;

        // Full batches. The strict `<` leaves between 1 and `SubgroupSize` items
        // (or 0 when `count<=0`) for the tail below.
        int32_t b;
        for (b=0; b<count-SubgroupSize; b+=SubgroupSize)
        {
            // Load 32/64 unique items at once in outer loop. 32x/64x reduction in mem loads.
            data = f.load(b+subgroupInvocation);
            [[unroll]]
            for (uint16_t j=0; j<SubgroupSize; j++)
                f(data,j);
        }

        // Tail: only the first `finalIters` invocations have an item left to load.
        const uint16_t finalIters = _static_cast<uint16_t>(count-b);
        if (subgroupInvocation<finalIters)
            data = f.load(b+subgroupInvocation);
        // The functor must still run on EVERY invocation, not just the ones that loaded:
        // keeping this loop inside the branch above made invocations >= `finalIters`
        // skip the last items entirely. Invocations that did not load pass a stale or
        // unspecified `data`, but only slots `j<finalIters` are ever used as a source.
        for (uint16_t j=0; j<finalIters; j++)
            f(data,j);
    }
};

}
}

RWTexture2D<float4> output;
Texture2D<uint2> lightStartCounts;

struct LightData
{
    // In real impl light data would be position, radius, color, etc.
    float32_t4 dummy;
};
StructuredBuffer<LightData> lightDatas;

// Per workgroup: reads a (start,count) pair from `lightStartCounts`, sums `dummy` over
// that range of `lightDatas` using `ScalarizedLoop`, and writes the sum to `output`.
[numthreads(8, 8, 1)]
void main()
{
    using namespace nbl::hlsl;

    // x = index of the first light in `lightDatas`, y = number of lights
    uint2 lightStartCount = lightStartCounts[glsl::gl_WorkGroupID().xy];

    struct Functor
    {
        using load_type = LightData;

        load_type load(const uint32_t ix)
        {
            return lightDatas[lightDataBegin+ix];
        }

        void operator()(const load_type l, const uint16_t subgroupInvocation)
        {
            // unfortunately SPIR-V does not extend `OpGroupNonUniformBroadcast` to work on `OpTypeStruct`
            lightAccumulator += glsl::subgroupBroadcast(l.dummy,subgroupInvocation);
        }

        float32_t4 lightAccumulator;
        uint32_t lightDataBegin;
    };

    Functor f;
    f.lightAccumulator = 0.f;
    f.lightDataBegin = lightStartCount.x;

    // Second argument is log2 of the subgroup size: 5 -> 32 invocations. Passing the size
    // itself (32) would make `SubgroupSize` the result of `uint16_t(1)<<32`.
    // NOTE(review): assumes a 32-wide subgroup; use 6 on 64-wide hardware - confirm target.
    ScalarizedLoop<Functor,5>::__call(f,lightStartCount.y);

    output[glsl::gl_GlobalInvocationID().xy] = f.lightAccumulator;
}
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree