/* Copyright (C) 2024 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* 0 A.D. is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with 0 A.D. If not, see <http://www.gnu.org/licenses/>.
*/
#include "precompiled.h"
#include "renderer/GPUSkinnedModelRenderer.h"
#include "graphics/Color.h"
#include "graphics/LightEnv.h"
#include "graphics/Model.h"
#include "graphics/ModelDef.h"
#include "graphics/ShaderManager.h"
#include "maths/MathUtil.h"
#include "maths/Vector3D.h"
#include "maths/Vector4D.h"
#include "ps/CLogger.h"
#include "ps/containers/StaticVector.h"
#include "ps/CStrInternStatic.h"
#include "renderer/Renderer.h"
#include "renderer/RenderModifiers.h"
#include "renderer/VertexArray.h"
#include "third_party/mikktspace/weldmesh.h"
namespace
{
// We use the following layouts for input and output vertices:
//
// Created when a model is loaded and read by a compute shader on each frame
// where the model is visible.
// InputVertex(size/stride=64):
// vec4/CVector4D tangent (offset=0)
// vec3/CVector3D normal (offset=16)
// vec3/CVector3D position (offset=32)
//
// Written by a compute shader on each frame where the model is visible.
// OutputPosition(size/stride=16):
// vec3/CVector3D position (offset=0)
//
// Written by a compute shader on each frame where the model is visible.
// Each component is a 16-bit float to consume less memory.
// OutputNormalTangent(size/stride=16):
// 16-bit vec3 normal (offset=0)
// 16-bit vec4 tangent (offset=8)
// Byte layout constants for the skinning input and output vertex streams.

// Input vertices: minimum alignment requested for every attribute, followed
// by the byte offset each attribute is expected to end up at.
constexpr uint32_t INPUT_VERTEX_ATTRIBUTE_ALIGNMENT = 16;
constexpr uint32_t INPUT_VERTEX_TANGENT_OFFSET = 0;
constexpr uint32_t INPUT_VERTEX_NORMAL_OFFSET = 16;
constexpr uint32_t INPUT_VERTEX_POSITION_OFFSET = 32;

// Output streams: one holds positions only, the other holds the normal and
// the tangent packed together.
constexpr uint32_t OUTPUT_POSITION_STRIDE = 16;
constexpr uint32_t OUTPUT_NORMAL_TANGENT_STRIDE = 16;
constexpr uint32_t OUTPUT_NORMAL_OFFSET = 0;
constexpr uint32_t OUTPUT_TANGENT_OFFSET = 8;
// Render data attached to a model definition (CModelDef) and therefore shared
// by every model that uses that definition: the static input vertices, the
// per-vertex blend data, the UV sets, the index buffer and the vertex input
// layout used when drawing.
class ModelDefRData : public CModelDefRPrivate
{
public:
// Static per-CModelDef vertex array. Its buffer is bound as the input vertex
// storage buffer when the skinning compute shader is dispatched.
VertexArray m_Array;
// Position, normal and tangent attributes stored in m_Array; they are static.
VertexArray::Attribute m_Position;
VertexArray::Attribute m_Normal;
VertexArray::Attribute m_Tangent;
// Per-vertex joint indices and weights. Its buffer is bound as the skin data
// storage buffer when the skinning compute shader is dispatched.
VertexArray m_BlendArray;
VertexArray::Attribute m_BlendJoints;
VertexArray::Attribute m_BlendWeights;
// UV coordinates, bound as a regular vertex buffer when drawing.
VertexArray m_UVArray;
// The number of UVs is determined by the model
std::vector m_UVs;
// Vertex input layout used to draw the skinned output together with the UVs.
Renderer::Backend::IVertexInputLayout* m_VertexInputLayout{nullptr};
// Indices are the same for all models, so share them
VertexIndexArray m_IndexArray;
// Builds and uploads all of the above from the given model definition.
// Explicit so that a CModelDefPtr is never converted implicitly.
explicit ModelDefRData(const CModelDefPtr& modelDef);
};
// Builds the GPU-side data shared by all models using modelDef: generates
// tangents, re-welds duplicated vertices, fills and uploads the input vertex,
// blend (joint/weight), UV and index arrays, and finally requests the vertex
// input layout used to draw the skinned output.
//
// modelDef: the model definition to build render data for. Its bone count
// (plus the root bone) must not exceed 192; otherwise an error is logged and
// the ENSURE below fires.
ModelDefRData::ModelDefRData(const CModelDefPtr& modelDef)
// Initializers are listed in member declaration order, which is the order in
// which they are executed regardless of how they are written here.
: m_Array(Renderer::Backend::IBuffer::Type::VERTEX,
Renderer::Backend::IBuffer::Usage::TRANSFER_DST |
Renderer::Backend::IBuffer::Usage::STORAGE),
m_BlendArray(Renderer::Backend::IBuffer::Type::VERTEX,
Renderer::Backend::IBuffer::Usage::TRANSFER_DST |
Renderer::Backend::IBuffer::Usage::STORAGE),
m_UVArray(Renderer::Backend::IBuffer::Type::VERTEX,
Renderer::Backend::IBuffer::Usage::TRANSFER_DST),
m_IndexArray(Renderer::Backend::IBuffer::Usage::TRANSFER_DST)
{
// Describe the input vertex attributes. The offsets that Layout() assigns to
// them are verified against the INPUT_VERTEX_*_OFFSET constants further down.
m_Position.format = Renderer::Backend::Format::R32G32B32_SFLOAT;
m_Array.AddAttribute(&m_Position);
m_Normal.format = Renderer::Backend::Format::R32G32B32_SFLOAT;
m_Array.AddAttribute(&m_Normal);
// TODO: switch to 16-bits tangents when possible.
m_Tangent.format = Renderer::Backend::Format::R32G32B32A32_SFLOAT;
m_Array.AddAttribute(&m_Tangent);
// One two-component float attribute per UV set. m_UVs is sized up front so
// that the element addresses handed to AddAttribute stay valid.
m_UVs.resize(modelDef->GetNumUVsPerVertex());
for (uint32_t index{0}; index < modelDef->GetNumUVsPerVertex(); ++index)
{
m_UVs[index].format = Renderer::Backend::Format::R32G32_SFLOAT;
m_UVArray.AddAttribute(&m_UVs[index]);
}
// We can't use a lot of bones because it costs uniform memory. Recommended
// number of bones per model is 32.
// Add 1 to NumBones because of the special 'root' bone.
if (modelDef->GetNumBones() + 1 > 192)
LOGERROR("Model '%s' has too many bones %zu/192", modelDef->GetName().string8().c_str(), modelDef->GetNumBones() + 1);
ENSURE(modelDef->GetNumBones() + 1 <= 192);
// Four joint indices and four weights per vertex, one byte each.
m_BlendJoints.format = Renderer::Backend::Format::R8G8B8A8_UINT;
m_BlendArray.AddAttribute(&m_BlendJoints);
m_BlendWeights.format = Renderer::Backend::Format::R8G8B8A8_UNORM;
m_BlendArray.AddAttribute(&m_BlendWeights);
// Generate tangents for the geometry:
// floats per vertex; position + normal + tangent + UV*sets + GPUskinning (joint index and weight)
const uint32_t numberOfFloatsPerVertex{
static_cast(3 + 3 + 4 + 2 * modelDef->GetNumUVsPerVertex() + 8)};
// the tangent generation can increase the number of vertices temporarily
// so reserve a bit more memory to avoid reallocations in GenTangents (in most cases)
std::vector newVertices;
newVertices.reserve(numberOfFloatsPerVertex * modelDef->GetNumVertices() * 2);
// Generate the tangents.
// NOTE(review): the trailing 'true' presumably requests the GPU skinning
// (joint/weight) data in the output - confirm against GenTangents.
ModelRenderer::GenTangents(modelDef, newVertices, true);
// How many vertices do we have after generating tangents?
const uint32_t newNumberOfVertices{static_cast(newVertices.size()) / numberOfFloatsPerVertex};
std::vector remapTable(newNumberOfVertices);
std::vector vertexDataOut(newNumberOfVertices * numberOfFloatsPerVertex);
// Re-weld the mesh to remove duplicated vertices. The welded vertices end up
// in vertexDataOut and remapTable is used below as the new index list.
const int finalNumberOfVertices{WeldMesh(
remapTable.data(), vertexDataOut.data(), newVertices.data(), newNumberOfVertices, numberOfFloatsPerVertex)};
// Copy the model data to graphics memory.
m_Array.SetNumberOfVertices(finalNumberOfVertices);
m_Array.SetMinimumAttributeAlignment(INPUT_VERTEX_ATTRIBUTE_ALIGNMENT);
m_Array.Layout();
m_BlendArray.SetNumberOfVertices(finalNumberOfVertices);
m_BlendArray.Layout();
m_UVArray.SetNumberOfVertices(finalNumberOfVertices);
m_UVArray.Layout();
VertexArrayIterator positionIt{m_Position.GetIterator()};
VertexArrayIterator normalIt{m_Normal.GetIterator()};
VertexArrayIterator tangentIt{m_Tangent.GetIterator()};
VertexArrayIterator blendJointsIt{m_BlendJoints.GetIterator()};
VertexArrayIterator blendWeightsIt{m_BlendWeights.GetIterator()};
// Copy everything into the vertex array.
// Each welded vertex is read as: position (3 floats), normal (3), tangent (4),
// four interleaved (joint, weight) pairs (8), then 2 floats per UV set.
for (int index{0}; index < finalNumberOfVertices; ++index)
{
uint32_t inputDataOffset{numberOfFloatsPerVertex * index};
positionIt[index] = CVector3D{
vertexDataOut[inputDataOffset + 0],
vertexDataOut[inputDataOffset + 1],
vertexDataOut[inputDataOffset + 2]};
inputDataOffset += 3;
normalIt[index] = CVector3D{
vertexDataOut[inputDataOffset + 0],
vertexDataOut[inputDataOffset + 1],
vertexDataOut[inputDataOffset + 2]};
inputDataOffset += 3;
tangentIt[index] = CVector4D{
vertexDataOut[inputDataOffset + 0],
vertexDataOut[inputDataOffset + 1],
vertexDataOut[inputDataOffset + 2],
vertexDataOut[inputDataOffset + 3]};
inputDataOffset += 4;
// Joints are at even positions and weights at odd positions of the 8 floats.
for (uint32_t j{0}; j < 4; ++j)
{
blendJointsIt[index][j] = static_cast(vertexDataOut[inputDataOffset + 0 + 2 * j]);
blendWeightsIt[index][j] = static_cast(vertexDataOut[inputDataOffset + 1 + 2 * j]);
}
inputDataOffset += 8;
for (uint32_t uvIndex{0}; uvIndex < modelDef->GetNumUVsPerVertex(); uvIndex++)
{
VertexArrayIterator UVit{m_UVs[uvIndex].GetIterator()};
UVit[index][0] = vertexDataOut[inputDataOffset + 0 + 2 * uvIndex];
UVit[index][1] = vertexDataOut[inputDataOffset + 1 + 2 * uvIndex];
}
}
// Upload vertex data.
m_Array.Upload();
m_Array.FreeBackingStore();
m_BlendArray.Upload();
m_BlendArray.FreeBackingStore();
// A zero stride means the model has no UV sets, so there is nothing to upload.
if (m_UVArray.GetStride() > 0)
{
m_UVArray.Upload();
m_UVArray.FreeBackingStore();
}
// Verify that the layout produced above matches the layout constants.
ENSURE(m_Array.GetStride() == INPUT_VERTEX_ATTRIBUTE_ALIGNMENT * 4);
ENSURE(m_Position.offset == INPUT_VERTEX_POSITION_OFFSET);
ENSURE(m_Normal.offset == INPUT_VERTEX_NORMAL_OFFSET);
ENSURE(m_Tangent.offset == INPUT_VERTEX_TANGENT_OFFSET);
ENSURE(m_BlendArray.GetStride() == 8);
ENSURE(m_BlendJoints.offset % 4 == 0);
ENSURE(m_BlendWeights.offset % 4 == 0);
m_IndexArray.SetNumberOfVertices(modelDef->GetNumFaces() * 3);
m_IndexArray.Layout();
// Re-index geometry and upload index.
VertexArrayIterator indices{m_IndexArray.GetIterator()};
for (uint32_t index{0}; index < modelDef->GetNumFaces() * 3; ++index)
indices[index] = remapTable[index];
m_IndexArray.Upload();
m_IndexArray.FreeBackingStore();
// Describe the streams used for drawing. The last value of each entry is the
// vertex buffer slot: 0 for skinned positions, 1 for skinned normals and
// tangents (the tangent uses the UV2 stream), 2 for the static UVs.
constexpr size_t MAX_UV{2};
PS::StaticVector attributes{
{Renderer::Backend::VertexAttributeStream::POSITION,
m_Position.format, 0, OUTPUT_POSITION_STRIDE,
Renderer::Backend::VertexAttributeRate::PER_VERTEX, 0},
{Renderer::Backend::VertexAttributeStream::NORMAL,
Renderer::Backend::Format::R16G16B16_SFLOAT, OUTPUT_NORMAL_OFFSET, OUTPUT_NORMAL_TANGENT_STRIDE,
Renderer::Backend::VertexAttributeRate::PER_VERTEX, 1},
{Renderer::Backend::VertexAttributeStream::UV2,
Renderer::Backend::Format::R16G16B16A16_SFLOAT, OUTPUT_TANGENT_OFFSET, OUTPUT_NORMAL_TANGENT_STRIDE,
Renderer::Backend::VertexAttributeRate::PER_VERTEX, 1}
};
// At most MAX_UV UV sets are exposed, as the UV0, UV1, ... streams.
for (size_t uv{0}; uv < std::min(MAX_UV, modelDef->GetNumUVsPerVertex()); ++uv)
{
const Renderer::Backend::VertexAttributeStream stream =
static_cast(
static_cast(Renderer::Backend::VertexAttributeStream::UV0) + uv);
attributes.push_back({
stream, m_UVs[uv].format, m_UVs[uv].offset, m_UVArray.GetStride(),
Renderer::Backend::VertexAttributeRate::PER_VERTEX, 2});
}
m_VertexInputLayout = g_Renderer.GetVertexInputLayout({attributes.begin(), attributes.end()});
}
// Per-model render data: the two vertex buffer chunks that receive the
// skinned output for one model.
struct ModelRData : public CModelRData
{
// We have a separate position array because we don't need other attributes
// for some passes (like shadows).
CVertexBufferManager::Handle m_PositionHandle;
// Skinned normals and tangents, stored together in one chunk.
CVertexBufferManager::Handle m_NormalTangentHandle;
// key is forwarded unchanged to the CModelRData base class.
// Explicit so that an arbitrary pointer is never converted implicitly.
explicit ModelRData(const void* key)
: CModelRData(key)
{}
};
} // anonymous namespace
// Private state of GPUSkinnedModelModelRenderer.
struct GPUSkinnedModelModelRenderer::Internals
{
// Render data of the model definition most recently passed to
// PrepareModelDef; read by RenderModel. Null until PrepareModelDef is called.
ModelDefRData* modelDefRData{nullptr};
// Shader technique for models with up to 64 bones.
CShaderTechniquePtr skinningShaderTechnique64;
// Shader technique for models with up to 192 bones.
CShaderTechniquePtr skinningShaderTechnique192;
};
// Creates the renderer's internal state and loads the compute skinning effect
// twice, once with MAX_BONES defined as "64" and once as "192".
GPUSkinnedModelModelRenderer::GPUSkinnedModelModelRenderer()
: m(std::make_unique())
{
// No model definition has been prepared yet.
m->modelDefRData = nullptr;
CShaderDefines shaderDefines64;
shaderDefines64.Add(CStrIntern{"MAX_BONES"}, CStrIntern{"64"});
m->skinningShaderTechnique64 = g_Renderer.GetShaderManager().LoadEffect(str_compute_skinning, shaderDefines64);
CShaderDefines shaderDefines192;
shaderDefines192.Add(CStrIntern{"MAX_BONES"}, CStrIntern{"192"});
m->skinningShaderTechnique192 = g_Renderer.GetShaderManager().LoadEffect(str_compute_skinning, shaderDefines192);
}
// Defaulted destructor, defined in this file after Internals.
// NOTE(review): presumably kept out of the header so that the owning pointer
// 'm' is destroyed where Internals is a complete type - confirm.
GPUSkinnedModelModelRenderer::~GPUSkinnedModelModelRenderer() = default;
// Creates the per-model render data for a skinned model.
//
// The render data of the model's definition is created on first use and
// registered on the definition under this renderer's key (m.get()). Then two
// output chunks are allocated, each sized for the number of vertices of the
// definition's input array: one for positions and one for normals/tangents.
//
// key: passed through to the ModelRData constructor.
// model: must be skinned (checked with ENSURE).
// Returns a newly allocated ModelRData.
// NOTE(review): ownership of the returned object and of the ModelDefRData
// passed to SetRenderData is presumably taken by the caller / the model
// definition - confirm.
CModelRData* GPUSkinnedModelModelRenderer::CreateModelData(const void* key, CModel* model)
{
ENSURE(model->IsSkinned());
CModelDefPtr modelDef{model->GetModelDef()};
ModelDefRData* modelDefRData{static_cast(modelDef->GetRenderData(m.get()))};
if (!modelDefRData)
{
// First model using this definition: build the shared data once.
modelDefRData = new ModelDefRData(modelDef);
modelDef->SetRenderData(m.get(), modelDefRData);
}
ModelRData* modelRData{new ModelRData(key)};
const size_t numberOfVertices{modelDefRData->m_Array.GetNumberOfVertices()};
// NOTE(review): both chunks are allocated from Group::WATER although they
// hold model vertices - confirm that this group is intended.
modelRData->m_PositionHandle = g_Renderer.GetVertexBufferManager().AllocateChunk(
OUTPUT_POSITION_STRIDE, numberOfVertices, Renderer::Backend::IBuffer::Type::VERTEX,
Renderer::Backend::IBuffer::Usage::STORAGE,
nullptr, CVertexBufferManager::Group::WATER);
modelRData->m_NormalTangentHandle = g_Renderer.GetVertexBufferManager().AllocateChunk(
OUTPUT_NORMAL_TANGENT_STRIDE, numberOfVertices, Renderer::Backend::IBuffer::Type::VERTEX,
Renderer::Backend::IBuffer::Usage::STORAGE,
nullptr, CVertexBufferManager::Group::WATER);
return modelRData;
}
// Runs the skinning compute pass for the given models.
//
// Does nothing when models is empty. Otherwise a single compute pass is
// recorded between two memory barriers. Models whose bone count plus the root
// bone is at most 64 are processed first with the 64-bone technique; the
// remaining ones are collected and processed afterwards with the 192-bone
// technique, so each pipeline state is set at most once.
//
// deviceCommandContext: command context the barriers, pass and dispatches are
// recorded into.
// models: models to update; each must be skinned (checked with ENSURE).
void GPUSkinnedModelModelRenderer::UpdateModelsData(
Renderer::Backend::IDeviceCommandContext* deviceCommandContext,
PS::span models)
{
if (models.empty())
return;
GPU_SCOPED_LABEL(deviceCommandContext, "Compute Skinning");
// Models with up to 192 bones.
std::vector models192;
// Barrier from vertex input (attribute/index reads) to compute shader
// reads and writes, before the compute pass starts.
deviceCommandContext->InsertMemoryBarrier(
Renderer::Backend::PipelineStage::VERTEX_INPUT, Renderer::Backend::PipelineStage::COMPUTE_SHADER,
Renderer::Backend::Access::VERTEX_ATTRIBUTE_READ | Renderer::Backend::Access::INDEX_READ,
Renderer::Backend::Access::SHADER_READ | Renderer::Backend::Access::SHADER_WRITE);
deviceCommandContext->BeginComputePass();
deviceCommandContext->SetComputePipelineState(
m->skinningShaderTechnique64->GetComputePipelineState());
for (CModel* model : models)
{
ENSURE(model->IsSkinned());
CModelDefPtr modelDef{model->GetModelDef()};
// Add 1 to the bone count for the root bone; too many bones for the
// 64-bone technique means the model is deferred to the second loop.
if (modelDef->GetNumBones() + 1 > 64)
{
models192.emplace_back(model);
continue;
}
CModelRData* rdata{static_cast(model->GetRenderData())};
UpdateModelData(deviceCommandContext, m->skinningShaderTechnique64->GetShader(), model, rdata, rdata->m_UpdateFlags);
}
if (!models192.empty())
{
deviceCommandContext->SetComputePipelineState(
m->skinningShaderTechnique192->GetComputePipelineState());
for (CModel* model : models192)
{
CModelRData* rdata{static_cast(model->GetRenderData())};
UpdateModelData(deviceCommandContext, m->skinningShaderTechnique192->GetShader(), model, rdata, rdata->m_UpdateFlags);
}
}
deviceCommandContext->EndComputePass();
// Reverse barrier: compute shader reads/writes to vertex input reads.
deviceCommandContext->InsertMemoryBarrier(
Renderer::Backend::PipelineStage::COMPUTE_SHADER, Renderer::Backend::PipelineStage::VERTEX_INPUT,
Renderer::Backend::Access::SHADER_READ | Renderer::Backend::Access::SHADER_WRITE,
Renderer::Backend::Access::VERTEX_ATTRIBUTE_READ | Renderer::Backend::Access::INDEX_READ);
}
// Records the skinning dispatch for a single model.
//
// Nothing is recorded unless updateflags contains RENDERDATA_UPDATE_VERTICES.
// Otherwise the bone matrices, vertex count and buffer offsets are set as
// uniforms, the input/skin/output buffers are bound as storage buffers and
// one dispatch covering all vertices of the model is issued.
//
// deviceCommandContext: command context to record into.
// shaderProgram: skinning shader whose binding slots are used.
// model: model providing the animated bone matrices.
// data: render data of the model; used as a ModelRData.
// updateflags: bit mask of RENDERDATA_UPDATE_* flags.
void GPUSkinnedModelModelRenderer::UpdateModelData(
Renderer::Backend::IDeviceCommandContext* deviceCommandContext,
Renderer::Backend::IShaderProgram* shaderProgram,
CModel* model, CModelRData* data, int updateflags)
{
CModelDefPtr modelDef{model->GetModelDef()};
ModelDefRData* modelDefRData{static_cast(modelDef->GetRenderData(m.get()))};
ModelRData* modelRData{static_cast(data)};
if (updateflags & RENDERDATA_UPDATE_VERTICES)
{
// Number of vertices handled by one thread group.
// NOTE(review): presumably has to match the work group size declared in
// the compute shader - confirm.
constexpr uint32_t threadGroupWorkRegionDim{64};
const uint32_t vertexCount{static_cast(modelRData->m_PositionHandle->m_Count)};
// Enough groups to cover every vertex, rounding up.
const uint32_t dispatchGroupCountX{DivideRoundUp(vertexCount, threadGroupWorkRegionDim)};
// Bind matrices for current animation state.
// Add 1 to NumBones because of the special 'root' bone.
// The span starts at the first matrix and covers (NumBones + 1) matrices.
// NOTE(review): assumes the animated bone matrices are stored
// contiguously - confirm.
deviceCommandContext->SetUniform(
shaderProgram->GetBindingSlot(str_skinBlendMatrices),
PS::span(
model->GetAnimatedBoneMatrices()[0]._data,
model->GetAnimatedBoneMatrices()[0].AsFloatArray().size() * (modelDef->GetNumBones() + 1)));
// Both output chunks must hold the same number of vertices.
ENSURE(modelRData->m_PositionHandle->m_Count == modelRData->m_NormalTangentHandle->m_Count);
deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_vertexCount),
static_cast(vertexCount));
// Offsets of this model's data inside the four buffers bound below, in the
// order: input vertices, skin data, output positions, output
// normals/tangents.
deviceCommandContext->SetUniform(shaderProgram->GetBindingSlot(str_offset),
static_cast(modelDefRData->m_Array.GetOffset()),
static_cast(modelDefRData->m_BlendArray.GetOffset()),
static_cast(modelRData->m_PositionHandle->m_Index),
static_cast(modelRData->m_NormalTangentHandle->m_Index));
deviceCommandContext->SetStorageBuffer(shaderProgram->GetBindingSlot(str_InputVertices), modelDefRData->m_Array.GetBuffer());
deviceCommandContext->SetStorageBuffer(shaderProgram->GetBindingSlot(str_SkinData), modelDefRData->m_BlendArray.GetBuffer());
deviceCommandContext->SetStorageBuffer(shaderProgram->GetBindingSlot(str_OutputPositions), modelRData->m_PositionHandle->m_Owner->GetBuffer());
deviceCommandContext->SetStorageBuffer(shaderProgram->GetBindingSlot(str_OutputNormalsTangents), modelRData->m_NormalTangentHandle->m_Owner->GetBuffer());
deviceCommandContext->Dispatch(dispatchGroupCountX, 1, 1);
}
}
// Intentionally empty: both parameters are unused and nothing is uploaded
// by this renderer in this step.
void GPUSkinnedModelModelRenderer::UploadModelsData(
Renderer::Backend::IDeviceCommandContext* UNUSED(deviceCommandContext),
PS::span UNUSED(models))
{
}
// Binds the state shared by all models of one model definition and remembers
// that definition's render data for the following RenderModel calls.
//
// deviceCommandContext: command context to bind into.
// def: model definition; it must already have render data registered for
// this renderer (checked with ENSURE).
void GPUSkinnedModelModelRenderer::PrepareModelDef(
Renderer::Backend::IDeviceCommandContext* deviceCommandContext,
const CModelDef& def)
{
m->modelDefRData = static_cast(def.GetRenderData(m.get()));
ENSURE(m->modelDefRData);
deviceCommandContext->SetVertexInputLayout(m->modelDefRData->m_VertexInputLayout);
deviceCommandContext->SetIndexBuffer(m->modelDefRData->m_IndexArray.GetBuffer());
// Only bind the UV buffer (slot 2) when the definition has UV sets.
if (m->modelDefRData->m_UVArray.GetStride() > 0)
{
// NOTE(review): GetOffset() is presumably expressed in vertices, so it is
// multiplied by the stride to get a byte offset - confirm.
const uint32_t firstVertexOffset{
m->modelDefRData->m_UVArray.GetOffset() * m->modelDefRData->m_UVArray.GetStride()};
deviceCommandContext->SetVertexBuffer(
2, m->modelDefRData->m_UVArray.GetBuffer(), firstVertexOffset);
}
}
// Draws one model using its skinned output buffers.
//
// Binds the model's position chunk to vertex buffer slot 0 and its
// normal/tangent chunk to slot 1, then issues an indexed draw of all faces.
// The first index is taken from m->modelDefRData, which is assigned in
// PrepareModelDef.
//
// deviceCommandContext: command context to record the draw into.
// shader: unused.
// model: model to draw; provides the face count.
// data: render data of the model; used as a ModelRData.
void GPUSkinnedModelModelRenderer::RenderModel(
Renderer::Backend::IDeviceCommandContext* deviceCommandContext,
Renderer::Backend::IShaderProgram* UNUSED(shader), CModel* model, CModelRData* data)
{
ModelRData* modelRData{static_cast(data)};
// Render the lot.
const size_t numberOfFaces{model->GetModelDef()->GetNumFaces()};
// The chunk index is multiplied by the stride to get the byte offset of the
// chunk inside its owning buffer.
deviceCommandContext->SetVertexBuffer(
0, modelRData->m_PositionHandle->m_Owner->GetBuffer(),
modelRData->m_PositionHandle->m_Index * OUTPUT_POSITION_STRIDE);
deviceCommandContext->SetVertexBuffer(
1, modelRData->m_NormalTangentHandle->m_Owner->GetBuffer(),
modelRData->m_NormalTangentHandle->m_Index * OUTPUT_NORMAL_TANGENT_STRIDE);
// Three indices per face.
deviceCommandContext->DrawIndexed(
m->modelDefRData->m_IndexArray.GetOffset(), numberOfFaces * 3, 0);
// Bump stats.
g_Renderer.m_Stats.m_DrawCalls++;
g_Renderer.m_Stats.m_ModelTris += numberOfFaces;
}