/* Copyright (C) 2024 Wildfire Games.
 * This file is part of 0 A.D.
 *
 * 0 A.D. is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * 0 A.D. is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with 0 A.D. If not, see <http://www.gnu.org/licenses/>.
 */

#include "precompiled.h"

#include "VertexBuffer.h"

#include "lib/sysdep/cpu.h"
#include "ps/CLogger.h"
#include "renderer/backend/IDevice.h"
#include "renderer/Renderer.h"

#include <algorithm>
#include <cstring>
#include <iterator>

// Absolute maximum (bytewise) size of each GL vertex buffer object.
// Make it large enough for the maximum feasible mesh size (64K vertexes,
// 64 bytes per vertex in InstancingModelRenderer).
// TODO: measure what influence this has on performance
constexpr std::size_t MAX_VB_SIZE_BYTES = 4 * 1024 * 1024;

CVertexBuffer::CVertexBuffer(
	Renderer::Backend::IDevice* device, const char* name,
	const size_t vertexSize, const Renderer::Backend::IBuffer::Type type,
	const uint32_t usage)
	: CVertexBuffer(device, name, vertexSize, type, usage, MAX_VB_SIZE_BYTES)
{
}

CVertexBuffer::CVertexBuffer(
	Renderer::Backend::IDevice* device, const char* name,
	const size_t vertexSize, const Renderer::Backend::IBuffer::Type type,
	const uint32_t usage, const size_t maximumBufferSize)
	: m_VertexSize(vertexSize), m_HasNeededChunks(false)
{
	size_t size = maximumBufferSize;

	if (type == Renderer::Backend::IBuffer::Type::VERTEX)
	{
		// We want to store 16-bit indices to any vertex in the buffer, so the
		// buffer must never be bigger than vertexSize * 64K bytes, since we
		// can address at most 64K vertices with 16-bit indices.
		size = std::min(size, vertexSize * 65536);
	}
	else if (type == Renderer::Backend::IBuffer::Type::INDEX)
	{
		ENSURE(vertexSize == sizeof(u16));
	}

	// Store the max/free vertex counts.
	m_MaxVertices = m_FreeVertices = size / vertexSize;

	m_Buffer = device->CreateBuffer(
		name, type, m_MaxVertices * m_VertexSize, usage);

	// Create the sole free chunk, spanning the whole buffer.
	VBChunk* chunk = new VBChunk;
	chunk->m_Owner = this;
	chunk->m_Count = m_FreeVertices;
	chunk->m_Index = 0;
	m_FreeList.emplace_back(chunk);
}

CVertexBuffer::~CVertexBuffer()
{
	// All chunks must have been released before the buffer is destroyed.
	ENSURE(m_AllocList.empty());

	m_Buffer.reset();

	for (VBChunk* const& chunk : m_FreeList)
		delete chunk;
}

bool CVertexBuffer::CompatibleVertexType(
	const size_t vertexSize, const Renderer::Backend::IBuffer::Type type,
	const uint32_t usage) const
{
	ENSURE(m_Buffer);
	return type == m_Buffer->GetType() && usage == m_Buffer->GetUsage() &&
		vertexSize == m_VertexSize;
}

///////////////////////////////////////////////////////////////////////////////
// Allocate: try to allocate a chunk of the given number of vertices (each of
// the given size), with the given type and usage - return null if no free
// chunk is large enough.
CVertexBuffer::VBChunk* CVertexBuffer::Allocate(
	const size_t vertexSize, const size_t numberOfVertices,
	const Renderer::Backend::IBuffer::Type type, const uint32_t usage,
	void* backingStore)
{
	// Check that this is the right kind of buffer.
	if (!CompatibleVertexType(vertexSize, type, usage))
		return nullptr;

	if (UseStreaming(usage))
		ENSURE(backingStore != nullptr);

	// Quick check that there are enough vertices spare to allocate.
	if (numberOfVertices > m_FreeVertices)
		return nullptr;

	// Trawl the free list: prefer a chunk of exactly the right size;
	// otherwise take the largest chunk that fits, so the leftover piece
	// stays as useful as possible.
	std::vector<VBChunk*>::iterator best_iter = m_FreeList.end();
	for (std::vector<VBChunk*>::iterator iter = m_FreeList.begin(); iter != m_FreeList.end(); ++iter)
	{
		if (numberOfVertices == (*iter)->m_Count)
		{
			best_iter = iter;
			break;
		}
		else if (numberOfVertices < (*iter)->m_Count &&
			(best_iter == m_FreeList.end() || (*best_iter)->m_Count < (*iter)->m_Count))
			best_iter = iter;
	}

	// We could not find a large enough chunk.
	if (best_iter == m_FreeList.end())
		return nullptr;

	VBChunk* chunk = *best_iter;
	m_FreeList.erase(best_iter);
	m_FreeVertices -= chunk->m_Count;

	chunk->m_BackingStore = backingStore;
	chunk->m_Dirty = false;
	chunk->m_Needed = false;

	// Split the chunk in two: allocate a new chunk holding all the unused
	// vertices of the found chunk, and add it back to the free list.
	if (chunk->m_Count > numberOfVertices)
	{
		VBChunk* newchunk = new VBChunk;
		newchunk->m_Owner = this;
		newchunk->m_Count = chunk->m_Count - numberOfVertices;
		newchunk->m_Index = chunk->m_Index + numberOfVertices;
		m_FreeList.emplace_back(newchunk);
		m_FreeVertices += newchunk->m_Count;

		// Resize the found chunk to the requested size.
		chunk->m_Count = numberOfVertices;
	}

	// Return the found chunk.
	m_AllocList.push_back(chunk);
	return chunk;
}
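// Usage sketch (illustrative only, not part of the engine source): allocating
// a chunk from a dynamic (streamed) buffer and releasing it when done. The
// variables `device` and `vertexData`, and the layout of 128 vertices of
// 32 bytes each, are hypothetical:
//
//   CVertexBuffer buffer(
//       device, "ExampleVB", 32,
//       Renderer::Backend::IBuffer::Type::VERTEX,
//       Renderer::Backend::IBuffer::Usage::DYNAMIC);
//
//   // DYNAMIC usage implies streaming, so a backing store must be passed.
//   CVertexBuffer::VBChunk* chunk = buffer.Allocate(
//       32, 128, Renderer::Backend::IBuffer::Type::VERTEX,
//       Renderer::Backend::IBuffer::Usage::DYNAMIC, vertexData);
//
//   if (chunk)
//   {
//       // ... reference chunk->m_Index / chunk->m_Count in draw calls ...
//       buffer.Release(chunk);
//   }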
///////////////////////////////////////////////////////////////////////////////
// Release: return the given chunk to this buffer
void CVertexBuffer::Release(VBChunk* chunk)
{
	// Update the total free count before potentially modifying this chunk's count.
	m_FreeVertices += chunk->m_Count;

	m_AllocList.erase(std::find(m_AllocList.begin(), m_AllocList.end(), chunk));

	// Sorting in O(n log n) shouldn't be much slower than O(n) in practice,
	// because the container is already partly sorted.
	std::sort(
		m_FreeList.begin(), m_FreeList.end(),
		[](const VBChunk* chunk1, const VBChunk* chunk2) -> bool
		{
			return chunk1->m_Index < chunk2->m_Index;
		});

	// Coalesce with any free-list items that are adjacent to this chunk:
	// merge each such neighbour into the released chunk and remove the
	// neighbour from the list.
	for (std::vector<VBChunk*>::iterator iter = m_FreeList.begin(); iter != m_FreeList.end();)
	{
		if ((*iter)->m_Index == chunk->m_Index + chunk->m_Count ||
			(*iter)->m_Index + (*iter)->m_Count == chunk->m_Index)
		{
			chunk->m_Index = std::min(chunk->m_Index, (*iter)->m_Index);
			chunk->m_Count += (*iter)->m_Count;
			delete *iter;
			iter = m_FreeList.erase(iter);
			if (!m_FreeList.empty() && iter != m_FreeList.begin())
				iter = std::prev(iter);
		}
		else
			++iter;
	}

	m_FreeList.emplace_back(chunk);
}

///////////////////////////////////////////////////////////////////////////////
// UpdateChunkVertices: update the vertex data for the given chunk
void CVertexBuffer::UpdateChunkVertices(VBChunk* chunk, void* data)
{
	ENSURE(m_Buffer);
	if (UseStreaming(m_Buffer->GetUsage()))
	{
		// The backend buffer is now out of sync with the backing store.
		chunk->m_Dirty = true;

		// Sanity check: make sure the caller hasn't tried to reallocate
		// their backing store.
		ENSURE(data == chunk->m_BackingStore);
	}
	else
	{
		ENSURE(data);
		g_Renderer.GetDeviceCommandContext()->UploadBufferRegion(
			m_Buffer.get(), data, chunk->m_Index * m_VertexSize,
			chunk->m_Count * m_VertexSize);
	}
}
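// Streaming-update sketch (illustrative; `buffer`, `chunk` and `vertexData`
// are assumed to come from the allocation example above): the caller rewrites
// its own backing store and then notifies the buffer. For a DYNAMIC buffer
// this only marks the chunk dirty - nothing is sent to the backend until
// UploadIfNeeded() runs.
//
//   std::memcpy(vertexData, newVertices, 128 * 32); // hypothetical new data
//   buffer.UpdateChunkVertices(chunk, vertexData);  // just sets m_Dirty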
void CVertexBuffer::UploadIfNeeded(
	Renderer::Backend::IDeviceCommandContext* deviceCommandContext)
{
	if (UseStreaming(m_Buffer->GetUsage()))
	{
		if (!m_HasNeededChunks)
			return;

		// If any chunks are out of sync with the current backend buffer, and are
		// needed for rendering this frame, we'll need to re-upload the backend buffer.
		bool needUpload = false;
		for (VBChunk* const& chunk : m_AllocList)
		{
			if (chunk->m_Dirty && chunk->m_Needed)
			{
				needUpload = true;
				break;
			}
		}

		if (needUpload)
		{
			deviceCommandContext->UploadBuffer(m_Buffer.get(), [&](u8* mappedData)
			{
#ifndef NDEBUG
				// To help detect bugs where PrepareForRendering() was not called,
				// force all not-needed data to 0, so things won't get rendered
				// with undefined (but possibly still correct-looking) data.
				memset(mappedData, 0, m_MaxVertices * m_VertexSize);
#endif

				// Copy only the chunks we need. (This condition is helpful when
				// the backend buffer contains data for every unit in the world,
				// but only a handful are visible on screen and we don't need to
				// bother copying the rest.)
				for (VBChunk* const& chunk : m_AllocList)
					if (chunk->m_Needed)
						std::memcpy(mappedData + chunk->m_Index * m_VertexSize,
							chunk->m_BackingStore, chunk->m_Count * m_VertexSize);
			});

			// Anything we just uploaded is clean; anything else is dirty,
			// since the rest of the backend buffer content is now undefined.
			for (VBChunk* const& chunk : m_AllocList)
			{
				if (chunk->m_Needed)
				{
					chunk->m_Dirty = false;
					chunk->m_Needed = false;
				}
				else
					chunk->m_Dirty = true;
			}
		}
		else
		{
			// Reset the flags for the next phase.
			for (VBChunk* const& chunk : m_AllocList)
				chunk->m_Needed = false;
		}

		m_HasNeededChunks = false;
	}
}

size_t CVertexBuffer::GetBytesReserved() const
{
	return MAX_VB_SIZE_BYTES;
}

size_t CVertexBuffer::GetBytesAllocated() const
{
	return (m_MaxVertices - m_FreeVertices) * m_VertexSize;
}

void CVertexBuffer::DumpStatus() const
{
	debug_printf("freeverts = %d\n", static_cast<int>(m_FreeVertices));

	size_t maxSize = 0;
	for (VBChunk* const& chunk : m_FreeList)
	{
		debug_printf("free chunk %p: size=%d\n",
			static_cast<void*>(chunk), static_cast<int>(chunk->m_Count));
		maxSize = std::max(chunk->m_Count, maxSize);
	}
	debug_printf("max size = %d\n", static_cast<int>(maxSize));
}

bool CVertexBuffer::UseStreaming(const uint32_t usage)
{
	return usage & Renderer::Backend::IBuffer::Usage::DYNAMIC;
}

void CVertexBuffer::PrepareForRendering(VBChunk* chunk)
{
	chunk->m_Needed = true;
	m_HasNeededChunks = true;
}
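// Per-frame flow sketch (illustrative): for a streamed buffer, every chunk
// that will actually be drawn this frame must be flagged first, otherwise
// UploadIfNeeded() skips it (and zeroes its region in debug builds). The
// `deviceCommandContext` variable is hypothetical:
//
//   buffer.PrepareForRendering(chunk);            // chunk is needed this frame
//   buffer.UploadIfNeeded(deviceCommandContext);  // re-uploads dirty + needed data
//   // ... issue draw calls using chunk->m_Index / chunk->m_Count ...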