Skip to content

[WIP] ComputeBoundingSphere SIMD implementation #398

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 91 additions & 24 deletions Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

#include <array>
#include <intrin.h>

#include <OvDebug/Logger.h>
#include <OvRendering/Resources/Mesh.h>
Expand Down Expand Up @@ -83,36 +84,102 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve

/**
* Computes the bounding sphere of the given vertices and stores it in m_boundingSphere.
*
* The sphere is centered on the midpoint of the axis-aligned bounding box of the
* vertex positions; its radius is the largest distance from that center to any vertex.
* An empty span produces a zero-radius sphere located at OvMaths::FVector3::Zero.
*
* Implementation notes:
* - Uses SSE intrinsics; _mm_dp_ps requires SSE4.1 support on the target CPU/compiler.
* - Only the x/y/z lanes of every register are meaningful, the w lane is never read back.
*
* @param p_vertices Vertices to enclose
*/
void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices)
{
	if (p_vertices.empty())
	{
		m_boundingSphere = {
			.position = OvMaths::FVector3::Zero,
			.radius = 0.0f
		};

		return;
	}

	// Positions are loaded four floats at a time although a position only holds three,
	// so the fourth lane reads whatever follows `position` inside the vertex.
	// NOTE(review): this presumes at least one more float is stored after `position`
	// in Geometry::Vertex; the assert below only checks the overall size, confirm the layout.
	static_assert(
		sizeof(Geometry::Vertex) >= 4 * sizeof(float),
		"ComputeBoundingSphere loads 4 floats per vertex; Vertex must be at least 16 bytes"
	);

	// Unaligned load: nothing guarantees that vertex positions are 16-byte aligned.
	const auto loadPosition = [](const Geometry::Vertex& p_vertex)
	{
		return _mm_loadu_ps(p_vertex.position);
	};

	// Pass 1: component-wise min/max, seeded with the first vertex so that no
	// sentinel value (and no numeric_limits pitfall) is needed.
	__m128 vMin = loadPosition(p_vertices.front());
	__m128 vMax = vMin;

	for (const auto& vertex : p_vertices.subspan(1))
	{
		const __m128 vPos = loadPosition(vertex);
		vMin = _mm_min_ps(vMin, vPos);
		vMax = _mm_max_ps(vMax, vPos);
	}

	// center = (min + max) * 0.5
	const __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMin, vMax), _mm_set1_ps(0.5f));

	// The destination is a plain stack array with no 16-byte alignment guarantee,
	// hence the unaligned store (an aligned store could fault here).
	float centerLanes[4];
	_mm_storeu_ps(centerLanes, vCenter);

	// Pass 2: largest squared distance from the center to any vertex.
	// A single loop is enough: every iteration depends on the running maximum,
	// so manual unrolling would not expose any additional parallelism.
	__m128 vMaxDistSq = _mm_setzero_ps();

	for (const auto& vertex : p_vertices)
	{
		const __m128 vDiff = _mm_sub_ps(loadPosition(vertex), vCenter);

		// Mask 0x77: multiply the x/y/z lanes only (high nibble), write the sum to
		// lanes 0-2 (low nibble). The w lane never contributes to the result.
		vMaxDistSq = _mm_max_ps(vMaxDistSq, _mm_dp_ps(vDiff, vDiff, 0x77));
	}

	// Lane 0 holds the maximum squared distance; a single sqrt yields the radius.
	m_boundingSphere = {
		.position = OvMaths::FVector3{ centerLanes[0], centerLanes[1], centerLanes[2] },
		.radius = std::sqrt(_mm_cvtss_f32(vMaxDistSq))
	};
}