Skip to content

Commit b3be75f

Browse files
committed
ComputeBoundingSphere SIMD implementation
1 parent 21883ab commit b3be75f

File tree

1 file changed

+144
-22
lines changed
  • Sources/Overload/OvRendering/src/OvRendering/Resources

1 file changed

+144
-22
lines changed

Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp

Lines changed: 144 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66

77
#include <array>
8+
#include <intrin.h>
89

910
#include <OvDebug/Logger.h>
1011
#include <OvRendering/Resources/Mesh.h>
@@ -81,38 +82,159 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve
8182
}
8283
}
8384

84-
void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices)
85+
namespace
8586
{
86-
m_boundingSphere.position = OvMaths::FVector3::Zero;
87-
m_boundingSphere.radius = 0.0f;
88-
89-
if (!p_vertices.empty())
87+
/**
* Computes a bounding sphere for a set of vertices using SSE intrinsics.
*
* The sphere is centered on the middle of the axis-aligned bounding box of the
* vertex positions, and its radius is the distance from that center to the
* farthest vertex. An empty span yields a zero-radius sphere at the origin.
*
* Requires SSE4.1 at runtime (because of _mm_dp_ps).
*
* @param p_vertices Vertices whose positions must be enclosed
* @return The computed bounding sphere
*/
OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD(std::span<const OvRendering::Geometry::Vertex> p_vertices)
{
	if (p_vertices.empty())
	{
		return {
			.position = OvMaths::FVector3::Zero,
			.radius = 0.0f
		};
	}

	// Loads x, y, z into lanes 0..2 with an unaligned load. Lane 3 receives
	// whatever float follows the position and is never used in the result.
	// NOTE(review): this reads 4 floats from a 3-float position, so it assumes
	// Vertex stores at least one more float right after `position` - confirm
	// against the Vertex definition.
	const auto loadPosition = [](const OvRendering::Geometry::Vertex& p_vertex)
	{
		return _mm_loadu_ps(p_vertex.position);
	};

	// Pass 1: component-wise min/max, seeded with the first vertex
	__m128 vMin = loadPosition(p_vertices.front());
	__m128 vMax = vMin;

	for (const auto& vertex : p_vertices.subspan(1))
	{
		const __m128 vPos = loadPosition(vertex);
		vMin = _mm_min_ps(vMin, vPos);
		vMax = _mm_max_ps(vMax, vPos);
	}

	// center = (min + max) * 0.5
	const __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMin, vMax), _mm_set1_ps(0.5f));

	// Unaligned store: a plain float[4] on the stack is not guaranteed to be
	// 16-byte aligned, which _mm_store_ps would require.
	float centerLanes[4];
	_mm_storeu_ps(centerLanes, vCenter);

	// Pass 2: largest squared distance from the center.
	// Mask 0x77: the high nibble (0x7) restricts the dot product to x, y, z, so
	// lane 3 of vDiff never contributes; the low nibble (0x7) writes the sum to
	// lanes 0..2. Only lane 0 is read afterwards.
	// A single loop is used on purpose: every iteration feeds the same
	// accumulator, so manual unrolling would not add any parallelism.
	__m128 vMaxDistSq = _mm_setzero_ps();

	for (const auto& vertex : p_vertices)
	{
		const __m128 vDiff = _mm_sub_ps(loadPosition(vertex), vCenter);
		const __m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77);
		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq);
	}

	// One square root at the end instead of one per vertex
	const float maxDistSq = _mm_cvtss_f32(vMaxDistSq);

	return {
		.position = OvMaths::FVector3{ centerLanes[0], centerLanes[1], centerLanes[2] },
		.radius = std::sqrt(maxDistSq)
	};
}
186+
187+
/**
* Scalar implementation of the bounding sphere computation.
*
* The sphere is centered on the middle of the axis-aligned bounding box of the
* vertex positions, and its radius is the distance from that center to the
* farthest vertex. An empty span yields a zero-radius sphere at the origin.
*
* @param p_vertices Vertices whose positions must be enclosed
* @return The computed bounding sphere
*/
OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular(std::span<const OvRendering::Geometry::Vertex> p_vertices)
{
	auto result = OvRendering::Geometry::BoundingSphere{
		.position = OvMaths::FVector3::Zero,
		.radius = 0.0f
	};

	if (p_vertices.empty())
	{
		return result;
	}

	// numeric_limits<float>::min() is the smallest POSITIVE float, so it cannot
	// seed a maximum search: coordinates that are all negative would never
	// replace it. lowest() is the most negative finite value.
	constexpr float highest = std::numeric_limits<float>::max();
	constexpr float lowest = std::numeric_limits<float>::lowest();

	float minX = highest;
	float minY = highest;
	float minZ = highest;

	float maxX = lowest;
	float maxY = lowest;
	float maxZ = lowest;

	// Pass 1: axis-aligned extents of the positions
	for (const auto& vertex : p_vertices)
	{
		minX = std::min(minX, vertex.position[0]);
		minY = std::min(minY, vertex.position[1]);
		minZ = std::min(minZ, vertex.position[2]);

		maxX = std::max(maxX, vertex.position[0]);
		maxY = std::max(maxY, vertex.position[1]);
		maxZ = std::max(maxZ, vertex.position[2]);
	}

	result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f;

	// Pass 2: the radius is the distance to the farthest vertex
	for (const auto& vertex : p_vertices)
	{
		// Built explicitly rather than reinterpreting the float array as an FVector3
		const OvMaths::FVector3 position{ vertex.position[0], vertex.position[1], vertex.position[2] };
		result.radius = std::max(result.radius, OvMaths::FVector3::Distance(result.position, position));
	}

	return result;
}
226+
}
227+
228+
/**
* Recomputes m_boundingSphere from the given vertices.
* The implementation used (SIMD or scalar) is selected at compile time.
*
* @param p_vertices Vertices to enclose
*/
void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices)
{
	// Compile-time switch between the two implementations
	constexpr bool preferSimdPath = true;

	m_boundingSphere = [&p_vertices]
	{
		if constexpr (preferSimdPath)
		{
			return ComputeBoundingSphereSIMD(p_vertices);
		}
		else
		{
			return ComputeBoundingSphereRegular(p_vertices);
		}
	}();
}

0 commit comments

Comments
 (0)