Skip to content

Commit ade1387

Browse files
committed
Cleaned up implementation
1 parent beabcdf commit ade1387

File tree

1 file changed

+82
-137
lines changed
  • Sources/Overload/OvRendering/src/OvRendering/Resources

1 file changed

+82
-137
lines changed

Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp

Lines changed: 82 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -82,159 +82,104 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve
8282
}
8383
}
8484

85-
namespace
85+
void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices)
8686
{
87-
OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD(std::span<const OvRendering::Geometry::Vertex> p_vertices)
88-
{
89-
const size_t vertexCount = p_vertices.size();
90-
91-
if (vertexCount == 0)
92-
{
93-
return {
94-
.position = OvMaths::FVector3::Zero,
95-
.radius = 0.0f
96-
};
97-
}
87+
const size_t vertexCount = p_vertices.size();
9888

99-
// Initialize SIMD registers for min/max with first vertex values
100-
__m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX);
101-
__m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX);
89+
if (vertexCount == 0)
90+
{
91+
m_boundingSphere = {
92+
.position = OvMaths::FVector3::Zero,
93+
.radius = 0.0f
94+
};
10295

103-
// Process all vertices in one loop to find min/max
104-
for (size_t i = 1; i < vertexCount; ++i)
105-
{
106-
// Load vertex position directly - assumes position is aligned properly
107-
const float* posPtr = p_vertices[i].position;
108-
__m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned
96+
return;
97+
}
10998

110-
// Update min and max in one pass
111-
vMinXYZ = _mm_min_ps(vMinXYZ, vPos);
112-
vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos);
113-
}
99+
// Initialize SIMD registers for min/max with first vertex values
100+
__m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX);
101+
__m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX);
114102

115-
// Calculate center = (min + max) * 0.5
116-
__m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f));
103+
// Process all vertices in one loop to find min/max
104+
for (size_t i = 1; i < vertexCount; ++i)
105+
{
106+
// Load four floats starting at the vertex position (no 16-byte alignment assumed); only the x, y and z lanes are used
107+
const float* posPtr = p_vertices[i].position;
108+
__m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned
117109

118-
// Store center position
119-
float centerArr[4];
120-
_mm_store_ps(centerArr, vCenter);
121-
auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] };
110+
// Update min and max in one pass
111+
vMinXYZ = _mm_min_ps(vMinXYZ, vPos);
112+
vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos);
113+
}
122114

123-
// Calculate radius - use dot product for distance calculation
124-
__m128 vMaxDistSq = _mm_setzero_ps();
115+
// Calculate center = (min + max) * 0.5
116+
__m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f));
125117

126-
// Pre-load center vector once outside the loop
127-
const __m128 vCenterXYZ = _mm_setr_ps(
128-
center.x,
129-
center.y,
130-
center.z,
131-
0.0f
132-
);
118+
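// Store center position (NOTE(review): _mm_store_ps needs a 16-byte-aligned destination and centerArr has no alignas - confirm, or use _mm_storeu_ps)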
119+
float centerArr[4];
120+
_mm_store_ps(centerArr, vCenter);
121+
auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] };
133122

134-
// Unroll the loop by 4 for better throughput
135-
size_t i = 0;
136-
const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4
123+
// Calculate radius - use dot product for distance calculation
124+
__m128 vMaxDistSq = _mm_setzero_ps();
137125

138-
for (; i < unrollCount; i += 4)
139-
{
140-
// Load 4 vertices at once
141-
const float* pos0 = p_vertices[i].position;
142-
const float* pos1 = p_vertices[i + 1].position;
143-
const float* pos2 = p_vertices[i + 2].position;
144-
const float* pos3 = p_vertices[i + 3].position;
145-
146-
__m128 vPos0 = _mm_loadu_ps(pos0);
147-
__m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ);
148-
__m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum xyz, store in all)
149-
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0);
150-
151-
__m128 vPos1 = _mm_loadu_ps(pos1);
152-
__m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ);
153-
__m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77);
154-
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1);
155-
156-
__m128 vPos2 = _mm_loadu_ps(pos2);
157-
__m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ);
158-
__m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77);
159-
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2);
160-
161-
__m128 vPos3 = _mm_loadu_ps(pos3);
162-
__m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ);
163-
__m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77);
164-
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3);
165-
}
126+
// Pre-load center vector once outside the loop
127+
const __m128 vCenterXYZ = _mm_setr_ps(
128+
center.x,
129+
center.y,
130+
center.z,
131+
0.0f
132+
);
166133

167-
// Handle remaining vertices
168-
for (; i < vertexCount; ++i)
169-
{
170-
const float* pos = p_vertices[i].position;
171-
__m128 vPos = _mm_loadu_ps(pos);
172-
__m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ);
173-
__m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77);
174-
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq);
175-
}
134+
// Unroll the loop by 4 for better throughput
135+
size_t i = 0;
136+
const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4
176137

177-
// Extract radius (sqrt of max squared distance)
178-
float maxDistSq;
179-
_mm_store_ss(&maxDistSq, vMaxDistSq);
180-
181-
return {
182-
.position = center,
183-
.radius = std::sqrt(maxDistSq)
184-
};
138+
for (; i < unrollCount; i += 4)
139+
{
140+
// Load 4 vertices at once
141+
const float* pos0 = p_vertices[i].position;
142+
const float* pos1 = p_vertices[i + 1].position;
143+
const float* pos2 = p_vertices[i + 2].position;
144+
const float* pos3 = p_vertices[i + 3].position;
145+
146+
__m128 vPos0 = _mm_loadu_ps(pos0);
147+
__m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ);
148+
__m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum x, y, z products; result written to the x, y and z lanes, w zeroed)
149+
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0);
150+
151+
__m128 vPos1 = _mm_loadu_ps(pos1);
152+
__m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ);
153+
__m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77);
154+
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1);
155+
156+
__m128 vPos2 = _mm_loadu_ps(pos2);
157+
__m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ);
158+
__m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77);
159+
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2);
160+
161+
__m128 vPos3 = _mm_loadu_ps(pos3);
162+
__m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ);
163+
__m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77);
164+
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3);
185165
}
186166

187-
OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular(std::span<const OvRendering::Geometry::Vertex> p_vertices)
167+
// Handle remaining vertices
168+
for (; i < vertexCount; ++i)
188169
{
189-
auto result = OvRendering::Geometry::BoundingSphere{
190-
.position = OvMaths::FVector3::Zero,
191-
.radius = 0.0f
192-
};
193-
194-
if (!p_vertices.empty())
195-
{
196-
float minX = std::numeric_limits<float>::max();
197-
float minY = std::numeric_limits<float>::max();
198-
float minZ = std::numeric_limits<float>::max();
199-
200-
float maxX = std::numeric_limits<float>::min();
201-
float maxY = std::numeric_limits<float>::min();
202-
float maxZ = std::numeric_limits<float>::min();
203-
204-
for (const auto& vertex : p_vertices)
205-
{
206-
minX = std::min(minX, vertex.position[0]);
207-
minY = std::min(minY, vertex.position[1]);
208-
minZ = std::min(minZ, vertex.position[2]);
209-
210-
maxX = std::max(maxX, vertex.position[0]);
211-
maxY = std::max(maxY, vertex.position[1]);
212-
maxZ = std::max(maxZ, vertex.position[2]);
213-
}
214-
215-
result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f;
216-
217-
for (const auto& vertex : p_vertices)
218-
{
219-
const auto& position = reinterpret_cast<const OvMaths::FVector3&>(vertex.position);
220-
result.radius = std::max(result.radius, OvMaths::FVector3::Distance(result.position, position));
221-
}
222-
}
223-
224-
return result;
170+
const float* pos = p_vertices[i].position;
171+
__m128 vPos = _mm_loadu_ps(pos);
172+
__m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ);
173+
__m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77);
174+
vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq);
225175
}
226-
}
227176

228-
void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices)
229-
{
230-
constexpr bool useSIMD = true;
177+
// Extract radius (sqrt of max squared distance)
178+
float maxDistSq;
179+
_mm_store_ss(&maxDistSq, vMaxDistSq);
231180

232-
if constexpr (useSIMD)
233-
{
234-
m_boundingSphere = ComputeBoundingSphereSIMD(p_vertices);
235-
}
236-
else
237-
{
238-
m_boundingSphere = ComputeBoundingSphereRegular(p_vertices);
239-
}
181+
m_boundingSphere = {
182+
.position = center,
183+
.radius = std::sqrt(maxDistSq)
184+
};
240185
}

0 commit comments

Comments
 (0)