@@ -82,159 +82,104 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve
82
82
}
83
83
}
84
84
85
- namespace
85
+ void OvRendering::Resources::Mesh::ComputeBoundingSphere (std::span< const Geometry::Vertex> p_vertices)
86
86
{
87
- OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD (std::span<const OvRendering::Geometry::Vertex> p_vertices)
88
- {
89
- const size_t vertexCount = p_vertices.size ();
90
-
91
- if (vertexCount == 0 )
92
- {
93
- return {
94
- .position = OvMaths::FVector3::Zero,
95
- .radius = 0 .0f
96
- };
97
- }
87
+ const size_t vertexCount = p_vertices.size ();
98
88
99
- // Initialize SIMD registers for min/max with first vertex values
100
- __m128 vMinXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], FLT_MAX);
101
- __m128 vMaxXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], -FLT_MAX);
89
+ if (vertexCount == 0 )
90
+ {
91
+ m_boundingSphere = {
92
+ .position = OvMaths::FVector3::Zero,
93
+ .radius = 0 .0f
94
+ };
102
95
103
- // Process all vertices in one loop to find min/max
104
- for (size_t i = 1 ; i < vertexCount; ++i)
105
- {
106
- // Load vertex position directly - assumes position is aligned properly
107
- const float * posPtr = p_vertices[i].position ;
108
- __m128 vPos = _mm_loadu_ps (posPtr); // Using loadu in case it's not 16-byte aligned
96
+ return ;
97
+ }
109
98
110
- // Update min and max in one pass
111
- vMinXYZ = _mm_min_ps (vMinXYZ, vPos);
112
- vMaxXYZ = _mm_max_ps (vMaxXYZ, vPos);
113
- }
99
+ // Initialize SIMD registers for min/max with first vertex values
100
+ __m128 vMinXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], FLT_MAX);
101
+ __m128 vMaxXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], -FLT_MAX);
114
102
115
- // Calculate center = (min + max) * 0.5
116
- __m128 vCenter = _mm_mul_ps (_mm_add_ps (vMinXYZ, vMaxXYZ), _mm_set1_ps (0 .5f ));
103
+ // Process all vertices in one loop to find min/max
104
+ for (size_t i = 1 ; i < vertexCount; ++i)
105
+ {
106
+ // Load vertex position directly - assumes position is aligned properly
107
+ const float * posPtr = p_vertices[i].position ;
108
+ __m128 vPos = _mm_loadu_ps (posPtr); // Using loadu in case it's not 16-byte aligned
117
109
118
- // Store center position
119
- float centerArr[ 4 ] ;
120
- _mm_store_ps (centerArr, vCenter );
121
- auto center = OvMaths::FVector3{ centerArr[ 0 ], centerArr[ 1 ], centerArr[ 2 ] };
110
+ // Update min and max in one pass
111
+ vMinXYZ = _mm_min_ps (vMinXYZ, vPos) ;
112
+ vMaxXYZ = _mm_max_ps (vMaxXYZ, vPos );
113
+ }
122
114
123
- // Calculate radius - use dot product for distance calculation
124
- __m128 vMaxDistSq = _mm_setzero_ps ( );
115
+ // Calculate center = (min + max) * 0.5
116
+ __m128 vCenter = _mm_mul_ps ( _mm_add_ps (vMinXYZ, vMaxXYZ), _mm_set1_ps ( 0 . 5f ) );
125
117
126
- // Pre-load center vector once outside the loop
127
- const __m128 vCenterXYZ = _mm_setr_ps (
128
- center.x ,
129
- center.y ,
130
- center.z ,
131
- 0 .0f
132
- );
118
+ // Store center position
119
+ float centerArr[4 ];
120
+ _mm_store_ps (centerArr, vCenter);
121
+ auto center = OvMaths::FVector3{ centerArr[0 ], centerArr[1 ], centerArr[2 ] };
133
122
134
- // Unroll the loop by 4 for better throughput
135
- size_t i = 0 ;
136
- const size_t unrollCount = vertexCount & ~3ull ; // Round down to multiple of 4
123
+ // Calculate radius - use dot product for distance calculation
124
+ __m128 vMaxDistSq = _mm_setzero_ps ();
137
125
138
- for (; i < unrollCount; i += 4 )
139
- {
140
- // Load 4 vertices at once
141
- const float * pos0 = p_vertices[i].position ;
142
- const float * pos1 = p_vertices[i + 1 ].position ;
143
- const float * pos2 = p_vertices[i + 2 ].position ;
144
- const float * pos3 = p_vertices[i + 3 ].position ;
145
-
146
- __m128 vPos0 = _mm_loadu_ps (pos0);
147
- __m128 vDiff0 = _mm_sub_ps (vPos0, vCenterXYZ);
148
- __m128 vDistSq0 = _mm_dp_ps (vDiff0, vDiff0, 0x77 ); // Dot product with mask 0x77 (sum xyz, store in all)
149
- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq0);
150
-
151
- __m128 vPos1 = _mm_loadu_ps (pos1);
152
- __m128 vDiff1 = _mm_sub_ps (vPos1, vCenterXYZ);
153
- __m128 vDistSq1 = _mm_dp_ps (vDiff1, vDiff1, 0x77 );
154
- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq1);
155
-
156
- __m128 vPos2 = _mm_loadu_ps (pos2);
157
- __m128 vDiff2 = _mm_sub_ps (vPos2, vCenterXYZ);
158
- __m128 vDistSq2 = _mm_dp_ps (vDiff2, vDiff2, 0x77 );
159
- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq2);
160
-
161
- __m128 vPos3 = _mm_loadu_ps (pos3);
162
- __m128 vDiff3 = _mm_sub_ps (vPos3, vCenterXYZ);
163
- __m128 vDistSq3 = _mm_dp_ps (vDiff3, vDiff3, 0x77 );
164
- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq3);
165
- }
126
+ // Pre-load center vector once outside the loop
127
+ const __m128 vCenterXYZ = _mm_setr_ps (
128
+ center.x ,
129
+ center.y ,
130
+ center.z ,
131
+ 0 .0f
132
+ );
166
133
167
- // Handle remaining vertices
168
- for (; i < vertexCount; ++i)
169
- {
170
- const float * pos = p_vertices[i].position ;
171
- __m128 vPos = _mm_loadu_ps (pos);
172
- __m128 vDiff = _mm_sub_ps (vPos, vCenterXYZ);
173
- __m128 vDistSq = _mm_dp_ps (vDiff, vDiff, 0x77 );
174
- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq);
175
- }
134
+ // Unroll the loop by 4 for better throughput
135
+ size_t i = 0 ;
136
+ const size_t unrollCount = vertexCount & ~3ull ; // Round down to multiple of 4
176
137
177
- // Extract radius (sqrt of max squared distance)
178
- float maxDistSq;
179
- _mm_store_ss (&maxDistSq, vMaxDistSq);
180
-
181
- return {
182
- .position = center,
183
- .radius = std::sqrt (maxDistSq)
184
- };
138
+ for (; i < unrollCount; i += 4 )
139
+ {
140
+ // Load 4 vertices at once
141
+ const float * pos0 = p_vertices[i].position ;
142
+ const float * pos1 = p_vertices[i + 1 ].position ;
143
+ const float * pos2 = p_vertices[i + 2 ].position ;
144
+ const float * pos3 = p_vertices[i + 3 ].position ;
145
+
146
+ __m128 vPos0 = _mm_loadu_ps (pos0);
147
+ __m128 vDiff0 = _mm_sub_ps (vPos0, vCenterXYZ);
148
+ __m128 vDistSq0 = _mm_dp_ps (vDiff0, vDiff0, 0x77 ); // Dot product with mask 0x77 (sum xyz, store in all)
149
+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq0);
150
+
151
+ __m128 vPos1 = _mm_loadu_ps (pos1);
152
+ __m128 vDiff1 = _mm_sub_ps (vPos1, vCenterXYZ);
153
+ __m128 vDistSq1 = _mm_dp_ps (vDiff1, vDiff1, 0x77 );
154
+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq1);
155
+
156
+ __m128 vPos2 = _mm_loadu_ps (pos2);
157
+ __m128 vDiff2 = _mm_sub_ps (vPos2, vCenterXYZ);
158
+ __m128 vDistSq2 = _mm_dp_ps (vDiff2, vDiff2, 0x77 );
159
+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq2);
160
+
161
+ __m128 vPos3 = _mm_loadu_ps (pos3);
162
+ __m128 vDiff3 = _mm_sub_ps (vPos3, vCenterXYZ);
163
+ __m128 vDistSq3 = _mm_dp_ps (vDiff3, vDiff3, 0x77 );
164
+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq3);
185
165
}
186
166
187
- OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular (std::span<const OvRendering::Geometry::Vertex> p_vertices)
167
+ // Handle remaining vertices
168
+ for (; i < vertexCount; ++i)
188
169
{
189
- auto result = OvRendering::Geometry::BoundingSphere{
190
- .position = OvMaths::FVector3::Zero,
191
- .radius = 0 .0f
192
- };
193
-
194
- if (!p_vertices.empty ())
195
- {
196
- float minX = std::numeric_limits<float >::max ();
197
- float minY = std::numeric_limits<float >::max ();
198
- float minZ = std::numeric_limits<float >::max ();
199
-
200
- float maxX = std::numeric_limits<float >::min ();
201
- float maxY = std::numeric_limits<float >::min ();
202
- float maxZ = std::numeric_limits<float >::min ();
203
-
204
- for (const auto & vertex : p_vertices)
205
- {
206
- minX = std::min (minX, vertex.position [0 ]);
207
- minY = std::min (minY, vertex.position [1 ]);
208
- minZ = std::min (minZ, vertex.position [2 ]);
209
-
210
- maxX = std::max (maxX, vertex.position [0 ]);
211
- maxY = std::max (maxY, vertex.position [1 ]);
212
- maxZ = std::max (maxZ, vertex.position [2 ]);
213
- }
214
-
215
- result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2 .0f ;
216
-
217
- for (const auto & vertex : p_vertices)
218
- {
219
- const auto & position = reinterpret_cast <const OvMaths::FVector3&>(vertex.position );
220
- result.radius = std::max (result.radius , OvMaths::FVector3::Distance (result.position , position));
221
- }
222
- }
223
-
224
- return result;
170
+ const float * pos = p_vertices[i].position ;
171
+ __m128 vPos = _mm_loadu_ps (pos);
172
+ __m128 vDiff = _mm_sub_ps (vPos, vCenterXYZ);
173
+ __m128 vDistSq = _mm_dp_ps (vDiff, vDiff, 0x77 );
174
+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq);
225
175
}
226
- }
227
176
228
- void OvRendering::Resources::Mesh::ComputeBoundingSphere (std::span< const Geometry::Vertex> p_vertices )
229
- {
230
- constexpr bool useSIMD = true ;
177
+ // Extract radius (sqrt of max squared distance )
178
+ float maxDistSq;
179
+ _mm_store_ss (&maxDistSq, vMaxDistSq) ;
231
180
232
- if constexpr (useSIMD)
233
- {
234
- m_boundingSphere = ComputeBoundingSphereSIMD (p_vertices);
235
- }
236
- else
237
- {
238
- m_boundingSphere = ComputeBoundingSphereRegular (p_vertices);
239
- }
181
+ m_boundingSphere = {
182
+ .position = center,
183
+ .radius = std::sqrt (maxDistSq)
184
+ };
240
185
}
0 commit comments