|
5 | 5 | */
|
6 | 6 |
|
7 | 7 | #include <array>
|
| 8 | +#include <intrin.h> |
8 | 9 |
|
9 | 10 | #include <OvDebug/Logger.h>
|
10 | 11 | #include <OvRendering/Resources/Mesh.h>
|
@@ -81,38 +82,159 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve
|
81 | 82 | }
|
82 | 83 | }
|
83 | 84 |
|
84 |
| -void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices) |
| 85 | +namespace |
85 | 86 | {
|
86 |
| - m_boundingSphere.position = OvMaths::FVector3::Zero; |
87 |
| - m_boundingSphere.radius = 0.0f; |
88 |
| - |
89 |
| - if (!p_vertices.empty()) |
| 87 | + OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD(std::span<const OvRendering::Geometry::Vertex> p_vertices) |
90 | 88 | {
|
91 |
| - float minX = std::numeric_limits<float>::max(); |
92 |
| - float minY = std::numeric_limits<float>::max(); |
93 |
| - float minZ = std::numeric_limits<float>::max(); |
| 89 | + const size_t vertexCount = p_vertices.size(); |
| 90 | + |
| 91 | + if (vertexCount == 0) |
| 92 | + { |
| 93 | + return { |
| 94 | + .position = OvMaths::FVector3::Zero, |
| 95 | + .radius = 0.0f |
| 96 | + }; |
| 97 | + } |
94 | 98 |
|
95 |
| - float maxX = std::numeric_limits<float>::min(); |
96 |
| - float maxY = std::numeric_limits<float>::min(); |
97 |
| - float maxZ = std::numeric_limits<float>::min(); |
| 99 | + // Initialize SIMD registers for min/max with first vertex values |
| 100 | + __m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX); |
| 101 | + __m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX); |
98 | 102 |
|
99 |
| - for (const auto& vertex : p_vertices) |
| 103 | + // Process all vertices in one loop to find min/max |
| 104 | + for (size_t i = 1; i < vertexCount; ++i) |
100 | 105 | {
|
101 |
| - minX = std::min(minX, vertex.position[0]); |
102 |
| - minY = std::min(minY, vertex.position[1]); |
103 |
| - minZ = std::min(minZ, vertex.position[2]); |
| 106 | + // Load vertex position directly - assumes position is aligned properly |
| 107 | + const float* posPtr = p_vertices[i].position; |
| 108 | + __m128 vPos = _mm_loadu_ps(posPtr); // Using loadu in case it's not 16-byte aligned |
| 109 | + |
| 110 | + // Update min and max in one pass |
| 111 | + vMinXYZ = _mm_min_ps(vMinXYZ, vPos); |
| 112 | + vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos); |
| 113 | + } |
| 114 | + |
| 115 | + // Calculate center = (min + max) * 0.5 |
| 116 | + __m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f)); |
| 117 | + |
| 118 | + // Store center position |
| 119 | + float centerArr[4]; |
| 120 | + _mm_store_ps(centerArr, vCenter); |
| 121 | + auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] }; |
| 122 | + |
| 123 | + // Calculate radius - use dot product for distance calculation |
| 124 | + __m128 vMaxDistSq = _mm_setzero_ps(); |
| 125 | + |
| 126 | + // Pre-load center vector once outside the loop |
| 127 | + const __m128 vCenterXYZ = _mm_setr_ps( |
| 128 | + center.x, |
| 129 | + center.y, |
| 130 | + center.z, |
| 131 | + 0.0f |
| 132 | + ); |
| 133 | + |
| 134 | + // Unroll the loop by 4 for better throughput |
| 135 | + size_t i = 0; |
| 136 | + const size_t unrollCount = vertexCount & ~3ull; // Round down to multiple of 4 |
| 137 | + |
| 138 | + for (; i < unrollCount; i += 4) |
| 139 | + { |
| 140 | + // Load 4 vertices at once |
| 141 | + const float* pos0 = p_vertices[i].position; |
| 142 | + const float* pos1 = p_vertices[i + 1].position; |
| 143 | + const float* pos2 = p_vertices[i + 2].position; |
| 144 | + const float* pos3 = p_vertices[i + 3].position; |
| 145 | + |
| 146 | + __m128 vPos0 = _mm_loadu_ps(pos0); |
| 147 | + __m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ); |
| 148 | + __m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Dot product with mask 0x77 (sum xyz, store in all) |
| 149 | + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0); |
| 150 | + |
| 151 | + __m128 vPos1 = _mm_loadu_ps(pos1); |
| 152 | + __m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ); |
| 153 | + __m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77); |
| 154 | + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1); |
| 155 | + |
| 156 | + __m128 vPos2 = _mm_loadu_ps(pos2); |
| 157 | + __m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ); |
| 158 | + __m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77); |
| 159 | + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2); |
| 160 | + |
| 161 | + __m128 vPos3 = _mm_loadu_ps(pos3); |
| 162 | + __m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ); |
| 163 | + __m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77); |
| 164 | + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3); |
| 165 | + } |
104 | 166 |
|
105 |
| - maxX = std::max(maxX, vertex.position[0]); |
106 |
| - maxY = std::max(maxY, vertex.position[1]); |
107 |
| - maxZ = std::max(maxZ, vertex.position[2]); |
| 167 | + // Handle remaining vertices |
| 168 | + for (; i < vertexCount; ++i) |
| 169 | + { |
| 170 | + const float* pos = p_vertices[i].position; |
| 171 | + __m128 vPos = _mm_loadu_ps(pos); |
| 172 | + __m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ); |
| 173 | + __m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77); |
| 174 | + vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq); |
108 | 175 | }
|
109 | 176 |
|
110 |
| - m_boundingSphere.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f; |
| 177 | + // Extract radius (sqrt of max squared distance) |
| 178 | + float maxDistSq; |
| 179 | + _mm_store_ss(&maxDistSq, vMaxDistSq); |
| 180 | + |
| 181 | + return { |
| 182 | + .position = center, |
| 183 | + .radius = std::sqrt(maxDistSq) |
| 184 | + }; |
| 185 | + } |
| 186 | + |
| 187 | + OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular(std::span<const OvRendering::Geometry::Vertex> p_vertices) |
| 188 | + { |
| 189 | + auto result = OvRendering::Geometry::BoundingSphere{ |
| 190 | + .position = OvMaths::FVector3::Zero, |
| 191 | + .radius = 0.0f |
| 192 | + }; |
111 | 193 |
|
112 |
| - for (const auto& vertex : p_vertices) |
| 194 | + if (!p_vertices.empty()) |
113 | 195 | {
|
114 |
| - const auto& position = reinterpret_cast<const OvMaths::FVector3&>(vertex.position); |
115 |
| - m_boundingSphere.radius = std::max(m_boundingSphere.radius, OvMaths::FVector3::Distance(m_boundingSphere.position, position)); |
| 196 | + float minX = std::numeric_limits<float>::max(); |
| 197 | + float minY = std::numeric_limits<float>::max(); |
| 198 | + float minZ = std::numeric_limits<float>::max(); |
| 199 | + |
| 200 | + float maxX = std::numeric_limits<float>::min(); |
| 201 | + float maxY = std::numeric_limits<float>::min(); |
| 202 | + float maxZ = std::numeric_limits<float>::min(); |
| 203 | + |
| 204 | + for (const auto& vertex : p_vertices) |
| 205 | + { |
| 206 | + minX = std::min(minX, vertex.position[0]); |
| 207 | + minY = std::min(minY, vertex.position[1]); |
| 208 | + minZ = std::min(minZ, vertex.position[2]); |
| 209 | + |
| 210 | + maxX = std::max(maxX, vertex.position[0]); |
| 211 | + maxY = std::max(maxY, vertex.position[1]); |
| 212 | + maxZ = std::max(maxZ, vertex.position[2]); |
| 213 | + } |
| 214 | + |
| 215 | + result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f; |
| 216 | + |
| 217 | + for (const auto& vertex : p_vertices) |
| 218 | + { |
| 219 | + const auto& position = reinterpret_cast<const OvMaths::FVector3&>(vertex.position); |
| 220 | + result.radius = std::max(result.radius, OvMaths::FVector3::Distance(result.position, position)); |
| 221 | + } |
116 | 222 | }
|
| 223 | + |
| 224 | + return result; |
| 225 | + } |
| 226 | +} |
| 227 | + |
| 228 | +void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span<const Geometry::Vertex> p_vertices) |
| 229 | +{ |
| 230 | + constexpr bool useSIMD = true; |
| 231 | + |
| 232 | + if constexpr (useSIMD) |
| 233 | + { |
| 234 | + m_boundingSphere = ComputeBoundingSphereSIMD(p_vertices); |
| 235 | + } |
| 236 | + else |
| 237 | + { |
| 238 | + m_boundingSphere = ComputeBoundingSphereRegular(p_vertices); |
117 | 239 | }
|
118 | 240 | }
|
0 commit comments