Skip to content

Commit b21b069

Browse files
Significantly accelerate (de)allocation by swapping the naive log2() with CLZ intrinsics (#11)
* Significantly accelerate (de)allocation by swapping the naïve log2 with CLZ intrinsics * Remove an obsolete statement from the intro
1 parent 088f09c commit b21b069

File tree

6 files changed

+128
-62
lines changed

6 files changed

+128
-62
lines changed

README.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ due to the fact that a memory allocator has to rely on inherently unsafe operati
1919
The codebase is extremely compact (<500 LoC) and is therefore trivial to validate.
2020

2121
The allocator is designed to be portable across all conventional architectures, from 8-bit to 64-bit systems.
22-
Multi-threaded environments are supported with the help of external synchronization hooks provided by the application.
2322

2423
## Design
2524

@@ -199,6 +198,18 @@ If not specified, the macro expands as follows:
199198
For example, for GCC, Clang, and ARM Compiler, it expands into `__builtin_expect((x), 1)`.
200199
- For other (unknown) compilers it expands into the original expression with no modifications: `(x)`.
201200

201+
#### O1HEAP_CLZ(x)
202+
203+
The count leading zeros (CLZ) function is used for fast binary logarithm computation (which has to be done
204+
multiple times per allocation, so its performance is critical).
205+
Most modern processors implement dedicated hardware support for fast CLZ computation,
206+
which is available via compiler intrinsics.
207+
208+
If not overridden by the user, for some compilers `O1HEAP_CLZ(x)` will expand to the appropriate intrinsic
209+
(e.g., `__builtin_clzl(x)` for GCC/Clang).
210+
For other compilers it will default to a slow software implementation,
211+
which is likely to significantly degrade the performance of the library.
212+
202213
## Development
203214

204215
### Dependencies
@@ -257,6 +268,13 @@ An exception applies for the case of false-positive (invalid) warnings -- those
257268

258269
## Changelog
259270

271+
### v2.1
272+
273+
- Significantly accelerate (de-)allocation by replacing the naïve log2 implementation with fast CLZ intrinsics;
274+
see `O1HEAP_CLZ(x)`.
275+
- Do not require char to be 8-bit wide: replace `uint8_t` with `uint_fast8_t` (and `uint8_t*` with `char*` in pointer arithmetic).
276+
This is to enhance compatibility with odd embedded platforms where `CHAR_BIT!=8` (e.g., ADSP TS-201, TMS320C2804).
277+
260278
### v2.0
261279

262280
- Remove critical section hooks to enhance MISRA conformance [#4](https://github.com/pavel-kirienko/o1heap/issues/4)

o1heap/o1heap.c

Lines changed: 65 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include "o1heap.h"
1818
#include <assert.h>
19+
#include <limits.h>
1920

2021
// ---------------------------------------- BUILD CONFIGURATION OPTIONS ----------------------------------------
2122

@@ -32,21 +33,52 @@
3233
# define O1HEAP_ASSERT(x) assert(x) // NOSONAR
3334
#endif
3435

36+
/// Allow usage of compiler intrinsics for branch annotation and CLZ.
37+
#ifndef O1HEAP_USE_INTRINSICS
38+
# define O1HEAP_USE_INTRINSICS 1
39+
#endif
40+
3541
/// Branch probability annotations are used to improve the worst case execution time (WCET). They are entirely optional.
36-
#ifndef O1HEAP_LIKELY
42+
#if O1HEAP_USE_INTRINSICS && !defined(O1HEAP_LIKELY)
3743
# if defined(__GNUC__) || defined(__clang__) || defined(__CC_ARM)
3844
// Intentional violation of MISRA: branch hinting macro cannot be replaced with a function definition.
3945
# define O1HEAP_LIKELY(x) __builtin_expect((x), 1) // NOSONAR
40-
# else
41-
# define O1HEAP_LIKELY(x) x
4246
# endif
4347
#endif
48+
#ifndef O1HEAP_LIKELY
49+
// Fallback used when no branch-hinting intrinsic is available (or intrinsics are disabled).
// The argument is parenthesized so that the expansion always behaves as a single operand,
// exactly like the __builtin_expect() form; a bare `x` could regroup with adjacent operators.
# define O1HEAP_LIKELY(x) (x)
50+
#endif
4451

4552
/// This option is used for testing only. Do not use in production.
4653
#ifndef O1HEAP_PRIVATE
4754
# define O1HEAP_PRIVATE static inline
4855
#endif
4956

57+
/// Count leading zeros (CLZ) is used for fast computation of binary logarithm (which needs to be done very often).
58+
/// Most modern processors (including embedded ones) implement dedicated hardware support for fast CLZ
59+
/// computation, which is available via compiler intrinsics. The default implementation will automatically use
60+
/// the intrinsics for some of the compilers; for others it will default to the slow software emulation,
61+
/// which can be overridden by the user via O1HEAP_CONFIG_HEADER. The library guarantees that the argument is positive.
62+
#if O1HEAP_USE_INTRINSICS && !defined(O1HEAP_CLZ)
63+
# if defined(__GNUC__) || defined(__clang__) || defined(__CC_ARM)
64+
// The callers compute (sizeof(size_t) * CHAR_BIT) - CLZ(x), so the intrinsic must count leading zeros
// relative to the width of size_t. __builtin_clzl() operates on unsigned long, whose width need not match
// size_t (e.g., 64-bit size_t with 32-bit long, or 16-bit size_t with 32-bit long).
// Select the intrinsic whose operand width equals that of size_t; if none matches, O1HEAP_CLZ is left
// undefined so that the portable software implementation is used instead.
#  if SIZE_MAX == UINT_MAX
#   define O1HEAP_CLZ __builtin_clz
#  elif SIZE_MAX == ULONG_MAX
#   define O1HEAP_CLZ __builtin_clzl
#  elif SIZE_MAX == ULLONG_MAX
#   define O1HEAP_CLZ __builtin_clzll
#  endif
65+
# endif
66+
#endif
67+
#ifndef O1HEAP_CLZ
68+
O1HEAP_PRIVATE uint_fast8_t O1HEAP_CLZ(const size_t x)
69+
{
70+
O1HEAP_ASSERT(x > 0);
71+
size_t t = ((size_t) 1U) << ((sizeof(size_t) * CHAR_BIT) - 1U);
72+
uint_fast8_t r = 0;
73+
while ((x & t) == 0)
74+
{
75+
t >>= 1U;
76+
r++;
77+
}
78+
return r;
79+
}
80+
#endif
81+
5082
// ---------------------------------------- INTERNAL DEFINITIONS ----------------------------------------
5183

5284
#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L)
@@ -72,7 +104,7 @@
72104

73105
/// Normally we should subtract log2(FRAGMENT_SIZE_MIN) but log2 is bulky to compute using the preprocessor only.
74106
/// We will certainly end up with unused bins this way, but it is cheap to ignore.
75-
#define NUM_BINS_MAX (sizeof(size_t) * 8U)
107+
#define NUM_BINS_MAX (sizeof(size_t) * CHAR_BIT)
76108

77109
static_assert((O1HEAP_ALIGNMENT & (O1HEAP_ALIGNMENT - 1U)) == 0U, "Not a power of 2");
78110
static_assert((FRAGMENT_SIZE_MIN & (FRAGMENT_SIZE_MIN - 1U)) == 0U, "Not a power of 2");
@@ -113,41 +145,37 @@ struct O1HeapInstance
113145
static_assert(INSTANCE_SIZE_PADDED >= sizeof(O1HeapInstance), "Invalid instance footprint computation");
114146
static_assert((INSTANCE_SIZE_PADDED % O1HEAP_ALIGNMENT) == 0U, "Invalid instance footprint computation");
115147

116-
/// True if the argument is an integer power of two or zero.
117-
O1HEAP_PRIVATE bool isPowerOf2(const size_t x)
148+
/// Undefined for zero argument.
149+
O1HEAP_PRIVATE uint_fast8_t log2Floor(const size_t x)
118150
{
119-
return (x & (x - 1U)) == 0U;
151+
O1HEAP_ASSERT(x > 0);
152+
// NOLINTNEXTLINE redundant cast to the same type.
153+
return (uint_fast8_t) (((sizeof(x) * CHAR_BIT) - 1U) - ((uint_fast8_t) O1HEAP_CLZ(x)));
120154
}
121155

122156
/// Special case: if the argument is zero, returns zero.
123-
O1HEAP_PRIVATE uint8_t log2Floor(const size_t x)
157+
O1HEAP_PRIVATE uint_fast8_t log2Ceil(const size_t x)
124158
{
125-
size_t tmp = x;
126-
uint8_t y = 0;
127-
// This is currently the only exception to the statement "routines contain neither loops nor recursion".
128-
// It is unclear if there is a better way to compute the binary logarithm than this.
129-
while (tmp > 1U)
130-
{
131-
tmp >>= 1U;
132-
y++;
133-
}
134-
return y;
135-
}
136-
137-
/// Special case: if the argument is zero, returns zero.
138-
O1HEAP_PRIVATE uint8_t log2Ceil(const size_t x)
139-
{
140-
return (uint8_t) (log2Floor(x) + (isPowerOf2(x) ? 0U : 1U));
159+
// NOLINTNEXTLINE redundant cast to the same type.
160+
return (x <= 1U) ? 0U : (uint_fast8_t) ((sizeof(x) * CHAR_BIT) - ((uint_fast8_t) O1HEAP_CLZ(x - 1U)));
141161
}
142162

143163
/// Raise 2 into the specified power.
144164
/// You might be tempted to do something like (1U << power). WRONG! We humans are prone to forgetting things.
145165
/// If you forget to cast your 1U to size_t or ULL, you may end up with undefined behavior.
146-
O1HEAP_PRIVATE size_t pow2(const uint8_t power)
166+
O1HEAP_PRIVATE size_t pow2(const uint_fast8_t power)
147167
{
148168
return ((size_t) 1U) << power;
149169
}
150170

171+
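/// This is equivalent to pow2(log2Ceil(x)). Undefined for x<2 and for x greater than the largest power of 2 representable in size_t (the shift count would reach the full bit width).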
172+
O1HEAP_PRIVATE size_t roundUpToPowerOf2(const size_t x)
173+
{
174+
O1HEAP_ASSERT(x >= 2U);
175+
// NOLINTNEXTLINE redundant cast to the same type.
176+
return ((size_t) 1U) << ((sizeof(x) * CHAR_BIT) - ((uint_fast8_t) O1HEAP_CLZ(x - 1U)));
177+
}
178+
151179
/// Links two fragments so that their next/prev pointers point to each other; left goes before right.
152180
O1HEAP_PRIVATE void interlink(Fragment* const left, Fragment* const right)
153181
{
@@ -168,7 +196,7 @@ O1HEAP_PRIVATE void rebin(O1HeapInstance* const handle, Fragment* const fragment
168196
O1HEAP_ASSERT(fragment != NULL);
169197
O1HEAP_ASSERT(fragment->header.size >= FRAGMENT_SIZE_MIN);
170198
O1HEAP_ASSERT((fragment->header.size % FRAGMENT_SIZE_MIN) == 0U);
171-
const uint8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when inserting.
199+
const uint_fast8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when inserting.
172200
O1HEAP_ASSERT(idx < NUM_BINS_MAX);
173201
// Add the new fragment to the beginning of the bin list.
174202
// I.e., each allocation will be returning the most-recently-used fragment -- good for caching.
@@ -189,7 +217,7 @@ O1HEAP_PRIVATE void unbin(O1HeapInstance* const handle, const Fragment* const fr
189217
O1HEAP_ASSERT(fragment != NULL);
190218
O1HEAP_ASSERT(fragment->header.size >= FRAGMENT_SIZE_MIN);
191219
O1HEAP_ASSERT((fragment->header.size % FRAGMENT_SIZE_MIN) == 0U);
192-
const uint8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when removing.
220+
const uint_fast8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when removing.
193221
O1HEAP_ASSERT(idx < NUM_BINS_MAX);
194222
// Remove the bin from the free fragment list.
195223
if (O1HEAP_LIKELY(fragment->next_free != NULL))
@@ -244,7 +272,7 @@ O1HeapInstance* o1heapInit(void* const base, const size_t size)
244272
O1HEAP_ASSERT((capacity >= FRAGMENT_SIZE_MIN) && (capacity <= FRAGMENT_SIZE_MAX));
245273

246274
// Initialize the root fragment.
247-
Fragment* const frag = (Fragment*) (void*) (((uint8_t*) base) + INSTANCE_SIZE_PADDED);
275+
Fragment* const frag = (Fragment*) (void*) (((char*) base) + INSTANCE_SIZE_PADDED);
248276
O1HEAP_ASSERT((((size_t) frag) % O1HEAP_ALIGNMENT) == 0U);
249277
frag->header.next = NULL;
250278
frag->header.prev = NULL;
@@ -279,13 +307,13 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount)
279307
{
280308
// Add the header size and align the allocation size to the power of 2.
281309
// See "Timing-Predictable Memory Allocation In Hard Real-Time Systems", Herter, page 27.
282-
const size_t fragment_size = pow2(log2Ceil(amount + O1HEAP_ALIGNMENT));
310+
const size_t fragment_size = roundUpToPowerOf2(amount + O1HEAP_ALIGNMENT);
283311
O1HEAP_ASSERT(fragment_size <= FRAGMENT_SIZE_MAX);
284312
O1HEAP_ASSERT(fragment_size >= FRAGMENT_SIZE_MIN);
285313
O1HEAP_ASSERT(fragment_size >= amount + O1HEAP_ALIGNMENT);
286-
O1HEAP_ASSERT(isPowerOf2(fragment_size));
314+
O1HEAP_ASSERT((fragment_size & (fragment_size - 1U)) == 0U); // Is power of 2.
287315

288-
const uint8_t optimal_bin_index = log2Ceil(fragment_size / FRAGMENT_SIZE_MIN); // Use CEIL when fetching.
316+
const uint_fast8_t optimal_bin_index = log2Ceil(fragment_size / FRAGMENT_SIZE_MIN); // Use CEIL when fetching.
289317
O1HEAP_ASSERT(optimal_bin_index < NUM_BINS_MAX);
290318
const size_t candidate_bin_mask = ~(pow2(optimal_bin_index) - 1U);
291319

@@ -294,8 +322,8 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount)
294322
const size_t smallest_bin_mask = suitable_bins & ~(suitable_bins - 1U); // Clear all bits but the lowest.
295323
if (O1HEAP_LIKELY(smallest_bin_mask != 0))
296324
{
297-
O1HEAP_ASSERT(isPowerOf2(smallest_bin_mask));
298-
const uint8_t bin_index = log2Floor(smallest_bin_mask);
325+
O1HEAP_ASSERT((smallest_bin_mask & (smallest_bin_mask - 1U)) == 0U); // Is power of 2.
326+
const uint_fast8_t bin_index = log2Floor(smallest_bin_mask);
299327
O1HEAP_ASSERT(bin_index >= optimal_bin_index);
300328
O1HEAP_ASSERT(bin_index < NUM_BINS_MAX);
301329

@@ -314,7 +342,7 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount)
314342
O1HEAP_ASSERT(leftover % FRAGMENT_SIZE_MIN == 0U); // Alignment check.
315343
if (O1HEAP_LIKELY(leftover >= FRAGMENT_SIZE_MIN))
316344
{
317-
Fragment* const new_frag = (Fragment*) (void*) (((uint8_t*) frag) + fragment_size);
345+
Fragment* const new_frag = (Fragment*) (void*) (((char*) frag) + fragment_size);
318346
O1HEAP_ASSERT(((size_t) new_frag) % O1HEAP_ALIGNMENT == 0U);
319347
new_frag->header.size = leftover;
320348
new_frag->header.used = false;
@@ -336,7 +364,7 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount)
336364
O1HEAP_ASSERT(frag->header.size >= amount + O1HEAP_ALIGNMENT);
337365
frag->header.used = true;
338366

339-
out = ((uint8_t*) frag) + O1HEAP_ALIGNMENT;
367+
out = ((char*) frag) + O1HEAP_ALIGNMENT;
340368
}
341369
}
342370

@@ -359,7 +387,7 @@ void o1heapFree(O1HeapInstance* const handle, void* const pointer)
359387
O1HEAP_ASSERT(handle->diagnostics.capacity <= FRAGMENT_SIZE_MAX);
360388
if (O1HEAP_LIKELY(pointer != NULL)) // NULL pointer is a no-op.
361389
{
362-
Fragment* const frag = (Fragment*) (void*) (((uint8_t*) pointer) - O1HEAP_ALIGNMENT);
390+
Fragment* const frag = (Fragment*) (void*) (((char*) pointer) - O1HEAP_ALIGNMENT);
363391

364392
// Check for heap corruption in debug builds.
365393
O1HEAP_ASSERT(((size_t) frag) % sizeof(Fragment*) == 0U);
@@ -429,7 +457,7 @@ bool o1heapDoInvariantsHold(const O1HeapInstance* const handle)
429457
// Check the bin mask consistency.
430458
for (size_t i = 0; i < NUM_BINS_MAX; i++) // Dear compiler, feel free to unroll this loop.
431459
{
432-
const bool mask_bit_set = (handle->nonempty_bin_mask & pow2((uint8_t) i)) != 0U;
460+
const bool mask_bit_set = (handle->nonempty_bin_mask & pow2((uint_fast8_t) i)) != 0U;
433461
const bool bin_nonempty = handle->bins[i] != NULL;
434462
valid = valid && (mask_bit_set == bin_nonempty);
435463
}

tests/.idea/dictionaries/pavel.xml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/CMakeLists.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,16 @@ function(gen_test name files compile_definitions compile_features compile_flags
7878
add_test("run_${name}" "${name}" --rng-seed time)
7979
endfunction()
8080

81-
function(gen_test_matrix name files compile_definitions)
82-
gen_test("${name}_c99_x64" "${files}" "${compile_definitions}" c_std_99 "-m64" "-m64")
83-
gen_test("${name}_c99_x32" "${files}" "${compile_definitions}" c_std_99 "-m32" "-m32")
84-
gen_test("${name}_c11_x64" "${files}" "${compile_definitions}" c_std_11 "-m64" "-m64")
85-
gen_test("${name}_c11_x32" "${files}" "${compile_definitions}" c_std_11 "-m32" "-m32")
81+
function(gen_test_matrix name files defs)
82+
gen_test("${name}_c99_x64" "${files}" "${defs}" c_std_99 "-m64" "-m64")
83+
gen_test("${name}_c99_x32" "${files}" "${defs}" c_std_99 "-m32" "-m32")
84+
gen_test("${name}_c11_x64" "${files}" "${defs}" c_std_11 "-m64" "-m64")
85+
gen_test("${name}_c11_x32" "${files}" "${defs}" c_std_11 "-m32" "-m32")
86+
gen_test("${name}_c11_x64_ni" "${files}" "${defs};O1HEAP_USE_INTRINSICS=0" c_std_11 "-m64" "-m64")
87+
gen_test("${name}_c11_x32_ni" "${files}" "${defs};O1HEAP_USE_INTRINSICS=0" c_std_11 "-m32" "-m32")
8688
# Coverage is only available for GCC builds.
8789
if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_BUILD_TYPE STREQUAL "Debug"))
88-
gen_test("${name}_cov" "${files}" "${compile_definitions}" c_std_11 "-g -O0 --coverage" "--coverage")
90+
gen_test("${name}_cov" "${files}" "${defs}" c_std_11 "-g -O0 --coverage" "--coverage")
8991
endif ()
9092
endfunction()
9193

tests/internal.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@
3434
namespace internal
3535
{
3636
extern "C" {
37-
auto isPowerOf2(const std::size_t x) -> bool;
3837
// Must mirror the C definition, which returns uint_fast8_t (not necessarily the same type as uint8_t).
auto log2Floor(const std::size_t x) -> std::uint_fast8_t;
3938
// Must mirror the C definition, which returns uint_fast8_t (not necessarily the same type as uint8_t).
auto log2Ceil(const std::size_t x) -> std::uint_fast8_t;
4039
// Must mirror the C definition, whose parameter is uint_fast8_t (not necessarily the same type as uint8_t).
auto pow2(const std::uint_fast8_t power) -> std::size_t;
40+
auto roundUpToPowerOf2(const std::size_t x) -> std::size_t;
4141
}
4242

4343
struct Fragment;

tests/test_private.cpp

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,11 @@
1616

1717
#include "internal.hpp"
1818

19-
TEST_CASE("Private: isPowerOf2")
20-
{
21-
using internal::isPowerOf2;
22-
REQUIRE(isPowerOf2(0)); // Special case.
23-
REQUIRE(isPowerOf2(1)); // 2**0
24-
REQUIRE(isPowerOf2(2)); // 2**1
25-
REQUIRE(!isPowerOf2(3));
26-
REQUIRE(isPowerOf2(4));
27-
REQUIRE(!isPowerOf2(5));
28-
REQUIRE(!isPowerOf2(6));
29-
REQUIRE(!isPowerOf2(7));
30-
REQUIRE(isPowerOf2(8));
31-
REQUIRE(!isPowerOf2(9));
32-
}
33-
3419
TEST_CASE("Private: log2")
3520
{
3621
using internal::log2Floor;
3722
using internal::log2Ceil;
38-
REQUIRE(log2Floor(0) == 0);
23+
// The function is only defined for x>=1.
3924
REQUIRE(log2Floor(1) == 0);
4025
REQUIRE(log2Floor(2) == 1);
4126
REQUIRE(log2Floor(3) == 1);
@@ -44,7 +29,7 @@ TEST_CASE("Private: log2")
4429
REQUIRE(log2Floor(60) == 5);
4530
REQUIRE(log2Floor(64) == 6);
4631

47-
REQUIRE(log2Ceil(0) == 0);
32+
REQUIRE(log2Ceil(0) == 0); // Special case.
4833
REQUIRE(log2Ceil(1) == 0);
4934
REQUIRE(log2Ceil(2) == 1);
5035
REQUIRE(log2Ceil(3) == 2);
@@ -68,3 +53,34 @@ TEST_CASE("Private: pow2")
6853
REQUIRE(pow2(8) == 256);
6954
REQUIRE(pow2(9) == 512);
7055
}
56+
57+
TEST_CASE("Private: roundUpToPowerOf2")
58+
{
59+
using internal::log2Ceil;
60+
using internal::pow2;
61+
using internal::roundUpToPowerOf2;
62+
// The function is only defined for x>=2.
63+
REQUIRE(roundUpToPowerOf2(2) == 2);
64+
REQUIRE(roundUpToPowerOf2(3) == 4);
65+
REQUIRE(roundUpToPowerOf2(4) == 4);
66+
REQUIRE(roundUpToPowerOf2(5) == 8);
67+
REQUIRE(roundUpToPowerOf2(6) == 8);
68+
REQUIRE(roundUpToPowerOf2(7) == 8);
69+
REQUIRE(roundUpToPowerOf2(8) == 8);
70+
REQUIRE(roundUpToPowerOf2(9) == 16);
71+
REQUIRE(roundUpToPowerOf2(10) == 16);
72+
REQUIRE(roundUpToPowerOf2(11) == 16);
73+
REQUIRE(roundUpToPowerOf2(12) == 16);
74+
REQUIRE(roundUpToPowerOf2(13) == 16);
75+
REQUIRE(roundUpToPowerOf2(14) == 16);
76+
REQUIRE(roundUpToPowerOf2(15) == 16);
77+
REQUIRE(roundUpToPowerOf2(16) == 16);
78+
REQUIRE(roundUpToPowerOf2(17) == 32);
79+
REQUIRE(roundUpToPowerOf2(32) == 32);
80+
REQUIRE(roundUpToPowerOf2(2147483647U) == 2147483648U);
81+
REQUIRE(roundUpToPowerOf2(2147483648U) == 2147483648U);
82+
for (auto i = 2U; i < 1'000'000; i++)
83+
{
84+
REQUIRE(pow2(log2Ceil(i)) == roundUpToPowerOf2(i));
85+
}
86+
}

0 commit comments

Comments
 (0)