Skip to content

Commit 4746387

Browse files
committed
Remove the old less-than-8-bit requantization support.
Map the DefaultL7R5BitDepthParams setting to DefaultL8R8BitDepthParams so that this change does not break any user relying on it, since the whole point of requantization was to make this an implementation detail. Instead, from now on, using actual less-than-8-bit settings will only have the effect of selecting a different GEMM kernel. In other words, it is now the responsibility of the user to know the actual bit-depth (i.e. range) of their 8bit values. In exchange for that responsibility, users of lower-than-8-bit depths will enjoy the increased performance of corresponding kernels without the overhead of requantization.
1 parent bc3491e commit 4746387

File tree

12 files changed

+97
-668
lines changed

12 files changed

+97
-668
lines changed

doc/less-than-8-bit.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ O(N^3) compute stage. For large enough matrices, that should be worth it.
2525

2626
### The present
2727

TODO(benoitjacob): update this documentation. This 'present' state just
became the past (February 2017).
2831
At the moment, this less-than-8-bit mode of gemmlowp is not much used in
practice, because the implicit requantization of operands from 8bit to
less-than-8bit turned out to be more expensive than initially expected, both in

internal/multi_thread_gemm.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -449,13 +449,13 @@ struct GemmWithPackedRhsTask : Task {
449449
for (int r = 0; r < rows; r += block_params.l2_rows) {
450450
int rs = std::min(block_params.l2_rows, rows - r);
451451

452-
PackLhs<BitDepthParams>(&packed_lhs, lhs.block(r, 0, rs, depth));
452+
PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
453453

454454
Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);
455455

456456
auto curr_result_block = MatrixBlockBounds(
457457
result_block.start_row + r, result_block.start_col + c, rs, cs);
458-
UnpackResult<BitDepthParams>(&result, curr_result_block, packed_result,
458+
UnpackResult(&result, curr_result_block, packed_result,
459459
depth, packed_lhs.sums_of_each_slice(),
460460
packed_rhs.sums_of_each_slice(),
461461
lhs_offset, rhs_offset, output_pipeline);
@@ -637,7 +637,7 @@ void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
637637
int cs = std::min(block_params.l2_cols, cols - c);
638638

639639
// Pack a large block of the RHS.
640-
PackRhs<BitDepthParams>(&packed_rhs, rhs.block(0, c, depth, cs));
640+
PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
641641

642642
// Give work to each worker.
643643
int next_start_row = 0;

internal/pack.h

Lines changed: 20 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929

3030
#include <cstring>
3131

32-
#include "../public/bit_depth.h"
3332
#include "allocator.h"
3433
#include "block_params.h"
3534
#include "common.h"
@@ -188,94 +187,6 @@ class SideMap {
188187
int width_, depth_, stride_;
189188
};
190189

191-
template <RoundingMode tRoundingMode>
192-
class ScalarRoundingOffsetGenerator {
193-
public:
194-
std::uint8_t get() {
195-
assert(false); // This generic path should never be called.
196-
return 0;
197-
}
198-
};
199-
200-
// A RoundingOffsetGenerator for rounding-to-nearest, always returning
201-
// the midpoint value 127.
202-
template <>
203-
class ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
204-
public:
205-
std::uint8_t get() { return 127; }
206-
};
207-
208-
// A RoundingOffsetGenerator based on a 8-bit Xorshift.
209-
// This gives good results as Xorshift naturally generates
210-
// uniform random *nonzero* bytes i.e. 255 different values,
211-
// so it only remains for us to subtract one.
212-
template <>
213-
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
214-
public:
215-
ScalarRoundingOffsetGenerator() { x_ = 128; }
216-
217-
std::uint8_t get() {
218-
std::uint8_t result = x_ - 1;
219-
// Xorshift8(7,5,3)
220-
x_ ^= x_ << 7;
221-
x_ ^= x_ >> 5;
222-
x_ ^= x_ << 3;
223-
return result;
224-
}
225-
226-
private:
227-
// State
228-
std::uint8_t x_;
229-
};
230-
231-
// A RoundingOffsetGenerator based on an 8-bit add/mod
232-
// low-discrepancy sequence. See less-than-8-bit.txt for
233-
// an explanation (the constant 97 is important - it must
234-
// be both relatively prime to 255, in order for the sequence
235-
// to be full-period, and c/255 should be close to 0.38 to
236-
// obtain low discrepancy). Uses a small bit hack to avoid
237-
// expensive % operations.
238-
template <>
239-
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
240-
static const std::uint8_t AddConst = 97;
241-
242-
public:
243-
ScalarRoundingOffsetGenerator() { x_ = 1; } // Start must be non-zero
244-
245-
std::uint8_t get() {
246-
// The +'d boolean term causes the increment to skip over 255,
247-
// (recalling that 255+1 = 256 = 0 for an 8 bit uint),
248-
// thus implementing %255
249-
x_ += (AddConst + (x_ >= (255 - AddConst)));
250-
return x_;
251-
}
252-
253-
private:
254-
// State
255-
std::uint8_t x_;
256-
};
257-
258-
// Requantizes a source uint8 value in [0..255] range
259-
// to the range specified by BitDepth, [0..((2^bits)-1)].
260-
// Bias must be avoided. Currently this is achieved
261-
// by probabilistic rounding.
262-
template <typename QuantizationParams>
263-
std::uint8_t Requantize(
264-
std::uint8_t raw_src_val,
265-
ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
266-
rounding_offset_generator) {
267-
static const int kBits = QuantizationParams::BitDepth::kBits;
268-
static const std::uint8_t kMaxVal = (1 << kBits) - 1;
269-
270-
if (kBits == 8) {
271-
return raw_src_val;
272-
}
273-
274-
std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
275-
std::uint8_t rounding_offset = rounding_offset_generator->get();
276-
return (scaled + rounding_offset) / 255;
277-
}
278-
279190
// A PackingRegisterBlock is a small fixed-size block of a matrix being
280191
// packed. This class is the generic non-optimized implementation,
281192
// it is inherited by the generic implementation of PackingRegisterBlock,
@@ -292,7 +203,7 @@ std::uint8_t Requantize(
292203
// 2. Packing a complete block into the destination, see Pack. This is the
293204
// most critical part, so it's convenient that unaligned boundaries have
294205
// already been handled in step 1.
295-
template <typename QuantizationParams, typename SrcMapType,
206+
template <typename SrcMapType,
296207
typename PackedSideBlock>
297208
class PackingRegisterBlockBase {
298209
public:
@@ -305,9 +216,6 @@ class PackingRegisterBlockBase {
305216
static const int kCellSize = CellFormat::kSize;
306217
static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
307218

308-
typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
309-
RoundingOffsetGenerator;
310-
311219
PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}
312220

313221
protected:
@@ -344,8 +252,7 @@ class PackingRegisterBlockBase {
344252
// Packs a complete block into the destination. This is the most
345253
// critical part and the part that we most typically want to
346254
// override in architecture-specific optimized specializations.
347-
void Pack(PackedSideBlock* dst, int start_width,
348-
RoundingOffsetGenerator* rounding_offset_generator) {
255+
void Pack(PackedSideBlock* dst, int start_width) {
349256
std::uint8_t* dst_ptr = dst->current_data();
350257
for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
351258
cell_start_depth += kCellDepth) {
@@ -359,11 +266,9 @@ class PackingRegisterBlockBase {
359266
for (int w = 0; w < kCellWidth; w++) {
360267
std::int32_t sum = 0;
361268
for (int d = 0; d < kCellDepth; d++) {
362-
const std::uint8_t raw_src_val = src_cell_map(w, d);
363-
const std::uint8_t requantized = Requantize<QuantizationParams>(
364-
raw_src_val, rounding_offset_generator);
365-
dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
366-
sum += requantized;
269+
const std::uint8_t src_val = src_cell_map(w, d);
270+
dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = src_val;
271+
sum += src_val;
367272
}
368273
cell_sums_of_each_slice_ptr[w] += sum;
369274
}
@@ -374,14 +279,14 @@ class PackingRegisterBlockBase {
374279
}
375280
};
376281

377-
template <typename QuantizationParams, typename SrcMapType,
282+
template <typename SrcMapType,
378283
typename PackedSideBlock>
379284
class PackingRegisterBlock
380-
: public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
285+
: public PackingRegisterBlockBase<SrcMapType,
381286
PackedSideBlock> {};
382287

383288
// Large-scale implementation of packing.
384-
template <typename QuantizationParams, typename SrcMapType,
289+
template <typename SrcMapType,
385290
typename PackedSideBlock>
386291
class PackSideBlockImpl {
387292
public:
@@ -392,10 +297,8 @@ class PackSideBlockImpl {
392297
static const int kKernelWidth = CellFormat::kWidth * kCells;
393298
static const int kCellDepth = CellFormat::kDepth;
394299

395-
typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
300+
typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
396301
PackingRegisterBlockType;
397-
typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
398-
RoundingOffsetGenerator;
399302

400303
PackSideBlockImpl(PackedSideBlock* packed_side_block,
401304
const SrcMapType& src_map)
@@ -461,22 +364,22 @@ class PackSideBlockImpl {
461364
for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
462365
b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
463366
width, kRegisterSize));
464-
b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
367+
b.Pack(packed_side_block_, start_width);
465368
}
466369
}
467370
if (register_aligned_depth < depth) {
468371
b.MakeCompleteSrc(
469372
src_map_.block(start_width, start_depth + register_aligned_depth,
470373
width, depth - register_aligned_depth));
471-
b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
374+
b.Pack(packed_side_block_, start_width);
472375
}
473376
} else {
474377
assert(width < kKernelWidth);
475378
for (int d = 0; d < depth; d += kRegisterSize) {
476379
const int ds = std::min(+kRegisterSize, depth - d);
477380
b.MakeCompleteSrc(
478381
src_map_.block(start_width, start_depth + d, width, ds));
479-
b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
382+
b.Pack(packed_side_block_, start_width);
480383
}
481384
}
482385
}
@@ -487,23 +390,10 @@ class PackSideBlockImpl {
487390
// A map on the block of the original matrix block being packed,
488391
// i.e. the 'source'.
489392
const SrcMapType& src_map_;
490-
491-
// Used for requantization in the less-than-8-bit case.
492-
// Otherwise unused.
493-
RoundingOffsetGenerator rounding_offset_generator_;
494-
};
495-
496-
// Quantization parameters for the side (LHS or RHS) being packed,
497-
// with the rounding strategy having been already resolved to a specific
498-
// rounding mode.
499-
template <typename tBitDepth, RoundingMode tRoundingMode>
500-
struct QuantizationParams {
501-
typedef tBitDepth BitDepth;
502-
static const RoundingMode kRoundingMode = tRoundingMode;
503393
};
504394

505395
// Packs a block of the input LHS matrix, into a PackedSideBlock
506-
template <typename BitDepthParams, typename PackedSideBlock,
396+
template <typename PackedSideBlock,
507397
typename MatrixMapType>
508398
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
509399
ScopedProfilingLabel label("pack LHS");
@@ -513,28 +403,14 @@ void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
513403
typedef typename MatrixMapType::Scalar Scalar;
514404
typedef SideMap<Scalar, kSideMapOrder> SideMapType;
515405
SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
516-
typedef typename BitDepthParams::LhsBitDepth BitDepth;
517-
typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
518406
const int accumulation_depth = src_side_map.depth();
519-
if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
520-
typedef QuantizationParams<BitDepth,
521-
RoundingStrategy::kRoundingModeForSmallSizes>
522-
QParams;
523-
typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
524-
ImplType impl(dst, src_side_map);
525-
impl.PackL2();
526-
} else {
527-
typedef QuantizationParams<BitDepth,
528-
RoundingStrategy::kRoundingModeForLargeSizes>
529-
QParams;
530-
typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
531-
ImplType impl(dst, src_side_map);
532-
impl.PackL2();
533-
}
407+
typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
408+
ImplType impl(dst, src_side_map);
409+
impl.PackL2();
534410
}
535411

536412
// Packs a block of the input RHS matrix, into a PackedSideBlock
537-
template <typename BitDepthParams, typename PackedSideBlock,
413+
template <typename PackedSideBlock,
538414
typename MatrixMapType>
539415
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
540416
ScopedProfilingLabel label("pack RHS");
@@ -544,24 +420,10 @@ void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
544420
typedef typename MatrixMapType::Scalar Scalar;
545421
typedef SideMap<Scalar, kSideMapOrder> SideMapType;
546422
SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
547-
typedef typename BitDepthParams::RhsBitDepth BitDepth;
548-
typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
549423
const int accumulation_depth = src_side_map.depth();
550-
if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
551-
typedef QuantizationParams<BitDepth,
552-
RoundingStrategy::kRoundingModeForSmallSizes>
553-
QParams;
554-
typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
555-
ImplType impl(dst, src_side_map);
556-
impl.PackL2();
557-
} else {
558-
typedef QuantizationParams<BitDepth,
559-
RoundingStrategy::kRoundingModeForLargeSizes>
560-
QParams;
561-
typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
562-
ImplType impl(dst, src_side_map);
563-
impl.PackL2();
564-
}
424+
typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
425+
ImplType impl(dst, src_side_map);
426+
impl.PackL2();
565427
}
566428

567429
} // namespace gemmlowp

0 commit comments

Comments
 (0)