29
29
30
30
#include < cstring>
31
31
32
- #include " ../public/bit_depth.h"
33
32
#include " allocator.h"
34
33
#include " block_params.h"
35
34
#include " common.h"
@@ -188,94 +187,6 @@ class SideMap {
188
187
int width_, depth_, stride_;
189
188
};
190
189
191
- template <RoundingMode tRoundingMode>
192
- class ScalarRoundingOffsetGenerator {
193
- public:
194
- std::uint8_t get () {
195
- assert (false ); // This generic path should never be called.
196
- return 0 ;
197
- }
198
- };
199
-
200
- // A RoundingOffsetGenerator for rounding-to-nearest, always returning
201
- // the midpoint value 127.
202
- template <>
203
- class ScalarRoundingOffsetGenerator <RoundingMode::Nearest> {
204
- public:
205
- std::uint8_t get () { return 127 ; }
206
- };
207
-
208
- // A RoundingOffsetGenerator based on a 8-bit Xorshift.
209
- // This gives good results as Xorshift naturally generates
210
- // uniform random *nonzero* bytes i.e. 255 different values,
211
- // so it only remains for us to subtract one.
212
- template <>
213
- class ScalarRoundingOffsetGenerator <RoundingMode::ProbabilisticXorshift> {
214
- public:
215
- ScalarRoundingOffsetGenerator () { x_ = 128 ; }
216
-
217
- std::uint8_t get () {
218
- std::uint8_t result = x_ - 1 ;
219
- // Xorshift8(7,5,3)
220
- x_ ^= x_ << 7 ;
221
- x_ ^= x_ >> 5 ;
222
- x_ ^= x_ << 3 ;
223
- return result;
224
- }
225
-
226
- private:
227
- // State
228
- std::uint8_t x_;
229
- };
230
-
231
- // A RoundingOffsetGenerator based on an 8-bit add/mod
232
- // low-discrepancy sequence. See less-than-8-bit.txt for
233
- // an explanation (the constant 97 is important - it must
234
- // be both relatively prime to 255, in order for the sequence
235
- // to be full-period, and c/255 should be close to 0.38 to
236
- // obtain low discrepancy). Uses a small bit hack to avoid
237
- // expensive % operations.
238
- template <>
239
- class ScalarRoundingOffsetGenerator <RoundingMode::ProbabilisticAddmod> {
240
- static const std::uint8_t AddConst = 97 ;
241
-
242
- public:
243
- ScalarRoundingOffsetGenerator () { x_ = 1 ; } // Start must be non-zero
244
-
245
- std::uint8_t get () {
246
- // The +'d boolean term causes the increment to skip over 255,
247
- // (recalling that 255+1 = 256 = 0 for an 8 bit uint),
248
- // thus implementing %255
249
- x_ += (AddConst + (x_ >= (255 - AddConst)));
250
- return x_;
251
- }
252
-
253
- private:
254
- // State
255
- std::uint8_t x_;
256
- };
257
-
258
- // Requantizes a source uint8 value in [0..255] range
259
- // to the range specified by BitDepth, [0..((2^bits)-1)].
260
- // Bias must be avoided. Currently this is achieved
261
- // by probabilistic rounding.
262
- template <typename QuantizationParams>
263
- std::uint8_t Requantize (
264
- std::uint8_t raw_src_val,
265
- ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode >*
266
- rounding_offset_generator) {
267
- static const int kBits = QuantizationParams::BitDepth::kBits ;
268
- static const std::uint8_t kMaxVal = (1 << kBits ) - 1 ;
269
-
270
- if (kBits == 8 ) {
271
- return raw_src_val;
272
- }
273
-
274
- std::uint16_t scaled = static_cast <std::uint16_t >(raw_src_val) * kMaxVal ;
275
- std::uint8_t rounding_offset = rounding_offset_generator->get ();
276
- return (scaled + rounding_offset) / 255 ;
277
- }
278
-
279
190
// A PackingRegisterBlock is a small fixed-size block of a matrix being
280
191
// packed. This class is the generic non-optimized implementation,
281
192
// it is inherited by the generic implementation of PackingRegisterBlock,
@@ -292,7 +203,7 @@ std::uint8_t Requantize(
292
203
// 2. Packing a complete block into the destination, see Pack. This is the
293
204
// most critical part, so it's convenient that unaligned boundaries have
294
205
// already been handled in step 1.
295
- template <typename QuantizationParams, typename SrcMapType,
206
+ template <typename SrcMapType,
296
207
typename PackedSideBlock>
297
208
class PackingRegisterBlockBase {
298
209
public:
@@ -305,9 +216,6 @@ class PackingRegisterBlockBase {
305
216
static const int kCellSize = CellFormat::kSize ;
306
217
static const SideMapOrder kSrcOrder = SrcMapType::kOrder ;
307
218
308
- typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode >
309
- RoundingOffsetGenerator;
310
-
311
219
PackingRegisterBlockBase () : complete_src_(nullptr , 0 , 0 , 0 ) {}
312
220
313
221
protected:
@@ -344,8 +252,7 @@ class PackingRegisterBlockBase {
344
252
// Packs a complete block into the destination. This is the most
345
253
// critical part and the part that we most typically want to
346
254
// override in architecture-specific optimized specializations.
347
- void Pack (PackedSideBlock* dst, int start_width,
348
- RoundingOffsetGenerator* rounding_offset_generator) {
255
+ void Pack (PackedSideBlock* dst, int start_width) {
349
256
std::uint8_t * dst_ptr = dst->current_data ();
350
257
for (int cell_start_depth = 0 ; cell_start_depth < kRegisterSize ;
351
258
cell_start_depth += kCellDepth ) {
@@ -359,11 +266,9 @@ class PackingRegisterBlockBase {
359
266
for (int w = 0 ; w < kCellWidth ; w++) {
360
267
std::int32_t sum = 0 ;
361
268
for (int d = 0 ; d < kCellDepth ; d++) {
362
- const std::uint8_t raw_src_val = src_cell_map (w, d);
363
- const std::uint8_t requantized = Requantize<QuantizationParams>(
364
- raw_src_val, rounding_offset_generator);
365
- dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
366
- sum += requantized;
269
+ const std::uint8_t src_val = src_cell_map (w, d);
270
+ dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = src_val;
271
+ sum += src_val;
367
272
}
368
273
cell_sums_of_each_slice_ptr[w] += sum;
369
274
}
@@ -374,14 +279,14 @@ class PackingRegisterBlockBase {
374
279
}
375
280
};
376
281
377
- template <typename QuantizationParams, typename SrcMapType,
282
+ template <typename SrcMapType,
378
283
typename PackedSideBlock>
379
284
class PackingRegisterBlock
380
- : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
285
+ : public PackingRegisterBlockBase<SrcMapType,
381
286
PackedSideBlock> {};
382
287
383
288
// Large-scale implementation of packing.
384
- template <typename QuantizationParams, typename SrcMapType,
289
+ template <typename SrcMapType,
385
290
typename PackedSideBlock>
386
291
class PackSideBlockImpl {
387
292
public:
@@ -392,10 +297,8 @@ class PackSideBlockImpl {
392
297
static const int kKernelWidth = CellFormat::kWidth * kCells ;
393
298
static const int kCellDepth = CellFormat::kDepth ;
394
299
395
- typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
300
+ typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
396
301
PackingRegisterBlockType;
397
- typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
398
- RoundingOffsetGenerator;
399
302
400
303
PackSideBlockImpl (PackedSideBlock* packed_side_block,
401
304
const SrcMapType& src_map)
@@ -461,22 +364,22 @@ class PackSideBlockImpl {
461
364
for (int d = 0 ; d < register_aligned_depth; d += kRegisterSize ) {
462
365
b.UseCompleteSrcInPlace (src_map_.block (start_width, start_depth + d,
463
366
width, kRegisterSize ));
464
- b.Pack (packed_side_block_, start_width, &rounding_offset_generator_ );
367
+ b.Pack (packed_side_block_, start_width);
465
368
}
466
369
}
467
370
if (register_aligned_depth < depth) {
468
371
b.MakeCompleteSrc (
469
372
src_map_.block (start_width, start_depth + register_aligned_depth,
470
373
width, depth - register_aligned_depth));
471
- b.Pack (packed_side_block_, start_width, &rounding_offset_generator_ );
374
+ b.Pack (packed_side_block_, start_width);
472
375
}
473
376
} else {
474
377
assert (width < kKernelWidth );
475
378
for (int d = 0 ; d < depth; d += kRegisterSize ) {
476
379
const int ds = std::min (+kRegisterSize , depth - d);
477
380
b.MakeCompleteSrc (
478
381
src_map_.block (start_width, start_depth + d, width, ds));
479
- b.Pack (packed_side_block_, start_width, &rounding_offset_generator_ );
382
+ b.Pack (packed_side_block_, start_width);
480
383
}
481
384
}
482
385
}
@@ -487,23 +390,10 @@ class PackSideBlockImpl {
487
390
// A map on the block of the original matrix block being packed,
488
391
// i.e. the 'source'.
489
392
const SrcMapType& src_map_;
490
-
491
- // Used for requantization in the less-than-8-bit case.
492
- // Otherwise unused.
493
- RoundingOffsetGenerator rounding_offset_generator_;
494
- };
495
-
496
- // Quantization parameters for the side (LHS or RHS) being packed,
497
- // with the rounding strategy having been already resolved to a specific
498
- // rounding mode.
499
- template <typename tBitDepth, RoundingMode tRoundingMode>
500
- struct QuantizationParams {
501
- typedef tBitDepth BitDepth;
502
- static const RoundingMode kRoundingMode = tRoundingMode;
503
393
};
504
394
505
395
// Packs a block of the input LHS matrix, into a PackedSideBlock
506
- template <typename BitDepthParams, typename PackedSideBlock,
396
+ template <typename PackedSideBlock,
507
397
typename MatrixMapType>
508
398
void PackLhs (PackedSideBlock* dst, const MatrixMapType& src) {
509
399
ScopedProfilingLabel label (" pack LHS" );
@@ -513,28 +403,14 @@ void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
513
403
typedef typename MatrixMapType::Scalar Scalar;
514
404
typedef SideMap<Scalar, kSideMapOrder > SideMapType;
515
405
SideMapType src_side_map (src.data (), src.rows (), src.cols (), src.stride ());
516
- typedef typename BitDepthParams::LhsBitDepth BitDepth;
517
- typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
518
406
const int accumulation_depth = src_side_map.depth ();
519
- if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold ) {
520
- typedef QuantizationParams<BitDepth,
521
- RoundingStrategy::kRoundingModeForSmallSizes >
522
- QParams;
523
- typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
524
- ImplType impl (dst, src_side_map);
525
- impl.PackL2 ();
526
- } else {
527
- typedef QuantizationParams<BitDepth,
528
- RoundingStrategy::kRoundingModeForLargeSizes >
529
- QParams;
530
- typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
531
- ImplType impl (dst, src_side_map);
532
- impl.PackL2 ();
533
- }
407
+ typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
408
+ ImplType impl (dst, src_side_map);
409
+ impl.PackL2 ();
534
410
}
535
411
536
412
// Packs a block of the input RHS matrix, into a PackedSideBlock
537
- template <typename BitDepthParams, typename PackedSideBlock,
413
+ template <typename PackedSideBlock,
538
414
typename MatrixMapType>
539
415
void PackRhs (PackedSideBlock* dst, const MatrixMapType& src) {
540
416
ScopedProfilingLabel label (" pack RHS" );
@@ -544,24 +420,10 @@ void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
544
420
typedef typename MatrixMapType::Scalar Scalar;
545
421
typedef SideMap<Scalar, kSideMapOrder > SideMapType;
546
422
SideMapType src_side_map (src.data (), src.cols (), src.rows (), src.stride ());
547
- typedef typename BitDepthParams::RhsBitDepth BitDepth;
548
- typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
549
423
const int accumulation_depth = src_side_map.depth ();
550
- if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold ) {
551
- typedef QuantizationParams<BitDepth,
552
- RoundingStrategy::kRoundingModeForSmallSizes >
553
- QParams;
554
- typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
555
- ImplType impl (dst, src_side_map);
556
- impl.PackL2 ();
557
- } else {
558
- typedef QuantizationParams<BitDepth,
559
- RoundingStrategy::kRoundingModeForLargeSizes >
560
- QParams;
561
- typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
562
- ImplType impl (dst, src_side_map);
563
- impl.PackL2 ();
564
- }
424
+ typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
425
+ ImplType impl (dst, src_side_map);
426
+ impl.PackL2 ();
565
427
}
566
428
567
429
} // namespace gemmlowp
0 commit comments