1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
/// Creates an unsigned division function that uses binary long division, designed for
/// computer architectures without division instructions. These functions have good performance for
/// microarchitectures with large branch miss penalties and architectures without the ability to
/// predicate instructions. For architectures with predicated instructions, one of the algorithms
/// described in the documentation of these functions probably has higher performance, and a custom
/// assembly routine should be used instead.
#[allow(unused_macros)]
macro_rules! impl_binary_long {
    (
        $fn:ident, // name of the unsigned division function
        $zero_div_fn:ident, // function called when division by zero is attempted
        $normalization_shift:ident, // function for finding the normalization shift
        $n:tt, // the number of bits in a $iX or $uX
        $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
        $iX:ident // signed integer type with same bitwidth as `$uX`
    ) => {
        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
        /// tuple.
        pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
            let mut duo = duo;
            // handle edge cases before calling `$normalization_shift`
            if div == 0 {
                $zero_div_fn()
            }
            if duo < div {
                return (0, duo);
            }

            // There are many variations of binary division algorithm that could be used. This
            // documentation gives a tour of different methods so that future readers wanting to
            // optimize further do not have to painstakingly derive them. The SWAR variation is
            // especially hard to understand without reading the less convoluted methods first.

            // You may notice that a `duo < div_original` check is included in many these
            // algorithms. A critical optimization that many algorithms miss is handling of
            // quotients that will turn out to have many trailing zeros or many leading zeros. This
            // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
            // in cases where the quotient is small. The `duo < div_original` check handles these
            // cases of early returns and ends up replacing other kinds of mundane checks that
            // normally terminate a binary division algorithm.
            //
            // Something you may see in other algorithms that is not special-cased here is checks
            // for division by powers of two. The `duo < div_original` check handles this case and
            // more, however it can be checked up front before the bisection using the
            // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
            // compilers should handle most cases where divisions by power of two occur, and we do
            // not want to add on a few cycles for every division operation just to save a few
            // cycles rarely.

            // The following example is the most straightforward translation from the way binary
            // long division is typically visualized:
            // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
            // the result from `$normalization_shift(duo, div, false)`.
            //
            // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
            // and `duo` is kept unchanged.
            // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
            //
            // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
            // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
            //
            // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
            // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
            // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
            // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
            // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
            // The `duo < div_original` check terminates the algorithm with the correct quotient of
            // 29u8 and remainder of 4u8
            /*
            let div_original = div;
            let mut shl = $normalization_shift(duo, div, false);
            let mut quo = 0;
            loop {
                let div_shifted = div << shl;
                let sub = duo.wrapping_sub(div_shifted);
                // it is recommended to use `println!`s like this if functionality is unclear
                /*
                println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
                    duo,
                    div_shifted,
                    sub,
                    quo,
                    shl
                );
                */
                if 0 <= (sub as $iX) {
                    duo = sub;
                    quo += 1 << shl;
                    if duo < div_original {
                        // this branch is optional
                        return (quo, duo)
                    }
                }
                if shl == 0 {
                    return (quo, duo)
                }
                shl -= 1;
            }
            */

            // This restoring binary long division algorithm reduces the number of operations
            // overall via:
            // - `pow` can be shifted right instead of recalculating from `shl`
            // - starting `div` shifted left and shifting it right for each step instead of
            //   recalculating from `shl`
            // - The `duo < div_original` branch is used to terminate the algorithm instead of the
            //   `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
            //   `div` from being shifted off the end. This check also only occurs on half of steps
            //   on average, since it is behind the `(sub as $iX) >= 0` branch.
            // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
            //   being updated between steps
            //
            // There are many variations of this algorithm, but this encompases the largest number
            // of architectures and does not rely on carry flags, add-with-carry, or SWAR
            // complications to be decently fast.
            /*
            let div_original = div;
            let shl = $normalization_shift(duo, div, false);
            let mut div: $uX = div << shl;
            let mut pow: $uX = 1 << shl;
            let mut quo: $uX = 0;
            loop {
                let sub = duo.wrapping_sub(div);
                if 0 <= (sub as $iX) {
                    duo = sub;
                    quo |= pow;
                    if duo < div_original {
                        return (quo, duo)
                    }
                }
                div >>= 1;
                pow >>= 1;
            }
            */

            // If the architecture has flags and predicated arithmetic instructions, it is possible
            // to do binary long division without branching and in only 3 or 4 instructions. This is
            // a variation of a 3 instruction central loop from
            // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
            //
            // What allows doing division in only 3 instructions is realizing that instead of
            // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
            // place and `duo` can be shifted left. This means `div` does not have to be updated,
            // but causes edge case problems and makes `duo < div_original` tests harder. Some
            // architectures have an option to shift an argument in an arithmetic operation, which
            // means `duo` can be shifted left and subtracted from in one instruction. The other two
            // instructions are updating `quo` and undoing the subtraction if it turns out things
            // were not normalized.

            /*
            // Perform one binary long division step on the already normalized arguments, because
            // the main. Note that this does a full normalization since the central loop needs
            // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
            // variation only did normalization to the nearest 4 steps, but this makes handling edge
            // cases much harder. We do a full normalization and perform a binary long division
            // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
            // of `duo`, then the edge case handler shifts `div` right and does another long
            // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
            let div_original = div;
            let mut shl = $normalization_shift(duo, div, true);
            let mut div: $uX = (div << shl);
            let mut quo: $uX = 1;
            duo = duo.wrapping_sub(div);
            if duo < div_original {
                return (1 << shl, duo);
            }
            let div_neg: $uX;
            if (div as $iX) < 0 {
                // A very ugly edge case where the most significant bit of `div` is set (after
                // shifting to match `duo` when its most significant bit is at the sign bit), which
                // leads to the sign bit of `div_neg` being cut off and carries not happening when
                // they should. This branch performs a long division step that keeps `duo` in place
                // and shifts `div` down.
                div >>= 1;
                div_neg = div.wrapping_neg();
                let (sub, carry) = duo.overflowing_add(div_neg);
                duo = sub;
                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
                if !carry {
                    duo = duo.wrapping_add(div);
                }
                shl -= 1;
            } else {
                div_neg = div.wrapping_neg();
            }
            // The add-with-carry that updates `quo` needs to have the carry set when a normalized
            // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
            // subtraction generates a carry when an unnormalized subtract happens, which is the
            // opposite of what we want. Instead, we use
            // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
            let mut i = shl;
            loop {
                if i == 0 {
                    break;
                }
                i -= 1;
                // `ADDS duo, div, duo, LSL #1`
                // (add `div` to `duo << 1` and set flags)
                let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
                duo = sub;
                // `ADC quo, quo, quo`
                // (add with carry). Effectively shifts `quo` left by 1 and sets the least
                // significant bit to the carry.
                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
                // `ADDCC duo, duo, div`
                // (add if carry clear). Undoes the subtraction if no carry was generated.
                if !carry {
                    duo = duo.wrapping_add(div);
                }
            }
            return (quo, duo >> shl);
            */

            // This is the SWAR (SIMD within in a register) restoring division algorithm.
            // This combines several ideas of the above algorithms:
            //  - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
            //    restoring division algorithm, some architectures can do the shifting and
            //    subtraction step in one instruction.
            //  - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
            //    and adding one.
            //  - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
            //    LSB, so what if we use those bits to store `quo`?
            // Through a complex setup, it is possible to manage `duo` and `quo` in the same
            // register, and perform one step with 2 or 3 instructions. The only major downsides are
            // that there is significant setup (it is only saves instructions if `shl` is
            // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
            // initiated, and the number of division steps taken has to be exact (we cannot do more
            // division steps than `shl`, because it introduces edge cases where quotient bits in
            // `duo` start to collide with the real part of `div`.
            /*
            // first step. The quotient bit is stored in `quo` for now
            let div_original = div;
            let mut shl = $normalization_shift(duo, div, true);
            let mut div: $uX = (div << shl);
            duo = duo.wrapping_sub(div);
            let mut quo: $uX = 1 << shl;
            if duo < div_original {
                return (quo, duo);
            }

            let mask: $uX;
            if (div as $iX) < 0 {
                // deal with same edge case as the 3 instruction restoring division algorithm, but
                // the quotient bit from this step also has to be stored in `quo`
                div >>= 1;
                shl -= 1;
                let tmp = 1 << shl;
                mask = tmp - 1;
                let sub = duo.wrapping_sub(div);
                if (sub as $iX) >= 0 {
                    // restore
                    duo = sub;
                    quo |= tmp;
                }
                if duo < div_original {
                    return (quo, duo);
                }
            } else {
                mask = quo - 1;
            }
            // There is now room for quotient bits in `duo`.

            // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
            // `div` and end up with the subset of `shl` bits being all being set. This subset acts
            // just like a two's complement negative one. The subset of `div` containing the divisor
            // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
            // as long as the quotient stays positive.
            //
            // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
            // adds a quotient bit to the least significant bit.
            // For example, 89 (0b01011001) divided by 3 (0b11):
            //
            // shl:4, div:0b00110000
            // first step:
            //       duo:0b01011001
            // + div_neg:0b11010000
            // ____________________
            //           0b00101001
            // quo is set to 0b00010000 and mask is set to 0b00001111 for later
            //
            // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
            // quotient part of `duo` with `^`s.
            // chars.
            //     div:0b00110000
            //               ^^^^
            //   +     0b11111111
            //   ________________
            //         0b00101111
            //               ^^^^
            // div_neg:0b11010001
            //
            // first SWAR step:
            //  duo_shl1:0b01010010
            //                    ^
            // + div_neg:0b11010001
            // ____________________
            //           0b00100011
            //                    ^
            // second:
            //  duo_shl1:0b01000110
            //                   ^^
            // + div_neg:0b11010001
            // ____________________
            //           0b00010111
            //                   ^^
            // third:
            //  duo_shl1:0b00101110
            //                  ^^^
            // + div_neg:0b11010001
            // ____________________
            //           0b11111111
            //                  ^^^
            // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
            // part of `duo` is negative and the third step was an unnormalized step. The restore
            // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
            //
            //   duo:0b11111111
            //              ^^^
            // + div:0b00101111
            //             ^^^^
            // ________________
            //       0b00101110
            //              ^^^
            // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
            // unset quotient bit.
            //
            // final step (`shl` was 4, so exactly 4 steps must be taken)
            //  duo_shl1:0b01011100
            //                 ^^^^
            // + div_neg:0b11010001
            // ____________________
            //           0b00101101
            //                 ^^^^
            // The quotient includes the `^` bits added with the `quo` bits from the beginning that
            // contained the first step and potential edge case step,
            // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
            // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
            // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
            let div: $uX = div.wrapping_sub(1);
            let mut i = shl;
            loop {
                if i == 0 {
                    break;
                }
                i -= 1;
                duo = duo.wrapping_shl(1).wrapping_sub(div);
                if (duo as $iX) < 0 {
                    // restore
                    duo = duo.wrapping_add(div);
                }
            }
            // unpack the results of SWAR
            return ((duo & mask) | quo, duo >> shl);
            */

            // The problem with the conditional restoring SWAR algorithm above is that, in practice,
            // it requires assembly code to bring out its full unrolled potential (It seems that
            // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
            // that my algorithm intends. On architectures without predicated instructions, the code
            // gen is especially bad. We need a default software division algorithm that is
            // guaranteed to get decent code gen for the central loop.

            // For non-SWAR algorithms, there is a way to do binary long division without
            // predication or even branching. This involves creating a mask from the sign bit and
            // performing different kinds of steps using that.
            /*
            let shl = $normalization_shift(duo, div, true);
            let mut div: $uX = div << shl;
            let mut pow: $uX = 1 << shl;
            let mut quo: $uX = 0;
            loop {
                let sub = duo.wrapping_sub(div);
                let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
                duo -= div & sign_mask;
                quo |= pow & sign_mask;
                div >>= 1;
                pow >>= 1;
                if pow == 0 {
                    break;
                }
            }
            return (quo, duo);
            */
            // However, it requires about 4 extra operations (smearing the sign bit, negating the
            // mask, and applying the mask twice) on top of the operations done by the actual
            // algorithm. With SWAR however, just 2 extra operations are needed, making it
            // practical and even the most optimal algorithm for some architectures.

            // What we do is use custom assembly for predicated architectures that need software
            // division, and for the default algorithm use a mask based restoring SWAR algorithm
            // without conditionals or branches. On almost all architectures, this Rust code is
            // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
            // will unroll it in a decent way.

            // standard opening for SWAR algorithm with first step and edge case handling
            let div_original = div;
            let mut shl = $normalization_shift(duo, div, true);
            let mut div: $uX = (div << shl);
            duo = duo.wrapping_sub(div);
            let mut quo: $uX = 1 << shl;
            if duo < div_original {
                return (quo, duo);
            }
            let mask: $uX;
            if (div as $iX) < 0 {
                div >>= 1;
                shl -= 1;
                let tmp = 1 << shl;
                mask = tmp - 1;
                let sub = duo.wrapping_sub(div);
                if (sub as $iX) >= 0 {
                    duo = sub;
                    quo |= tmp;
                }
                if duo < div_original {
                    return (quo, duo);
                }
            } else {
                mask = quo - 1;
            }

            // central loop
            div = div.wrapping_sub(1);
            let mut i = shl;
            loop {
                if i == 0 {
                    break;
                }
                i -= 1;
                // shift left 1 and subtract
                duo = duo.wrapping_shl(1).wrapping_sub(div);
                // create mask
                let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
                // restore
                duo = duo.wrapping_add(div & mask);
            }
            // unpack
            return ((duo & mask) | quo, duo >> shl);

            // miscellanious binary long division algorithms that might be better for specific
            // architectures

            // Another kind of long division uses an interesting fact that `div` and `pow` can be
            // negated when `duo` is negative to perform a "negated" division step that works in
            // place of any normalization mechanism. This is a non-restoring division algorithm that
            // is very similar to the non-restoring division algorithms that can be found on the
            // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
            // be viewed as shifting the least significant set bit right (e.x. if we enter a series
            // of negated binary long division steps starting with `quo == 0b1011_0000` and
            // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
            // 0b1010_0010, 0b1010_0001).
            /*
            let div_original = div;
            let shl = $normalization_shift(duo, div, true);
            let mut div: $uX = (div << shl);
            let mut pow: $uX = 1 << shl;
            let mut quo: $uX = pow;
            duo = duo.wrapping_sub(div);
            if duo < div_original {
                return (quo, duo);
            }
            div >>= 1;
            pow >>= 1;
            loop {
                if (duo as $iX) < 0 {
                    // Negated binary long division step.
                    duo = duo.wrapping_add(div);
                    quo = quo.wrapping_sub(pow);
                } else {
                    // Normal long division step.
                    if duo < div_original {
                        return (quo, duo)
                    }
                    duo = duo.wrapping_sub(div);
                    quo = quo.wrapping_add(pow);
                }
                pow >>= 1;
                div >>= 1;
            }
            */

            // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
            // SWAR techniques that makes the only difference between steps be negation of `div`.
            // If there was an architecture with an instruction that negated inputs to an adder
            // based on conditionals, and in place shifting (or a three input addition operation
            // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
            // single instruction central loop is possible. Microarchitectures often have inputs to
            // their ALU that can invert the arguments and carry in of adders, but the architectures
            // unfortunately do not have an instruction to dynamically invert this input based on
            // conditionals.
            /*
            // SWAR opening
            let div_original = div;
            let mut shl = $normalization_shift(duo, div, true);
            let mut div: $uX = (div << shl);
            duo = duo.wrapping_sub(div);
            let mut quo: $uX = 1 << shl;
            if duo < div_original {
                return (quo, duo);
            }
            let mask: $uX;
            if (div as $iX) < 0 {
                div >>= 1;
                shl -= 1;
                let tmp = 1 << shl;
                let sub = duo.wrapping_sub(div);
                if (sub as $iX) >= 0 {
                    // restore
                    duo = sub;
                    quo |= tmp;
                }
                if duo < div_original {
                    return (quo, duo);
                }
                mask = tmp - 1;
            } else {
                mask = quo - 1;
            }

            // central loop
            let div: $uX = div.wrapping_sub(1);
            let mut i = shl;
            loop {
                if i == 0 {
                    break;
                }
                i -= 1;
                // note: the `wrapping_shl(1)` can be factored out, but would require another
                // restoring division step to prevent `(duo as $iX)` from overflowing
                if (duo as $iX) < 0 {
                    // Negated binary long division step.
                    duo = duo.wrapping_shl(1).wrapping_add(div);
                } else {
                    // Normal long division step.
                    duo = duo.wrapping_shl(1).wrapping_sub(div);
                }
            }
            if (duo as $iX) < 0 {
                // Restore. This was not needed in the original nonrestoring algorithm because of
                // the `duo < div_original` checks.
                duo = duo.wrapping_add(div);
            }
            // unpack
            return ((duo & mask) | quo, duo >> shl);
            */
        }
    };
}