1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s makes it clear where function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]

//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change the both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//!  - Many 128 bit division functions like `u128::wrapping_div` use
//!    `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!    is not a 128 bit by 128 bit hardware division function in most architectures.
//!    `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!    `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!    one function to calculate both the quotient and remainder. If configuration flags
//!    enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
//!    which requires the half sized division `u64_by_u64_div_rem`. If the architecture
//!    supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
//!    reduced to those instructions. Note that we do not specify the half size division
//!    directly to be `__udivdi3`, because hardware division would never be introduced.
//!  - If the architecture does not supply a 64 bit hardware division instruction, u64
//!    divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
//!    which is defined by `impl_delegate!`. The half division for this algorithm is
//!    `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
//!    software division algorithms.
//!  - If the architecture does not supply a 32 bit hardware instruction, linkers will
//!    look for `__udivsi3`. `impl_binary_long!` is used, but this  algorithm uses no half
//!    division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.

// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex

#[macro_use]
mod norm_shift;

#[macro_use]
mod binary_long;

#[macro_use]
mod delegate;

// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "public-test-deps"))]
pub(crate) use self::delegate::u128_divide_sparc;

#[cfg(feature = "public-test-deps")]
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;

#[macro_use]
mod asymmetric;

/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
    // here because it involves non-`inline` functions
    // (https://github.com/rust-lang/compiler-builtins/issues/491).
    unsafe { core::intrinsics::unreachable() }
}

const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};

impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

// If the pointer width less than 64, then the target architecture almost certainly does not have
// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);

// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);

// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32
);