// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s make it clear where the function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]
//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//! - Many 128 bit division functions like `u128::wrapping_div` use
//!   `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!   is not a 128 bit by 128 bit hardware division function in most architectures.
//!   `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!   `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!   one function to calculate both the quotient and remainder). If configuration flags
//!   enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
//!   which requires the half sized division `u64_by_u64_div_rem`. If the architecture
//!   supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
//!   reduced to those instructions. Note that we do not specify the half size division
//!   directly to be `__udivdi3`, because then hardware division would never be
//!   introduced.
//! - If the architecture does not supply a 64 bit hardware division instruction, u64
//!   divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`,
//!   which is defined by `impl_delegate!`. The half division for this algorithm is
//!   `u32_by_u32_div_rem`, which in turn becomes hardware division instructions or more
//!   software division algorithms.
//! - If the architecture does not supply a 32 bit hardware division instruction, linkers
//!   will look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no
//!   half division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
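//!
//! As a rough sketch of the first chain described above (the lowering is performed by
//! the compiler and linker, not by user code), a 128 bit division on a typical 64 bit
//! target with hardware 64 bit division resolves like this:
//!
//! ```text
//! u128::wrapping_div -> __udivti3 -> u128_div_rem (trifecta) -> u64_by_u64_div_rem
//! ```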
// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex
#[macro_use]
mod norm_shift;
#[macro_use]
mod binary_long;
#[macro_use]
mod delegate;
// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "public-test-deps"))]
pub(crate) use self::delegate::u128_divide_sparc;
#[cfg(feature = "public-test-deps")]
pub use self::delegate::u128_divide_sparc;
#[macro_use]
mod trifecta;
#[macro_use]
mod asymmetric;
/// The behavior of all divisions by zero is controlled by this function. This function
/// should be impossible for Rust users to reach, unless the public division functions of
/// `compiler-builtins` or `core/std::unchecked_div/rem` are used directly without a zero
/// check in front.
fn zero_div_fn() -> ! {
    // Call the intrinsic directly to avoid the `assert_unsafe_precondition`, which cannot
    // be used here because it involves non-`inline` functions
    // (https://github.com/rust-lang/compiler-builtins/issues/491).
    unsafe { core::intrinsics::unreachable() }
}
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8m.base` and `thumbv8m.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists.
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions.
        true
    }
};
impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);
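// A minimal sketch of the leading-zeros strategy that `USE_LZ` selects (illustrative
// only; the real functions are generated by `impl_normalization_shift!` above and also
// cover the non-CLZ fallback): the shift that left-aligns the most significant set bit
// of `div` with that of `duo` is just a difference of leading-zero counts.
#[cfg(test)]
#[allow(dead_code)]
fn lz_normalization_shift_sketch(duo: u32, div: u32) -> usize {
    // Assumes `duo >= div > 0`, so the subtraction cannot underflow.
    (div.leading_zeros() - duo.leading_zeros()) as usize
}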
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}
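// A minimal sanity check of the checked-division pattern above (a sketch; real coverage
// lives in the crate's test suite): with a nonzero divisor, both `checked_div` and
// `checked_rem` return `Some`, so the usual quotient/remainder pair comes back without
// ever reaching `zero_div_fn`.
#[cfg(test)]
#[test]
fn u64_by_u64_div_rem_sanity() {
    assert_eq!(u64_by_u64_div_rem(100, 7), (14, 2));
    assert_eq!(u64_by_u64_div_rem(u64::MAX, 1), (u64::MAX, 0));
}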
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
// If the pointer width is less than 64, then the target architecture almost certainly
// does not have the fast 64 to 128 bit widening multiplication needed for `trifecta` to
// be faster (see the sketch after the following invocation).
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);
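// A sketch of the 64 to 128 bit widening multiplication referred to above (the function
// name is illustrative, not part of the crate): on 64 bit targets this lowers to a
// single full multiply, which is what makes `trifecta` competitive there.
#[cfg(test)]
#[allow(dead_code)]
fn widening_mul_sketch(a: u64, b: u64) -> u128 {
    // Without such an instruction this expands into several multiplies and adds,
    // which is why `delegate` is preferred on 16 and 32 bit targets.
    (a as u128) * (b as u128)
}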
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // Divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to
        // do this) by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
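// A hedged illustration of the safety contract above (the helper name is hypothetical,
// not part of the crate): the quotient fits in a `u64` exactly when the high half of
// `duo` is strictly less than `div`, which also rules out `div == 0`, so the hardware
// `div` cannot fault.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64", test))]
#[allow(dead_code)]
fn u128_by_u64_div_rem_checked(duo: u128, div: u64) -> Option<(u64, u64)> {
    if (duo >> 64) < div as u128 {
        // SAFETY: `duo_hi < div` guarantees the quotient fits in 64 bits and `div != 0`.
        Some(unsafe { u128_by_u64_div_rem(duo, div) })
    } else {
        None
    }
}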
// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}
// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);
// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // Divides the combined registers edx:eax (`duo` is split into two 32 bit parts to
        // do this) by `div`. The quotient is stored in eax and the remainder in edx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("eax") duo_lo => quo,
            inlateout("edx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32
);
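// A textbook sketch of the restoring binary long division that `impl_binary_long!`
// implements in optimized form (illustrative only; the function name is hypothetical).
// A `while` loop is used instead of a `for` loop over a range, matching the convention
// noted in the module docs.
#[cfg(test)]
#[allow(dead_code)]
fn binary_long_sketch(duo: u32, div: u32) -> (u32, u32) {
    // Assumes `div != 0`; the generated code routes that case to `zero_div_fn`.
    let div = div as u64;
    let mut quo: u32 = 0;
    // The running remainder is kept in a `u64` so the shift below cannot overflow.
    let mut rem: u64 = 0;
    let mut i: u32 = 32;
    while i != 0 {
        i -= 1;
        // Shift the next bit of `duo` into the running remainder.
        rem = (rem << 1) | (((duo >> i) & 1) as u64);
        if rem >= div {
            // The divisor fits: subtract it and set the corresponding quotient bit.
            rem -= div;
            quo |= 1u32 << i;
        }
    }
    (quo, rem as u32)
}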