// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
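//
// These helpers are meant to back the C-style `memcpy`/`memmove`/`memset`/
// `memcmp` intrinsics defined elsewhere in this crate. As a rough,
// illustrative sketch only (not the crate's actual wrapper), a `memmove`-style
// caller would pick the copy direction like this:
//
//     unsafe fn do_memmove(dest: *mut u8, src: *const u8, n: usize) {
//         if (dest as usize).wrapping_sub(src as usize) >= n {
//             // `dest` does not overlap the tail of `src`, so a forward copy is safe.
//             copy_forward(dest, src, n);
//         } else {
//             copy_backward(dest, src, n);
//         }
//     }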
use core::arch::asm;
use core::intrinsics;
use core::mem;
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe movsb (%rsi), (%rdi)",
inout("rcx") count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
);
}
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
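// Copy the leading bytes until `dest` is 8-byte aligned.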
asm!(
"rep movsb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
inout("rsi") src => src,
options(att_syntax, nostack, preserves_flags)
);
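// Copy the bulk of the data as aligned qwords.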
asm!(
"rep movsq",
inout("rcx") qword_count => _,
inout("rdi") dest => dest,
inout("rsi") src => src,
options(att_syntax, nostack, preserves_flags)
);
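// Copy the remaining bytes that don't fill a whole qword.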
asm!(
"rep movsb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(att_syntax, nostack, preserves_flags)
);
}
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// This has to stay a single asm! block: DF is set with `std` at the start and
// must remain set for every backwards string op until `cld` clears it again.
asm!(
"std",
// Copy the trailing bytes that don't form a full qword, starting from the
// last byte and walking backwards.
"rep movsb",
// rsi/rdi now point at the byte just below the copied tail; step back 7 more
// so they point at the start of the last full qword.
"sub $7, %rsi",
"sub $7, %rdi",
// Copy the 8-byte-aligned middle section, one qword at a time.
"mov {qword_count}, %rcx",
"rep movsq",
// Step forward again so rsi/rdi point at the last of the leading bytes.
"add $7, %rsi",
"add $7, %rdi",
// Copy the leading bytes that precede the first aligned qword.
"mov {pre_byte_count:e}, %ecx",
"rep movsb",
"cld",
pre_byte_count = in(reg) pre_byte_count,
qword_count = in(reg) qword_count,
inout("ecx") byte_count => _,
inout("rdi") dest.add(count - 1) => _,
inout("rsi") src.add(count - 1) => _,
// We modify the direction flag (DF), but restore it before the block ends
options(att_syntax, nostack, preserves_flags)
);
}
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
core::arch::asm!(
"repe stosb %al, (%rdi)",
inout("rcx") count => _,
inout("rdi") dest => _,
inout("al") c => _,
options(att_syntax, nostack, preserves_flags)
)
}
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
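// Broadcast the byte into every byte of a qword so `rep stosq` can store it.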
let c = c as u64 * 0x0101_0101_0101_0101;
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
// Separating the blocks gives the compiler more freedom to reorder instructions.
asm!(
"rep stosb",
inout("ecx") pre_byte_count => _,
inout("rdi") dest => dest,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep stosq",
inout("rcx") qword_count => _,
inout("rdi") dest => dest,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
asm!(
"rep stosb",
inout("ecx") byte_count => _,
inout("rdi") dest => _,
in("rax") c,
options(att_syntax, nostack, preserves_flags)
);
}
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
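// Compare in 16-byte chunks first; on a mismatch (or for the leftover tail),
// fall back to progressively smaller chunk sizes until the byte-level loop
// `c1` pinpoints the first differing byte and returns its signed difference,
// matching memcmp semantics.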
#[inline(always)]
unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
where
T: Clone + Copy + Eq,
U: Clone + Copy + Eq,
F: FnOnce(*const U, *const U, usize) -> i32,
{
// Ensure T is not a ZST.
const { assert!(mem::size_of::<T>() != 0) };
let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
while a != end {
if a.read_unaligned() != b.read_unaligned() {
return f(a.cast(), b.cast(), mem::size_of::<T>());
}
a = a.add(1);
b = b.add(1);
}
f(
a.cast(),
b.cast(),
intrinsics::unchecked_rem(n, mem::size_of::<T>()),
)
}
let c1 = |mut a: *const u8, mut b: *const u8, n| {
for _ in 0..n {
if a.read() != b.read() {
return i32::from(a.read()) - i32::from(b.read());
}
a = a.add(1);
b = b.add(1);
}
0
};
let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
c16(a.cast(), b.cast(), n)
}
// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n byte read with an n-byte aligned address will never cross
// a page boundary and will always succeed. Any smaller alignment
// may result in a read that will cross a page boundary, which may
// trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
// behaviour. To dodge this, memory access operations are written
// using inline assembly.
#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
let mut n = 0;
// _mm_movemask_epi8 and company allow for speedups, but they aren't cheap
// by themselves, so potentially short strings are first handled with a
// simple byte loop.
for _ in 0..4 {
if *s == 0 {
return n;
}
n += 1;
s = s.add(1);
}
// Shave off the least significant bits to align the address to a 16-byte
// boundary. The shaved-off bits are used to correct the first iteration.
let align = s as usize & 15;
let mut s = ((s as usize) - align) as *const __m128i;
let zero = _mm_set1_epi8(0);
let x = {
let r;
asm!(
"movdqa ({addr}), {dest}",
addr = in(reg) s,
dest = out(xmm_reg) r,
options(att_syntax, nostack),
);
r
};
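// The aligned load may include up to `align` bytes that precede the original
// pointer; shifting the comparison mask right by `align` discards them, so
// `trailing_zeros` counts from the first byte that actually belongs to the
// string.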
let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
if cmp != 0 {
return n + cmp.trailing_zeros() as usize;
}
n += 16 - align;
s = s.add(1);
loop {
let x = {
let r;
asm!(
"movdqa ({addr}), {dest}",
addr = in(reg) s,
dest = out(xmm_reg) r,
options(att_syntax, nostack),
);
r
};
let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
if cmp == 0 {
n += 16;
s = s.add(1);
} else {
return n + cmp.trailing_zeros() as usize;
}
}
}
// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
let mut n = 0;
// Check bytes in steps of one until
// either a zero byte is discovered or
// the pointer is aligned to an eight-byte boundary.
while s as usize & 7 != 0 {
if *s == 0 {
return n;
}
n += 1;
s = s.add(1);
}
// Check bytes in steps of eight until a zero
// byte is discovered.
let mut s = s as *const u64;
loop {
let mut cs = {
let r: u64;
asm!(
"mov ({addr}), {dest}",
addr = in(reg) s,
dest = out(reg) r,
options(att_syntax, nostack),
);
r
};
// Detect if a word has a zero byte, taken from
// https://graphics.stanford.edu/~seander/bithacks.html
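// The expression below is nonzero exactly when at least one byte of `cs` is
// zero (see the link above for why the subtract-and-mask trick has no false
// positives).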
if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
loop {
if cs & 255 == 0 {
return n;
} else {
cs >>= 8;
n += 1;
}
}
} else {
n += 8;
s = s.add(1);
}
}
}
/// Determine optimal parameters for a `rep` instruction.
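///
/// For example, if `dest` is congruent to 5 modulo 8 and `count == 20`, this
/// returns `(3, 2, 1)`: 3 leading bytes to reach 8-byte alignment, then
/// 2 qwords, then 1 trailing byte (3 + 16 + 1 == 20).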
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
count -= pre_byte_count;
let qword_count = count >> 3;
let byte_count = count & 0b111;
(pre_byte_count, qword_count, byte_count)
}