diff --git a/jxl/src/features/noise.rs b/jxl/src/features/noise.rs index 5770a1934..760ff3ccb 100644 --- a/jxl/src/features/noise.rs +++ b/jxl/src/features/noise.rs @@ -17,6 +17,7 @@ impl Noise { } Ok(noise) } + #[inline(always)] pub fn strength(&self, vx: f32) -> f32 { let k_scale = (self.lut.len() - 2) as f32; let scaled_vx = f32::max(0.0, vx * k_scale); diff --git a/jxl/src/frame/decode.rs b/jxl/src/frame/decode.rs index f58b1044d..c7dde66bf 100644 --- a/jxl/src/frame/decode.rs +++ b/jxl/src/frame/decode.rs @@ -547,6 +547,8 @@ impl Frame { Ok(()) } + #[inline(always)] + #[allow(unsafe_code)] pub fn render_noise_for_group( &mut self, group: usize, @@ -614,23 +616,20 @@ impl Frame { continue; } - // Fill all 3 channels with this subregion's noise, sharing the RNG + // Fill all 3 channels with this subregion's noise, sharing the RNG. + // Reinterpret the u64 batch as u32 pairs to avoid per-element branching. for buf in &mut bufs { for y in 0..sub_ysize { let row = buf.row_mut(sub_y0 + y); for batch_index in 0..sub_xsize.div_ceil(FLOATS_PER_BATCH) { rng.fill(&mut batch); + // SAFETY: [u64; N] and [u32; 2*N] have the same layout + let batch_u32: &[u32; FLOATS_PER_BATCH] = + unsafe { &*batch.as_ptr().cast() }; let batch_size = (sub_xsize - batch_index * FLOATS_PER_BATCH).min(FLOATS_PER_BATCH); - for i in 0..batch_size { + for (i, &bits) in batch_u32.iter().take(batch_size).enumerate() { let x = sub_x0 + FLOATS_PER_BATCH * batch_index + i; - let k = i / 2; - let high_bytes = i % 2 != 0; - let bits = if high_bytes { - ((batch[k] & 0xFFFFFFFF00000000) >> 32) as u32 - } else { - (batch[k] & 0xFFFFFFFF) as u32 - }; row[x] = bits_to_float(bits); } } diff --git a/jxl/src/util/xorshift128plus.rs b/jxl/src/util/xorshift128plus.rs index bf55805a1..5b323721e 100644 --- a/jxl/src/util/xorshift128plus.rs +++ b/jxl/src/util/xorshift128plus.rs @@ -47,6 +47,7 @@ impl Xorshift128Plus { Self { s0, s1 } } + #[inline(always)] pub fn fill(&mut self, random_bits: &mut [u64; Self::N]) { for ((s0, s1), random_bits) in self .s0