Skip to content

Commit 4b180f8

Browse files
authored
Merge pull request #4802 from epage/osstr
fix(lex): Deprecate unsound `OsStrExt::split_at`
2 parents 53cb165 + 48dc66f commit 4b180f8

File tree

2 files changed

+60
-23
lines changed

2 files changed

+60
-23
lines changed

clap_lex/src/ext.rs

Lines changed: 55 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ pub trait OsStrExt: private::Sealed {
193193
/// assert_eq!("Per", first);
194194
/// assert_eq!(" Martin-Löf", last);
195195
/// ```
196+
#[deprecated(since = "4.1.0", note = "This is not sound for all `index`")]
196197
fn split_at(&self, index: usize) -> (&OsStr, &OsStr);
197198
/// Splits the string on the first occurrence of the specified delimiter and
198199
/// returns prefix before delimiter and suffix after delimiter.
@@ -212,7 +213,8 @@ pub trait OsStrExt: private::Sealed {
212213

213214
impl OsStrExt for OsStr {
214215
fn try_str(&self) -> Result<&str, std::str::Utf8Error> {
215-
let bytes = to_bytes(self);
216+
// SAFETY: Only interacting with `OsStr` as `&str
217+
let bytes = unsafe { to_bytes(self) };
216218
std::str::from_utf8(bytes)
217219
}
218220

@@ -221,17 +223,24 @@ impl OsStrExt for OsStr {
221223
}
222224

223225
fn find(&self, needle: &str) -> Option<usize> {
226+
// SAFETY: Only interacting with `OsStr` as `&str
227+
let bytes = unsafe { to_bytes(self) };
224228
(0..=self.len().checked_sub(needle.len())?)
225-
.find(|&x| to_bytes(self)[x..].starts_with(needle.as_bytes()))
229+
.find(|&x| bytes[x..].starts_with(needle.as_bytes()))
226230
}
227231

228232
fn strip_prefix(&self, prefix: &str) -> Option<&OsStr> {
229-
to_bytes(self)
230-
.strip_prefix(prefix.as_bytes())
231-
.map(to_os_str)
233+
// SAFETY: Only interacting with `OsStr` as `&str
234+
let bytes = unsafe { to_bytes(self) };
235+
bytes.strip_prefix(prefix.as_bytes()).map(|s| {
236+
// SAFETY: Only interacting with `OsStr` as `&str
237+
unsafe { to_os_str(s) }
238+
})
232239
}
233240
fn starts_with(&self, prefix: &str) -> bool {
234-
to_bytes(self).starts_with(prefix.as_bytes())
241+
// SAFETY: Only interacting with `OsStr` as `&str
242+
let bytes = unsafe { to_bytes(self) };
243+
bytes.starts_with(prefix.as_bytes())
235244
}
236245

237246
fn split<'s, 'n>(&'s self, needle: &'n str) -> Split<'s, 'n> {
@@ -243,17 +252,24 @@ impl OsStrExt for OsStr {
243252
}
244253

245254
fn split_at(&self, index: usize) -> (&OsStr, &OsStr) {
246-
let (first, second) = to_bytes(self).split_at(index);
247-
(to_os_str(first), to_os_str(second))
255+
// BUG: This is unsafe and has been deprecated
256+
unsafe {
257+
let bytes = to_bytes(self);
258+
let (first, second) = bytes.split_at(index);
259+
(to_os_str(first), to_os_str(second))
260+
}
248261
}
249262

250263
fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)> {
251264
let start = self.find(needle)?;
252265
let end = start + needle.len();
253-
let haystack = to_bytes(self);
254-
let first = &haystack[0..start];
255-
let second = &haystack[end..];
256-
Some((to_os_str(first), to_os_str(second)))
266+
// SAFETY: Only interacting with `OsStr` as `&str
267+
unsafe {
268+
let haystack = to_bytes(self);
269+
let first = &haystack[0..start];
270+
let second = &haystack[end..];
271+
Some((to_os_str(first), to_os_str(second)))
272+
}
257273
}
258274
}
259275

@@ -265,33 +281,40 @@ mod private {
265281

266282
/// Allow access to raw bytes
267283
///
268-
/// **Note:** the bytes only make sense when compared with ASCII or `&str`
284+
/// # Safety
269285
///
270-
/// **Note:** This must never be serialized as there is no guarantee at how invalid UTF-8 will be
271-
/// encoded, even within the same version of this crate (since its dependent on rustc version)
272-
fn to_bytes(s: &OsStr) -> &[u8] {
286+
/// - The bytes only make sense when compared with ASCII or `&str`
287+
/// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be
288+
/// encoded, even within the same version of this crate (since its dependent on rustc version)
289+
unsafe fn to_bytes(s: &OsStr) -> &[u8] {
273290
// SAFETY:
274291
// - Lifetimes are the same
275-
// - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`)
292+
// - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
276293
// - The primary contract is that the encoding for invalid surrogate code points is not
277294
// guaranteed which isn't a problem here
278295
//
279296
// There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
280297
// but its in limbo
281-
unsafe { std::mem::transmute(s) }
298+
std::mem::transmute(s)
282299
}
283300

284301
/// Restore raw bytes as `OsStr`
285-
fn to_os_str(s: &[u8]) -> &OsStr {
302+
///
303+
/// # Safety
304+
///
305+
/// - The bytes only make sense when compared with ASCII or `&str`
306+
/// - This must never be serialized as there is no guarantee at how invalid UTF-8 will be
307+
/// encoded, even within the same version of this crate (since its dependent on rustc version)
308+
unsafe fn to_os_str(s: &[u8]) -> &OsStr {
286309
// SAFETY:
287310
// - Lifetimes are the same
288-
// - Types are compatible (`OsStr` is a transparent wrapper for `[u8]`)
311+
// - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
289312
// - The primary contract is that the encoding for invalid surrogate code points is not
290313
// guaranteed which isn't a problem here
291314
//
292315
// There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
293316
// but its in limbo
294-
unsafe { std::mem::transmute(s) }
317+
std::mem::transmute(s)
295318
}
296319

297320
pub struct Split<'s, 'n> {
@@ -319,3 +342,14 @@ impl<'s, 'n> Iterator for Split<'s, 'n> {
319342
}
320343
}
321344
}
345+
346+
/// Split an `OsStr`
347+
///
348+
/// # Safety
349+
///
350+
/// `index` must be at a valid UTF-8 boundary
351+
pub(crate) unsafe fn split_at(os: &OsStr, index: usize) -> (&OsStr, &OsStr) {
352+
let bytes = to_bytes(os);
353+
let (first, second) = bytes.split_at(index);
354+
(to_os_str(first), to_os_str(second))
355+
}

clap_lex/src/lib.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,9 @@ impl<'s> ShortFlags<'s> {
433433
if let Some((index, _)) = self.utf8_prefix.next() {
434434
self.utf8_prefix = "".char_indices();
435435
self.invalid_suffix = None;
436-
return Some(self.inner.split_at(index).1);
436+
// SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary
437+
let remainder = unsafe { ext::split_at(self.inner, index).1 };
438+
return Some(remainder);
437439
}
438440

439441
if let Some(suffix) = self.invalid_suffix {
@@ -457,7 +459,8 @@ fn split_nonutf8_once(b: &OsStr) -> (&str, Option<&OsStr>) {
457459
match b.try_str() {
458460
Ok(s) => (s, None),
459461
Err(err) => {
460-
let (valid, after_valid) = b.split_at(err.valid_up_to());
462+
// SAFETY: `char_indices` ensures `index` is at a valid UTF-8 boundary
463+
let (valid, after_valid) = unsafe { ext::split_at(b, err.valid_up_to()) };
461464
let valid = valid.try_str().unwrap();
462465
(valid, Some(after_valid))
463466
}

0 commit comments

Comments
 (0)