Skip to content

Commit fe4e36b

Browse files
committed
cut: fix -s flag for newline delimiter and improve performance
- Fixed the `-s` flag incorrectly suppressing output when the delimiter is a newline.
- Improved performance in `cut_fields_newline_char_delim`.
- Updated tests to match GNU cut behavior for newline delimiters.
1 parent f335d14 commit fe4e36b

3 files changed

Lines changed: 337 additions & 16 deletions

File tree

src/uu/cut/benches/cut_bench.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@ fn cut_fields_custom_delim(bencher: Bencher) {
7171
});
7272
}
7373

74+
/// Benchmark cutting fields with newline delimiter
///
/// Builds 100,000 lines of the form `field_content_number_{i}`, each ending in
/// `\n`, hands the bytes to `setup_test_file`, and runs `uumain` with
/// `-d "\n" -f 1,3,5` on the resulting path inside `bencher.bench`.
75+
#[divan::bench]
76+
fn cut_fields_newline_delim(bencher: Bencher) {
77+
// Input is generated once, outside the closure passed to `bencher.bench`.
let mut data = Vec::new();
78+
for i in 0..100_000 {
79+
let line = format!("field_content_number_{i}\n");
80+
data.extend_from_slice(line.as_bytes());
81+
}
82+
// NOTE(review): presumably writes `data` to a temporary file and returns its path — confirm in the helper.
let file_path = setup_test_file(&data);
83+
84+
bencher.bench(|| {
85+
// `black_box` keeps the result of the call from being optimized away.
black_box(run_util_function(
86+
uumain,
87+
// The delimiter argument is a literal newline, the same byte that ends every generated line.
&["-d", "\n", "-f", "1,3,5", file_path.to_str().unwrap()],
88+
));
89+
});
90+
}
91+
7492
// Entry point of the benchmark binary; it only delegates to `divan::main()`.
fn main() {
7593
divan::main();
7694
}

src/uu/cut/src/cut.rs

Lines changed: 120 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// spell-checker:ignore (ToDO) delim sourcefiles
6+
// spell-checker:ignore (ToDO) delim sourcefiles undelimited
77

88
use bstr::io::BufReadExt;
99
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser};
@@ -254,35 +254,132 @@ fn cut_fields_implicit_out_delim<R: Read, W: Write, M: Matcher>(
254254
Ok(())
255255
}
256256

257-
/// The input delimiter is identical to `newline_char`
257+
/// Streams and filters fields where the record terminator and
258+
/// field delimiter are the same character (specified by `newline_char`)
///
/// The input is wrapped in a `BufReader` and consumed one field at a time; a
/// field is everything up to and including the next `newline_char` (or up to
/// end of input). Fields are numbered from 1.
///
/// * `reader` - input stream.
/// * `out` - destination for the selected fields.
/// * `ranges` - selected field ranges (`low..=high`). They are walked with a
///   single forward-moving index.
///   NOTE(review): this assumes the ranges are sorted ascending and do not
///   overlap — confirm against the caller.
/// * `only_delimited` - when `true` and the whole input is one field with no
///   trailing `newline_char`, nothing at all is written. When `false`, that
///   single field is written in full regardless of `ranges`.
/// * `newline_char` - byte that terminates a field.
/// * `out_delim` - bytes written between two consecutive selected fields.
///
/// After the last field, a single `newline_char` is written unless no input
/// was read or the output was suppressed. I/O errors from reading or writing
/// are propagated with `?`.
258259
fn cut_fields_newline_char_delim<R: Read, W: Write>(
259260
reader: R,
260261
out: &mut W,
261262
ranges: &[Range],
263+
only_delimited: bool,
262264
newline_char: u8,
263265
out_delim: &[u8],
264266
) -> UResult<()> {
265-
let buf_in = BufReader::new(reader);
267+
let mut reader = BufReader::new(reader);
268+
// Scratch buffer holding the current field; cleared at the top of every iteration.
let mut line = Vec::new();
266269

267-
let segments: Vec<_> = buf_in.split(newline_char).filter_map(Result::ok).collect();
268-
let mut print_delim = false;
270+
// We start at 1 because 'cut' field indexing is 1-based
271+
let mut current_field_idx = 1;
272+
let mut first_field_printed = false;
273+
let mut has_data = false;
274+
let mut suppressed = false;
269275

270-
for &Range { low, high } in ranges {
271-
for i in low..=high {
272-
// "- 1" is necessary because fields start from 1 whereas a Vec starts from 0
273-
if let Some(segment) = segments.get(i - 1) {
274-
if print_delim {
275-
out.write_all(out_delim)?;
276+
// Index of the range currently being matched; it is only ever incremented.
let mut range_idx = 0;
277+
278+
loop {
279+
line.clear();
280+
281+
// Pre-check made before `range_idx` is advanced for this field, so it can be
// stale: for the field just past a range's `high` it is still true, and that
// field is copied into `line` even though the re-check below may not select it.
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
282+
// Field 1 is always buffered because the single-field check below inspects `line`.
let needs_data = is_selected || current_field_idx == 1;
283+
284+
let mut has_processed_data = false;
285+
286+
if needs_data {
287+
// Standard read: copies bytes into `line`
288+
loop {
289+
let buf = reader.fill_buf()?;
290+
if buf.is_empty() {
291+
break;
292+
}
293+
294+
has_processed_data = true;
295+
296+
if let Some(pos) = memchr::memchr(newline_char, buf) {
297+
// `pos + 1` so the terminator itself is copied and consumed as part of this field.
let amt = pos + 1;
298+
line.extend_from_slice(&buf[..amt]);
299+
reader.consume(amt);
300+
301+
break;
302+
}
303+
// No terminator in this chunk: keep all of it and fetch the next chunk.
let len = buf.len();
304+
line.extend_from_slice(buf);
305+
reader.consume(len);
306+
}
307+
} else {
308+
// Zero-allocation skip: scans the buffer and advances the cursor without copying
309+
loop {
310+
let buf = reader.fill_buf()?;
311+
if buf.is_empty() {
312+
break; // EOF
313+
}
314+
315+
has_processed_data = true;
316+
317+
if let Some(pos) = memchr::memchr(newline_char, buf) {
318+
let bytes_to_consume = pos + 1;
319+
reader.consume(bytes_to_consume);
320+
break;
321+
}
322+
323+
let len = buf.len();
324+
reader.consume(len);
325+
}
326+
}
327+
328+
// Neither loop saw any bytes: the input ended before this field started.
if !has_processed_data {
329+
break;
330+
}
331+
has_data = true;
332+
333+
// To comply with -s when the stream consists of only a single field.
334+
if current_field_idx == 1 {
335+
// `fill_buf` does not consume anything; this only asks whether input remains after field 1.
let is_eof_next = reader.fill_buf()?.is_empty();
336+
337+
if is_eof_next && line.last() != Some(&newline_char) {
338+
if only_delimited {
339+
suppressed = true;
276340
} else {
277-
print_delim = true;
341+
// GNU cut prints the whole line if no delimiter is found.
342+
out.write_all(&line)?;
278343
}
279-
out.write_all(segment.as_slice())?;
280-
} else {
281344
break;
282345
}
283346
}
347+
348+
if range_idx < ranges.len() && current_field_idx > ranges[range_idx].high {
349+
range_idx += 1;
350+
351+
// EARLY EXIT: If we've exhausted all ranges, stop reading the stream entirely.
352+
if range_idx == ranges.len() {
353+
break;
354+
}
355+
}
356+
357+
// Check if the current field falls inside the current active range
358+
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
359+
360+
if is_selected {
361+
if first_field_printed {
362+
out.write_all(out_delim)?;
363+
}
364+
365+
// Strip the trailing terminator; separators and the final terminator are written separately.
let has_newline = line.last() == Some(&newline_char);
366+
let content = if has_newline {
367+
&line[..line.len() - 1]
368+
} else {
369+
&line[..]
370+
};
371+
372+
out.write_all(content)?;
373+
first_field_printed = true;
374+
}
375+
376+
current_field_idx += 1;
284377
}
285-
out.write_all(&[newline_char])?;
378+
379+
// Terminate the output exactly once, unless the input was empty or the output was suppressed.
if has_data && !suppressed {
380+
out.write_all(&[newline_char])?;
381+
}
382+
286383
Ok(())
287384
}
288385

@@ -297,7 +394,14 @@ fn cut_fields<R: Read, W: Write>(
297394
match field_opts.delimiter {
298395
Delimiter::Slice(delim) if delim == [newline_char] => {
299396
let out_delim = opts.out_delimiter.unwrap_or(delim);
300-
cut_fields_newline_char_delim(reader, out, ranges, newline_char, out_delim)
397+
cut_fields_newline_char_delim(
398+
reader,
399+
out,
400+
ranges,
401+
field_opts.only_delimited,
402+
newline_char,
403+
out_delim,
404+
)
301405
}
302406
Delimiter::Slice(delim) => {
303407
let matcher = ExactMatcher::new(delim);

0 commit comments

Comments
 (0)