Skip to content

Commit d9727d9

Browse files
committed
cut: fix -s flag for newline delimiter and improve performance
- Fixed the `-s` flag incorrectly suppressing output when the delimiter is a newline.
- Improved performance in `cut_fields_newline_char_delim`.
- Updated tests to match GNU cut behavior for newline delimiters.
1 parent bed3108 commit d9727d9

File tree

3 files changed

+336
-16
lines changed

3 files changed

+336
-16
lines changed

src/uu/cut/benches/cut_bench.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@ fn cut_fields_custom_delim(bencher: Bencher) {
7171
});
7272
}
7373

74+
/// Benchmark cutting fields with newline delimiter
///
/// The input is 100,000 records of the form `field_content_number_{i}`, each
/// terminated by `\n`. The timed closure runs `uumain` with `-d "\n"` and
/// `-f 1,3,5` against a file holding that data, so the field delimiter passed
/// to `cut` is the newline byte itself.
75+
#[divan::bench]
76+
fn cut_fields_newline_delim(bencher: Bencher) {
77+
let mut data = Vec::new();
78+
for i in 0..100_000 {
79+
let line = format!("field_content_number_{i}\n");
80+
data.extend_from_slice(line.as_bytes());
81+
}
82+
// The input is built and written once, outside the closure passed to
// `bencher.bench`, so only the `uumain` invocation sits inside that closure.
let file_path = setup_test_file(&data);
83+
84+
bencher.bench(|| {
85+
black_box(run_util_function(
86+
uumain,
87+
&["-d", "\n", "-f", "1,3,5", file_path.to_str().unwrap()],
88+
));
89+
});
90+
}
91+
7492
// Entry point of the benchmark binary: delegates to `divan::main()`.
fn main() {
7593
divan::main();
7694
}

src/uu/cut/src/cut.rs

Lines changed: 119 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// spell-checker:ignore (ToDO) delim sourcefiles
6+
// spell-checker:ignore (ToDO) delim sourcefiles undelimited
77

88
use bstr::io::BufReadExt;
99
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser};
@@ -254,35 +254,131 @@ fn cut_fields_implicit_out_delim<R: Read, W: Write, M: Matcher>(
254254
Ok(())
255255
}
256256

257-
/// The input delimiter is identical to `newline_char`
257+
/// Streams and filters fields where the record terminator and
258+
/// field delimiter are the same character (specified by `newline_char`)
258259
fn cut_fields_newline_char_delim<R: Read, W: Write>(
259260
reader: R,
260261
out: &mut W,
261262
ranges: &[Range],
263+
only_delimited: bool,
262264
newline_char: u8,
263265
out_delim: &[u8],
264266
) -> UResult<()> {
265-
let buf_in = BufReader::new(reader);
267+
let mut reader = BufReader::new(reader);
268+
let mut line = Vec::new();
266269

267-
let segments: Vec<_> = buf_in.split(newline_char).filter_map(Result::ok).collect();
268-
let mut print_delim = false;
270+
// We start at 1 because 'cut' field indexing is 1-based
271+
let mut current_field_idx = 1;
272+
let mut first_field_printed = false;
273+
let mut has_data = false;
274+
let mut suppressed = false;
269275

270-
for &Range { low, high } in ranges {
271-
for i in low..=high {
272-
// "- 1" is necessary because fields start from 1 whereas a Vec starts from 0
273-
if let Some(segment) = segments.get(i - 1) {
274-
if print_delim {
275-
out.write_all(out_delim)?;
276+
let mut range_idx = 0;
277+
278+
loop {
279+
line.clear();
280+
281+
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
282+
let needs_data = is_selected || current_field_idx == 1;
283+
284+
let mut bytes_processed = 0;
285+
286+
if needs_data {
287+
// Standard read: copies bytes into `line`
288+
loop {
289+
let buf = reader.fill_buf()?;
290+
if buf.is_empty() {
291+
break;
292+
}
293+
294+
if let Some(pos) = memchr::memchr(newline_char, buf) {
295+
let amt = pos + 1;
296+
line.extend_from_slice(&buf[..amt]);
297+
reader.consume(amt);
298+
bytes_processed += amt;
299+
break;
300+
}
301+
let len = buf.len();
302+
line.extend_from_slice(buf);
303+
reader.consume(len);
304+
bytes_processed += len;
305+
}
306+
} else {
307+
// Zero-allocation skip: scans the buffer and advances the cursor without copying
308+
loop {
309+
let buf = reader.fill_buf()?;
310+
if buf.is_empty() {
311+
break; // EOF
312+
}
313+
314+
if let Some(pos) = memchr::memchr(newline_char, buf) {
315+
let bytes_to_consume = pos + 1;
316+
reader.consume(bytes_to_consume);
317+
bytes_processed += bytes_to_consume;
318+
break;
319+
}
320+
321+
let len = buf.len();
322+
reader.consume(len);
323+
bytes_processed += len;
324+
}
325+
}
326+
327+
if bytes_processed == 0 {
328+
break;
329+
}
330+
has_data = true;
331+
332+
// To comply with -s when the stream consists of only a single field.
333+
if current_field_idx == 1 {
334+
let is_eof_next = reader.fill_buf()?.is_empty();
335+
336+
if is_eof_next && line.last() != Some(&newline_char) {
337+
if only_delimited {
338+
suppressed = true;
276339
} else {
277-
print_delim = true;
340+
// GNU cut prints the whole line if no delimiter is found.
341+
out.write_all(&line)?;
278342
}
279-
out.write_all(segment.as_slice())?;
280-
} else {
281343
break;
282344
}
283345
}
346+
347+
if range_idx < ranges.len() && current_field_idx > ranges[range_idx].high {
348+
range_idx += 1;
349+
350+
// EARLY EXIT: If we've exhausted all ranges, stop reading the stream entirely.
351+
if range_idx >= ranges.len() {
352+
break;
353+
}
354+
}
355+
356+
// Check if the current field falls inside the current active range
357+
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
358+
359+
if is_selected {
360+
if first_field_printed {
361+
out.write_all(out_delim)?;
362+
}
363+
364+
let has_newline = line.last() == Some(&newline_char);
365+
let content = if has_newline {
366+
&line[..line.len() - 1]
367+
} else {
368+
&line[..]
369+
};
370+
371+
out.write_all(content)?;
372+
first_field_printed = true;
373+
}
374+
375+
current_field_idx += 1;
376+
}
377+
378+
if has_data && !suppressed {
379+
out.write_all(&[newline_char])?;
284380
}
285-
out.write_all(&[newline_char])?;
381+
286382
Ok(())
287383
}
288384

@@ -297,7 +393,14 @@ fn cut_fields<R: Read, W: Write>(
297393
match field_opts.delimiter {
298394
Delimiter::Slice(delim) if delim == [newline_char] => {
299395
let out_delim = opts.out_delimiter.unwrap_or(delim);
300-
cut_fields_newline_char_delim(reader, out, ranges, newline_char, out_delim)
396+
cut_fields_newline_char_delim(
397+
reader,
398+
out,
399+
ranges,
400+
field_opts.only_delimited,
401+
newline_char,
402+
out_delim,
403+
)
301404
}
302405
Delimiter::Slice(delim) => {
303406
let matcher = ExactMatcher::new(delim);

0 commit comments

Comments
 (0)