From cd8f6eb59807938a3f6cffdbc9236960d030daac Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 20 May 2017 11:12:31 -0400 Subject: [PATCH] compiler: fix RegexSet bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling a RegexSet, it was possible for the jump locations to become incorrect if the last regex in the set had a starting location that didn't correspond to the beginning of its program. This can happen in simple cases like when your set consists of the regexes `a` and `β`. In particular, the program for `β` is: 0: Bytes(\xB2) (goto 2) 1: Bytes(\xCE) (goto 0) 2: MATCH Where the entry point is `1` instead of `0`. To fix this, we compile a set of regexes similarly to how we compile `a|β`, where we handle the holes produced by sub-expressions correctly. Fixes #353 --- src/compile.rs | 9 +++++---- tests/set.rs | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/compile.rs b/src/compile.rs index 355b23a3a9..ed0a93939e 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -178,18 +178,19 @@ impl Compiler { } self.fill_to_next(dotstar_patch.hole); + let mut prev_hole = Hole::None; for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { + self.fill_to_next(prev_hole); let split = self.push_split_hole(); let Patch { hole, entry } = try!(self.c_capture(0, expr)); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); - - let next = self.insts.len(); - self.fill_split(split, Some(entry), Some(next)); + prev_hole = self.fill_split(split, Some(entry), None); } let i = exprs.len() - 1; - let Patch { hole, .. } = try!(self.c_capture(0, &exprs[i])); + let Patch { hole, entry } = try!(self.c_capture(0, &exprs[i])); + self.fill(prev_hole, entry); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); diff --git a/tests/set.rs b/tests/set.rs index 52b1b0dead..70cfba830f 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -15,6 +15,7 @@ matset!(set14, &[r".*", "a"], "zzzzzz", 0); matset!(set15, &[r"\ba\b"], "hello a bye", 0); matset!(set16, &["a"], "a", 0); matset!(set17, &[".*a"], "a", 0); +matset!(set18, &["a", "β"], "β", 1); nomatset!(nset1, &["a", "a"], "b"); nomatset!(nset2, &["^foo", "bar$"], "bar foo");