diff --git a/README.md b/README.md index 1689b15..574a037 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,6 @@ A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON pars based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318) by Geoff Langdale and Daniel Lemire. -This implementation is still missing several features available in simdsjon. For example: - -* Support for Unicode characters -* UTF-8 validation -* Support for 512-bit vectors - ## Code Sample ```java @@ -73,8 +67,8 @@ This section presents a performance comparison of different JSON parsers availab the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing and finding all unique users with a default profile. -**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results -may not reflect its real performance.** +**Note that simdjson-java is still missing several features (see [GitHub Issues](https://github.com/simdjson/simdjson-java/issues)), +so the following results may not reflect its real performance.** Environment: * CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz diff --git a/build.gradle b/build.gradle index 4e01afa..b45863e 100644 --- a/build.gradle +++ b/build.gradle @@ -38,16 +38,18 @@ java { ext { junitVersion = '5.9.1' + jsoniterScalaVersion = '2.23.2' } dependencies { jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2' jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35' jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23' - jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: '2.23.2' - compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: '2.23.2' + jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion + compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2' + testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0' testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion @@ -74,6 +76,9 @@ test { '--add-modules', 'jdk.incubator.vector', '-Xmx2g' ] + testLogging { + events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR' + } } tasks.withType(JmhBytecodeGeneratorTask).configureEach { diff --git a/src/jmh/java/org/simdjson/ParseBenchmark.java b/src/jmh/java/org/simdjson/ParseBenchmark.java index e64c0cc..a8be30c 100644 --- a/src/jmh/java/org/simdjson/ParseBenchmark.java +++ b/src/jmh/java/org/simdjson/ParseBenchmark.java @@ -21,7 +21,7 @@ @OutputTimeUnit(TimeUnit.SECONDS) public class ParseBenchmark { - @Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"}) + @Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"}) String fileName; private final SimdJsonParser simdJsonParser = new SimdJsonParser(); diff --git a/src/main/java/org/simdjson/CharacterUtils.java b/src/main/java/org/simdjson/CharacterUtils.java new file mode 100644 index 0000000..a7083ed --- /dev/null +++ b/src/main/java/org/simdjson/CharacterUtils.java @@ -0,0 +1,252 @@ +package org.simdjson; + +class CharacterUtils { + + // @formatter:off + private static final boolean[] STRUCTURAL_OR_WHITESPACE = new boolean[]{ + false, false, false, false, false, false, false, false, + false, true, true, false, false, true, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + true, false, false, false, false, false, false, false, + false, false, false, false, true, false, false, false, + false, false, false, false, false, false, false, false, + false, false, true, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false + }; + // @formatter:on + + static boolean isStructuralOrWhitespace(byte b) { + if (b < 0) { + return false; + } + return STRUCTURAL_OR_WHITESPACE[b]; + } + + private static final byte[] ESCAPE_MAP = new byte[]{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, + 0x00, 0x00, 0x0d, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + static byte escape(byte escapeChar) { + if (escapeChar < 0) { + throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar)); + } + byte escapeResult = ESCAPE_MAP[escapeChar]; + if (escapeResult == 0) { + throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar)); + } + return escapeResult; + } + + // @formatter:off + private static final int[] HEX_DIGIT_TO_INT = new int[]{ + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, + 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, + 0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, + 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, + 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, + 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, + 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, + 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, + 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, + 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, + 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, + 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF + }; + // @formatter:on + + // This method implements an efficient approach for parsing 4-byte hexadecimal strings as described in the article: + // https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ with additional optimizations + // proposed in this comment: https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/#comment-402686. + static int hexToInt(byte[] buff, int offset) { + int v1 = HEX_DIGIT_TO_INT[630 + getUnsignedByte(buff, offset)]; + int v2 = HEX_DIGIT_TO_INT[420 + getUnsignedByte(buff, offset + 1)]; + int v3 = HEX_DIGIT_TO_INT[210 + getUnsignedByte(buff, offset + 2)]; + int v4 = HEX_DIGIT_TO_INT[getUnsignedByte(buff, offset + 3)]; + return v1 | v2 | v3 | v4; + } + + private static int getUnsignedByte(byte[] buff, int idx) { + return Byte.toUnsignedInt(buff[idx]); + } +} diff --git a/src/main/java/org/simdjson/JsonCharUtils.java b/src/main/java/org/simdjson/JsonCharUtils.java deleted file mode 100644 index 4768f48..0000000 --- a/src/main/java/org/simdjson/JsonCharUtils.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.simdjson; - -class JsonCharUtils { - - private static final boolean[] STRUCTURAL_OR_WHITESPACE = new boolean[]{ - false, false, false, false, false, false, false, false, - false, true, true, false, false, true, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - true, false, false, false, false, false, false, false, - false, false, false, false, true, false, false, false, - false, false, false, false, false, false, false, false, - false, false, true, false, false, false, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false - }; - - static boolean isStructuralOrWhitespace(byte b) { - if (b < 0) { - return false; - } - return STRUCTURAL_OR_WHITESPACE[b]; - } -} diff --git a/src/main/java/org/simdjson/NumberParser.java b/src/main/java/org/simdjson/NumberParser.java index 4b350aa..7c9c101 100644 --- a/src/main/java/org/simdjson/NumberParser.java +++ b/src/main/java/org/simdjson/NumberParser.java @@ -9,7 +9,7 @@ import static java.lang.Long.remainderUnsigned; import static java.lang.Math.abs; import static java.lang.Math.unsignedMultiplyHigh; -import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace; +import static org.simdjson.CharacterUtils.isStructuralOrWhitespace; import static org.simdjson.NumberParserTables.NUMBER_OF_ADDITIONAL_DIGITS_AFTER_LEFT_SHIFT; import static org.simdjson.NumberParserTables.POWERS_OF_FIVE; import static org.simdjson.NumberParserTables.POWER_OF_FIVE_DIGITS; diff --git a/src/main/java/org/simdjson/StringParser.java b/src/main/java/org/simdjson/StringParser.java new file mode 100644 index 0000000..074a3db --- /dev/null +++ b/src/main/java/org/simdjson/StringParser.java @@ -0,0 +1,127 @@ +package org.simdjson; + +import jdk.incubator.vector.ByteVector; + +import static org.simdjson.CharacterUtils.escape; +import static org.simdjson.CharacterUtils.hexToInt; +import static org.simdjson.Tape.STRING; + +class StringParser { + + private static final byte BACKSLASH = '\\'; + private static final byte QUOTE = '"'; + private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize(); + private static final int MIN_HIGH_SURROGATE = 0xD800; + private static final int MAX_HIGH_SURROGATE = 0xDBFF; + private static final int MIN_LOW_SURROGATE = 0xDC00; + private static final int MAX_LOW_SURROGATE = 0xDFFF; + + private final Tape tape; + private final byte[] stringBuffer; + + private int stringBufferIdx; + + StringParser(Tape tape, byte[] stringBuffer) { + this.tape = tape; + this.stringBuffer = stringBuffer; + } + + void parseString(byte[] buffer, int idx) { + tape.append(stringBufferIdx, STRING); + int src = idx + 1; + int dst = stringBufferIdx + Integer.BYTES; + while (true) { + ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src); + srcVec.intoArray(stringBuffer, dst); + long backslashBits = srcVec.eq(BACKSLASH).toLong(); + long quoteBits = srcVec.eq(QUOTE).toLong(); + + if (hasQuoteFirst(backslashBits, quoteBits)) { + dst += Long.numberOfTrailingZeros(quoteBits); + break; + } + if (hasBackslash(backslashBits, quoteBits)) { + int backslashDist = Long.numberOfTrailingZeros(backslashBits); + byte escapeChar = buffer[src + backslashDist + 1]; + if (escapeChar == 'u') { + src += backslashDist; + dst += backslashDist; + int codePoint = hexToInt(buffer, src + 2); + src += 6; + if (codePoint >= MIN_HIGH_SURROGATE && codePoint <= MAX_HIGH_SURROGATE) { + codePoint = parseLowSurrogate(buffer, src, codePoint); + src += 6; + } else if (codePoint >= MIN_LOW_SURROGATE && codePoint <= MAX_LOW_SURROGATE) { + throw new JsonParsingException("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate."); + } + dst += storeCodePointInStringBuffer(codePoint, dst); + } else { + stringBuffer[dst + backslashDist] = escape(escapeChar); + src += backslashDist + 2; + dst += backslashDist + 1; + } + } else { + src += BYTES_PROCESSED; + dst += BYTES_PROCESSED; + } + } + int len = dst - stringBufferIdx - Integer.BYTES; + IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx); + stringBufferIdx = dst; + } + + private int parseLowSurrogate(byte[] buffer, int src, int codePoint) { + if ((buffer[src] << 8 | buffer[src + 1]) != ('\\' << 8 | 'u')) { + throw new JsonParsingException("Low surrogate should start with '\\u'"); + } else { + int codePoint2 = hexToInt(buffer, src + 2); + int lowBit = codePoint2 - MIN_LOW_SURROGATE; + if (lowBit >> 10 == 0) { + return (((codePoint - MIN_HIGH_SURROGATE) << 10) | lowBit) + 0x10000; + } else { + throw new JsonParsingException("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF."); + } + } + } + + private int storeCodePointInStringBuffer(int codePoint, int dst) { + if (codePoint < 0) { + throw new JsonParsingException("Invalid unicode escape sequence."); + } + if (codePoint <= 0x7F) { + stringBuffer[dst] = (byte) codePoint; + return 1; + } + if (codePoint <= 0x7FF) { + stringBuffer[dst] = (byte) ((codePoint >> 6) + 192); + stringBuffer[dst + 1] = (byte) ((codePoint & 63) + 128); + return 2; + } + if (codePoint <= 0xFFFF) { + stringBuffer[dst] = (byte) ((codePoint >> 12) + 224); + stringBuffer[dst + 1] = (byte) (((codePoint >> 6) & 63) + 128); + stringBuffer[dst + 2] = (byte) ((codePoint & 63) + 128); + return 3; + } + if (codePoint <= 0x10FFFF) { + stringBuffer[dst] = (byte) ((codePoint >> 18) + 240); + stringBuffer[dst + 1] = (byte) (((codePoint >> 12) & 63) + 128); + stringBuffer[dst + 2] = (byte) (((codePoint >> 6) & 63) + 128); + stringBuffer[dst + 3] = (byte) ((codePoint & 63) + 128); + return 4; + } + throw new IllegalStateException("Code point is greater than 0x110000."); + } + + private boolean hasQuoteFirst(long backslashBits, long quoteBits) { + return ((backslashBits - 1) & quoteBits) != 0; + } + + private boolean hasBackslash(long backslashBits, long quoteBits) { + return ((quoteBits - 1) & backslashBits) != 0; + } + + void reset() { + stringBufferIdx = 0; + } +} diff --git a/src/main/java/org/simdjson/TapeBuilder.java b/src/main/java/org/simdjson/TapeBuilder.java index 39a729a..fc7f87e 100644 --- a/src/main/java/org/simdjson/TapeBuilder.java +++ b/src/main/java/org/simdjson/TapeBuilder.java @@ -1,10 +1,8 @@ package org.simdjson; -import jdk.incubator.vector.ByteVector; - import java.util.Arrays; -import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace; +import static org.simdjson.CharacterUtils.isStructuralOrWhitespace; import static org.simdjson.Tape.END_ARRAY; import static org.simdjson.Tape.END_OBJECT; import static org.simdjson.Tape.FALSE_VALUE; @@ -12,44 +10,18 @@ import static org.simdjson.Tape.ROOT; import static org.simdjson.Tape.START_ARRAY; import static org.simdjson.Tape.START_OBJECT; -import static org.simdjson.Tape.STRING; import static org.simdjson.Tape.TRUE_VALUE; class TapeBuilder { private static final byte SPACE = 0x20; - private static final byte BACKSLASH = '\\'; - private static final byte QUOTE = '"'; - private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize(); - private static final byte[] ESCAPE_MAP = new byte[]{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. - 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. - 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. - - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; private final Tape tape; private final byte[] stringBuffer; private final OpenContainer[] openContainers; private final int padding; private final NumberParser numberParser; - - private int stringBufferIdx; + private final StringParser stringParser; TapeBuilder(int capacity, int depth, int padding) { this.tape = new Tape(capacity); @@ -60,6 +32,7 @@ class TapeBuilder { } this.stringBuffer = new byte[capacity]; this.numberParser = new NumberParser(tape); + this.stringParser = new StringParser(tape, stringBuffer); } void visitDocumentStart() { @@ -193,56 +166,7 @@ void visitKey(byte[] buffer, int idx) { } private void visitString(byte[] buffer, int idx) { - tape.append(stringBufferIdx, STRING); - int src = idx + 1; - int dst = stringBufferIdx + Integer.BYTES; - while (true) { - ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src); - srcVec.intoArray(stringBuffer, dst); - long backslashBits = srcVec.eq(BACKSLASH).toLong(); - long quoteBits = srcVec.eq(QUOTE).toLong(); - - if (hasQuoteFirst(backslashBits, quoteBits)) { - dst += Long.numberOfTrailingZeros(quoteBits); - break; - } - if (hasBackslash(backslashBits, quoteBits)) { - int backslashDist = Long.numberOfTrailingZeros(backslashBits); - byte escapeChar = buffer[src + backslashDist + 1]; - if (escapeChar == 'u') { - throw new UnsupportedOperationException("Support for unicode characters is not implemented yet."); - } else { - stringBuffer[dst + backslashDist] = escape(escapeChar); - src += backslashDist + 2; - dst += backslashDist + 1; - } - } else { - src += BYTES_PROCESSED; - dst += BYTES_PROCESSED; - } - } - int len = dst - stringBufferIdx - Integer.BYTES; - IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx); - stringBufferIdx = dst; - } - - private byte escape(byte escapeChar) { - if (escapeChar < 0) { - throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar)); - } - byte escapeResult = ESCAPE_MAP[escapeChar]; - if (escapeResult == 0) { - throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar)); - } - return escapeResult; - } - - private boolean hasQuoteFirst(long backslashBits, long quoteBits) { - return ((backslashBits - 1) & quoteBits) != 0; - } - - private boolean hasBackslash(long backslashBits, long quoteBits) { - return ((quoteBits - 1) & backslashBits) != 0; + stringParser.parseString(buffer, idx); } private void visitNumber(byte[] buffer, int idx) { @@ -278,7 +202,7 @@ private void emptyContainer(char start, char end) { void reset() { tape.reset(); - stringBufferIdx = 0; + stringParser.reset(); } JsonValue createJsonValue(byte[] buffer) { diff --git a/src/test/java/org/simdjson/JsonValueAssert.java b/src/test/java/org/simdjson/JsonValueAssert.java new file mode 100644 index 0000000..078c09d --- /dev/null +++ b/src/test/java/org/simdjson/JsonValueAssert.java @@ -0,0 +1,80 @@ +package org.simdjson; + +import org.assertj.core.api.AbstractAssert; +import org.assertj.core.api.Assertions; + +import static java.nio.charset.StandardCharsets.UTF_8; + +class JsonValueAssert extends AbstractAssert { + + JsonValueAssert(JsonValue actual) { + super(actual, JsonValueAssert.class); + } + + static JsonValueAssert assertThat(JsonValue actual) { + return new JsonValueAssert(actual); + } + + JsonValueAssert isEqualTo(long expected) { + Assertions.assertThat(actual.isLong()) + .withFailMessage("Expecting value to be long but was " + getActualType()) + .isTrue(); + Assertions.assertThat(actual.asLong()).isEqualTo(expected); + return this; + } + + JsonValueAssert isEqualTo(Double expected) { + Assertions.assertThat(actual.isDouble()) + .withFailMessage("Expecting value to be double but was " + getActualType()) + .isTrue(); + Assertions.assertThat(actual.asDouble()).isEqualTo(expected); + return this; + } + + JsonValueAssert isEqualTo(String expected) { + Assertions.assertThat(actual.isString()) + .withFailMessage("Expecting value to be string but was " + getActualType()) + .isTrue(); + Assertions.assertThat(actual.asString()).isEqualTo(expected); + CharSequence cs = actual.asCharSequence(); + byte[] bytesExpected = expected.getBytes(UTF_8); + Assertions.assertThat(cs.length()).isEqualTo(bytesExpected.length); + for (int i = 0; i < cs.length(); i++) { + Assertions.assertThat((byte) cs.charAt(i)).isEqualTo(bytesExpected[i]); + } + return this; + } + + JsonValueAssert isEqualTo(boolean expected) { + Assertions.assertThat(actual.isBoolean()) + .withFailMessage("Expecting value to be boolean but was " + getActualType()) + .isTrue(); + Assertions.assertThat(actual.asBoolean()).isEqualTo(expected); + return this; + } + + private String getActualType() { + if (actual.isArray()) { + return "array"; + } + if (actual.isString()) { + return "string"; + } + if (actual.isLong()) { + return "long"; + } + if (actual.isBoolean()) { + return "boolean"; + } + if (actual.isDouble()) { + return "double"; + } + if (actual.isNull()) { + return "null"; + } + if (actual.isObject()) { + return "object"; + } + return "unknown type"; + } +} diff --git a/src/test/java/org/simdjson/NumberParsingTest.java b/src/test/java/org/simdjson/NumberParsingTest.java index f518b48..d1e4756 100644 --- a/src/test/java/org/simdjson/NumberParsingTest.java +++ b/src/test/java/org/simdjson/NumberParsingTest.java @@ -16,6 +16,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.simdjson.JsonValueAssert.assertThat; import static org.simdjson.StringUtils.toUtf8; public class NumberParsingTest { @@ -137,7 +138,7 @@ public void minusZeroIsTreatedAsIntegerZero() { JsonValue value = parser.parse(json, json.length); // then - assertLong(value, 0); + assertThat(value).isEqualTo(0); } @Test @@ -186,7 +187,7 @@ public void minMaxLongValue(long input) { JsonValue jsonValue = parser.parse(json, json.length); // then - assertLong(jsonValue, input); + assertThat(jsonValue).isEqualTo(input); } @ParameterizedTest @@ -224,7 +225,7 @@ public void exponentWithMoreDigitsThanLongCanAccommodateAndLeadingZeros(String i JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -245,7 +246,7 @@ public void exponentWithMoreDigitsThanLongCanAccommodate(String input, double ex JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -270,7 +271,7 @@ public void positiveInfinity(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, Double.POSITIVE_INFINITY); + assertThat(value).isEqualTo(Double.POSITIVE_INFINITY); } @ParameterizedTest @@ -295,7 +296,7 @@ public void negativeInfinity(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, Double.NEGATIVE_INFINITY); + assertThat(value).isEqualTo(Double.NEGATIVE_INFINITY); } @ParameterizedTest @@ -321,7 +322,7 @@ public void positiveZero(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 0.0); + assertThat(value).isEqualTo(0.0); } @ParameterizedTest @@ -347,7 +348,7 @@ public void negativeZero(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, -0.0); + assertThat(value).isEqualTo(-0.0); } @ParameterizedTest @@ -366,8 +367,8 @@ public void roundingOverflow(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 7.2057594037927933e16); - assertDouble(value, 7.2057594037927936e16); + assertThat(value).isEqualTo(7.2057594037927933e16); + assertThat(value).isEqualTo(7.2057594037927936e16); } @ParameterizedTest @@ -387,7 +388,7 @@ public void minNormalDouble(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 0x1p-1022); + assertThat(value).isEqualTo(0x1p-1022); } @ParameterizedTest @@ -408,7 +409,7 @@ public void maxSubnormalDouble(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 0x0.fffffffffffffp-1022); + assertThat(value).isEqualTo(0x0.fffffffffffffp-1022); } @ParameterizedTest @@ -427,7 +428,7 @@ public void minSubnormalDouble(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 0x0.0000000000001p-1022); + assertThat(value).isEqualTo(0x0.0000000000001p-1022); } @ParameterizedTest @@ -444,7 +445,7 @@ public void maxDouble(String input) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, 0x1.fffffffffffffp+1023); + assertThat(value).isEqualTo(0x1.fffffffffffffp+1023); } @ParameterizedTest @@ -470,7 +471,7 @@ public void roundTiesToEven(String input, double expected) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -496,7 +497,7 @@ public void roundUpToNearest(String input, double expected) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -520,7 +521,7 @@ public void roundDownToNearest(String input, double expected) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -536,7 +537,7 @@ public void exactDouble(String input, double expected) { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } @ParameterizedTest @@ -558,7 +559,7 @@ public void testFiles(File file) throws IOException { JsonValue value = parser.parse(json, json.length); // then - assertDouble(value, expected); + assertThat(value).isEqualTo(expected); } } } @@ -586,14 +587,4 @@ private static List listTestFiles() throws IOException { .filter(File::isFile) .toList(); } - - private static void assertLong(JsonValue actual, long expected) { - assertThat(actual.isLong()).isTrue(); - assertThat(actual.asLong()).isEqualTo(expected); - } - - private static void assertDouble(JsonValue actual, Double expected) { - assertThat(actual.isDouble()).isTrue(); - assertThat(actual.asDouble()).isEqualTo(expected); - } } diff --git a/src/test/java/org/simdjson/SimdJsonParserTest.java b/src/test/java/org/simdjson/SimdJsonParserTest.java index a68a682..97a2316 100644 --- a/src/test/java/org/simdjson/SimdJsonParserTest.java +++ b/src/test/java/org/simdjson/SimdJsonParserTest.java @@ -7,10 +7,10 @@ import java.util.Iterator; import java.util.Map; -import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.fail; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.simdjson.JsonValueAssert.assertThat; import static org.simdjson.StringUtils.toUtf8; public class SimdJsonParserTest { @@ -92,8 +92,8 @@ public void testObjectIterator() { while (it.hasNext()) { Map.Entry field = it.next(); CharSequence key = field.getKey(); - assertString(key, expectedKeys[counter]); - assertLong(field.getValue(), expectedValue[counter]); + assertThat(key).usingComparator(CharSequence::compare).isEqualTo(expectedKeys[counter]); + assertThat(field.getValue()).isEqualTo(expectedValue[counter]); counter++; } assertThat(counter).isEqualTo(expectedKeys.length); @@ -112,8 +112,8 @@ public void testBooleanValues() { assertThat(jsonValue.isArray()).isTrue(); Iterator it = jsonValue.arrayIterator(); assertThat(it.hasNext()).isTrue(); - assertBoolean(it.next(), true); - assertBoolean(it.next(), false); + assertThat(it.next()).isEqualTo(true); + assertThat(it.next()).isEqualTo(false); assertThat(it.hasNext()).isFalse(); } @@ -128,7 +128,7 @@ public void testBooleanValuesAsRoot(boolean booleanVal) { JsonValue jsonValue = parser.parse(json, json.length); // then - assertBoolean(jsonValue, booleanVal); + assertThat(jsonValue).isEqualTo(booleanVal); } @Test @@ -175,8 +175,8 @@ public void testStringValues() { assertThat(jsonValue.isArray()).isTrue(); Iterator it = jsonValue.arrayIterator(); assertThat(it.hasNext()).isTrue(); - assertString(it.next(), "abc"); - assertString(it.next(), "ab\\c"); + assertThat(it.next()).isEqualTo("abc"); + assertThat(it.next()).isEqualTo("ab\\c"); assertThat(it.hasNext()).isFalse(); } @@ -191,7 +191,7 @@ public void testStringValuesAsRoot(String jsonStr) { JsonValue jsonValue = parser.parse(json, json.length); // then - assertString(jsonValue, jsonStr); + assertThat(jsonValue).isEqualTo(jsonStr); } @Test @@ -207,10 +207,10 @@ public void testNumericValues() { assertThat(jsonValue.isArray()).isTrue(); Iterator it = jsonValue.arrayIterator(); assertThat(it.hasNext()).isTrue(); - assertLong(it.next(), 0); - assertLong(it.next(), 1); - assertLong(it.next(), -1); - assertDouble(it.next(), "1.1"); + assertThat(it.next()).isEqualTo(0); + assertThat(it.next()).isEqualTo(1); + assertThat(it.next()).isEqualTo(-1); + assertThat(it.next()).isEqualTo(1.1); assertThat(it.hasNext()).isFalse(); } @@ -225,7 +225,7 @@ public void testLongValuesAsRoot(String longStr) { JsonValue jsonValue = parser.parse(json, json.length); // then - assertLong(jsonValue, Long.parseLong(longStr)); + assertThat(jsonValue).isEqualTo(Long.parseLong(longStr)); } @ParameterizedTest @@ -239,7 +239,7 @@ public void testDoubleValuesAsRoot(String doubleStr) { JsonValue jsonValue = parser.parse(json, json.length); // then - assertDouble(jsonValue, doubleStr); + assertThat(jsonValue).isEqualTo(Double.parseDouble(doubleStr)); } @ParameterizedTest @@ -299,50 +299,6 @@ public void testInvalidTrue(String jsonStr) { assertThat(ex.getMessage()).isEqualTo("Invalid value starting at " + jsonStr.indexOf('t') + ". Expected 'true'."); } - @Test - public void testUnicodeString() { - // given - SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8("[\"\\u005C\"]"); - - // when - UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, () -> parser.parse(json, json.length)); - - // then - assertThat(ex.getMessage()).isEqualTo("Support for unicode characters is not implemented yet."); - } - - @ParameterizedTest - @ValueSource(strings = {"\\g", "\\ą"}) - public void testInvalidEscape(String jsonStr) { - // given - SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8("[\"" + jsonStr + "\"]"); - - // when - JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); - - // then - assertThat(ex.getMessage()).startsWith("Escaped unexpected character: "); - } - - @Test - public void testLongString() { - // given - SimdJsonParser parser = new SimdJsonParser(); - byte[] json = toUtf8("[\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"]"); - - // when - JsonValue jsonValue = parser.parse(json, json.length); - - // then - assertThat(jsonValue.isArray()).isTrue(); - Iterator it = jsonValue.arrayIterator(); - assertThat(it.hasNext()).isTrue(); - assertString(it.next(), "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - assertThat(it.hasNext()).isFalse(); - } - @Test public void testArraySize() { // given @@ -393,33 +349,4 @@ public void testLargeArraySize() { assertThat(jsonValue.isArray()).isTrue(); assertThat(jsonValue.getSize()).isEqualTo(0xFFFFFF); } - - private static void assertString(JsonValue actual, String expected) { - assertThat(actual.isString()).isTrue(); - assertThat(actual.asString()).isEqualTo(expected); - assertString(actual.asCharSequence(), expected); - } - - private static void assertString(CharSequence actual, String expected) { - byte[] bytesExpected = expected.getBytes(UTF_8); - assertThat(actual.length()).isEqualTo(bytesExpected.length); - for (int i = 0; i < actual.length(); i++) { - assertThat((byte) actual.charAt(i)).isEqualTo(bytesExpected[i]); - } - } - - private static void assertBoolean(JsonValue actual, boolean expected) { - assertThat(actual.isBoolean()).isTrue(); - assertThat(actual.asBoolean()).isEqualTo(expected); - } - - private static void assertLong(JsonValue actual, long expected) { - assertThat(actual.isLong()).isTrue(); - assertThat(actual.asLong()).isEqualTo(expected); - } - - private static void assertDouble(JsonValue actual, String str) { - assertThat(actual.isDouble()).isTrue(); - assertThat(actual.asDouble()).isEqualTo(Double.valueOf(str)); - } } diff --git a/src/test/java/org/simdjson/StringParsingTest.java b/src/test/java/org/simdjson/StringParsingTest.java new file mode 100644 index 0000000..e9753c7 --- /dev/null +++ b/src/test/java/org/simdjson/StringParsingTest.java @@ -0,0 +1,167 @@ +package org.simdjson; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.Iterator; +import java.util.List; + +import static java.lang.Character.MAX_CODE_POINT; +import static java.lang.Character.isBmpCodePoint; +import static java.lang.Character.lowSurrogate; +import static java.util.stream.IntStream.rangeClosed; +import static org.apache.commons.text.StringEscapeUtils.unescapeJava; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.simdjson.JsonValueAssert.assertThat; +import static org.simdjson.StringUtils.toUtf8; + +public class StringParsingTest { + + @Test + public void usableUnicodeCharacters() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List unicodeCharacters = rangeClosed(0, MAX_CODE_POINT) + .filter(Character::isDefined) + .filter(codePoint -> !isReservedCodePoint(codePoint)) + .mapToObj(StringParsingTest::toUnicodeEscape) + .toList(); + + for (String input : unicodeCharacters) { + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonValue value = parser.parse(json, json.length); + + // then + assertThat(value).isEqualTo(unescapeJava(input)); + } + } + + @Test + public void unicodeCharactersReservedForLowSurrogate() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List unicodeCharacters = rangeClosed(0xDC00, 0xDFFF) + .mapToObj(StringParsingTest::toUnicodeEscape) + .toList(); + + for (String input : unicodeCharacters) { + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).isEqualTo("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate."); + } + } + + @ParameterizedTest + @ValueSource(strings = {"\\uD8001", "\\uD800\\1", "\\uD800u", "\\uD800\\e", "\\uD800\\DC00"}) + public void invalidLowSurrogateEscape(String input) { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).isEqualTo("Low surrogate should start with '\\u'"); + } + + @ParameterizedTest + @ValueSource(strings = {"\\uD800\\u"}) + public void missingLowSurrogate(String input) { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).isEqualTo("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF."); + } + + @Test + public void invalidLowSurrogateRange() { + // given + SimdJsonParser parser = new SimdJsonParser(); + List unicodeCharacters = rangeClosed(0x0000, 0xFFFF) + .filter(lowSurrogate -> lowSurrogate < 0xDC00 || lowSurrogate > 0xDFFF) + .mapToObj(lowSurrogate -> String.format("\\uD800\\u%04X", lowSurrogate)) + .toList(); + + for (String input : unicodeCharacters) { + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).isEqualTo("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF."); + } + } + + @ParameterizedTest + @ValueSource(strings = {"\\u", "\\u1", "\\u12", "\\u123"}) + public void invalidUnicode(String input) { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] json = toUtf8("\"" + input + "\""); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).isEqualTo("Invalid unicode escape sequence."); + } + + @ParameterizedTest + @ValueSource(strings = {"\\g", "\\ą"}) + public void invalidEscape(String jsonStr) { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] json = toUtf8("[\"" + jsonStr + "\"]"); + + // when + JsonParsingException ex = assertThrows(JsonParsingException.class, () -> parser.parse(json, json.length)); + + // then + assertThat(ex.getMessage()).startsWith("Escaped unexpected character: "); + } + + @Test + public void longString() { + // given + SimdJsonParser parser = new SimdJsonParser(); + byte[] json = toUtf8("[\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"]"); + + // when + JsonValue jsonValue = parser.parse(json, json.length); + + // then + assertThat(jsonValue.isArray()).isTrue(); + Iterator it = jsonValue.arrayIterator(); + assertThat(it.hasNext()).isTrue(); + assertThat(it.next()).isEqualTo("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + assertThat(it.hasNext()).isFalse(); + } + + private static String toUnicodeEscape(int codePoint) { + if (isBmpCodePoint(codePoint)) { + return String.format("\\u%04X", codePoint); + } else { + return String.format("\\u%04X\\u%04X", + (int) Character.highSurrogate(codePoint), (int) lowSurrogate(codePoint)); + } + } + + private static boolean isReservedCodePoint(int codePoint) { + return codePoint >= 0xD800 && codePoint <= 0xDFFF; + } +}