Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON pars
based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318)
by Geoff Langdale and Daniel Lemire.

This implementation is still missing several features available in simdjson. For example:

* Support for Unicode characters
* UTF-8 validation
* Support for 512-bit vectors

## Code Sample

```java
Expand Down Expand Up @@ -73,8 +67,8 @@ This section presents a performance comparison of different JSON parsers availab
the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing
and finding all unique users with a default profile.

**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
may not reflect its real performance.**
**Note that simdjson-java is still missing several features (see [GitHub Issues](https://github.com/simdjson/simdjson-java/issues)),
so the following results may not reflect its real performance.**

Environment:
* CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
Expand Down
9 changes: 7 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,18 @@ java {

ext {
junitVersion = '5.9.1'
jsoniterScalaVersion = '2.23.2'
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: '2.23.2'
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: '2.23.2'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
Expand All @@ -74,6 +76,9 @@ test {
'--add-modules', 'jdk.incubator.vector',
'-Xmx2g'
]
testLogging {
events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR'
}
}

tasks.withType(JmhBytecodeGeneratorTask).configureEach {
Expand Down
2 changes: 1 addition & 1 deletion src/jmh/java/org/simdjson/ParseBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseBenchmark {

@Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
@Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"})
String fileName;

private final SimdJsonParser simdJsonParser = new SimdJsonParser();
Expand Down
252 changes: 252 additions & 0 deletions src/main/java/org/simdjson/CharacterUtils.java

Large diffs are not rendered by default.

49 changes: 0 additions & 49 deletions src/main/java/org/simdjson/JsonCharUtils.java

This file was deleted.

2 changes: 1 addition & 1 deletion src/main/java/org/simdjson/NumberParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import static java.lang.Long.remainderUnsigned;
import static java.lang.Math.abs;
import static java.lang.Math.unsignedMultiplyHigh;
import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
import static org.simdjson.NumberParserTables.NUMBER_OF_ADDITIONAL_DIGITS_AFTER_LEFT_SHIFT;
import static org.simdjson.NumberParserTables.POWERS_OF_FIVE;
import static org.simdjson.NumberParserTables.POWER_OF_FIVE_DIGITS;
Expand Down
127 changes: 127 additions & 0 deletions src/main/java/org/simdjson/StringParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;

import static org.simdjson.CharacterUtils.escape;
import static org.simdjson.CharacterUtils.hexToInt;
import static org.simdjson.Tape.STRING;

/**
 * Copies JSON string literals from the input buffer into a shared string buffer,
 * resolving backslash escapes (including {@code \}{@code uXXXX} sequences and UTF-16
 * surrogate pairs) on the way, and records each string on the {@link Tape}.
 *
 * <p>Each parsed string occupies {@code Integer.BYTES} bytes of length information
 * (written with {@code IntegerUtils.toBytes}) followed by the unescaped content.
 * The instance keeps a write cursor into the string buffer; call {@link #reset()}
 * before reusing it for another document.
 */
class StringParser {

    private static final byte BACKSLASH = '\\';
    private static final byte QUOTE = '"';
    private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
    private static final int MIN_HIGH_SURROGATE = 0xD800;
    private static final int MAX_HIGH_SURROGATE = 0xDBFF;
    private static final int MIN_LOW_SURROGATE = 0xDC00;
    private static final int MAX_LOW_SURROGATE = 0xDFFF;
    private static final int MAX_CODE_POINT = 0x10FFFF;

    private final Tape tape;
    private final byte[] stringBuffer;

    // Offset in stringBuffer at which the next string record (length + content) starts.
    private int stringBufferIdx;

    StringParser(Tape tape, byte[] stringBuffer) {
        this.tape = tape;
        this.stringBuffer = stringBuffer;
    }

    /**
     * Parses the string literal located at {@code idx} and appends it to the string buffer.
     *
     * <p>NOTE(review): reads start at {@code idx + 1}, so {@code idx} is presumably the index
     * of the opening quote. Whole vectors are loaded from {@code buffer} and stored into the
     * string buffer, so both arrays are assumed to have enough slack past the string's end
     * for a full vector access - confirm against the callers.
     *
     * @param buffer bytes of the JSON document
     * @param idx    index in {@code buffer} from which the string literal starts
     * @throws JsonParsingException if a unicode escape sequence or surrogate pair is invalid
     */
    void parseString(byte[] buffer, int idx) {
        tape.append(stringBufferIdx, STRING);
        int src = idx + 1;
        // Leave room for the length, which is only known once the closing quote is found.
        int dst = stringBufferIdx + Integer.BYTES;
        while (true) {
            // Copy a whole vector eagerly; bytes after an escape or the closing quote are
            // overwritten by later iterations or simply ignored.
            ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
            srcVec.intoArray(stringBuffer, dst);
            long backslashBits = srcVec.eq(BACKSLASH).toLong();
            long quoteBits = srcVec.eq(QUOTE).toLong();

            if (hasQuoteFirst(backslashBits, quoteBits)) {
                // Closing quote reached before any escape: the string ends here.
                dst += Long.numberOfTrailingZeros(quoteBits);
                break;
            }
            if (hasBackslash(backslashBits, quoteBits)) {
                int backslashDist = Long.numberOfTrailingZeros(backslashBits);
                byte escapeChar = buffer[src + backslashDist + 1];
                if (escapeChar == 'u') {
                    src += backslashDist;
                    dst += backslashDist;
                    // Skip the backslash and 'u' (2 bytes) to reach the four hex digits.
                    int codePoint = hexToInt(buffer, src + 2);
                    // A unicode escape is 6 bytes long: backslash, 'u' and four hex digits.
                    src += 6;
                    if (codePoint >= MIN_HIGH_SURROGATE && codePoint <= MAX_HIGH_SURROGATE) {
                        // A high surrogate must be immediately followed by an escaped low surrogate.
                        codePoint = parseLowSurrogate(buffer, src, codePoint);
                        src += 6;
                    } else if (codePoint >= MIN_LOW_SURROGATE && codePoint <= MAX_LOW_SURROGATE) {
                        throw new JsonParsingException("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate.");
                    }
                    dst += storeCodePointInStringBuffer(codePoint, dst);
                } else {
                    // Two-byte escape collapses into a single output byte.
                    stringBuffer[dst + backslashDist] = escape(escapeChar);
                    src += backslashDist + 2;
                    dst += backslashDist + 1;
                }
            } else {
                // Neither a quote nor a backslash in this vector: everything copied is content.
                src += BYTES_PROCESSED;
                dst += BYTES_PROCESSED;
            }
        }
        int len = dst - stringBufferIdx - Integer.BYTES;
        IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
        stringBufferIdx = dst;
    }

    /**
     * Reads the escaped low surrogate expected at {@code src} and combines it with the given
     * high surrogate into a single supplementary code point.
     *
     * @param buffer    bytes of the JSON document
     * @param src       index at which the low surrogate escape must start
     * @param codePoint previously parsed high surrogate (U+D800-U+DBFF)
     * @return the combined code point
     * @throws JsonParsingException if the escape prefix is missing or the value is not in
     *                              the range U+DC00-U+DFFF
     */
    private static int parseLowSurrogate(byte[] buffer, int src, int codePoint) {
        if (buffer[src] != BACKSLASH || buffer[src + 1] != 'u') {
            throw new JsonParsingException("Low surrogate should start with '\\u'");
        }
        int codePoint2 = hexToInt(buffer, src + 2);
        int lowBit = codePoint2 - MIN_LOW_SURROGATE;
        // Valid low surrogates leave exactly 10 significant bits after subtracting the base;
        // anything below the base (negative) or above the range fails this check.
        if (lowBit >> 10 == 0) {
            return (((codePoint - MIN_HIGH_SURROGATE) << 10) | lowBit) + 0x10000;
        }
        throw new JsonParsingException("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF.");
    }

    /**
     * Writes the UTF-8 encoding of {@code codePoint} into the string buffer at {@code dst}.
     *
     * @param codePoint code point to encode
     * @param dst       index in the string buffer at which the first byte is written
     * @return number of bytes written (1 to 4)
     * @throws JsonParsingException  if {@code codePoint} is negative (presumably how
     *                               {@code hexToInt} reports invalid hex digits - TODO confirm)
     * @throws IllegalStateException if {@code codePoint} is greater than U+10FFFF
     */
    private int storeCodePointInStringBuffer(int codePoint, int dst) {
        if (codePoint < 0) {
            throw new JsonParsingException("Invalid unicode escape sequence.");
        }
        if (codePoint <= 0x7F) {
            stringBuffer[dst] = (byte) codePoint;
            return 1;
        }
        if (codePoint <= 0x7FF) {
            stringBuffer[dst] = (byte) ((codePoint >> 6) + 192);
            stringBuffer[dst + 1] = (byte) ((codePoint & 63) + 128);
            return 2;
        }
        if (codePoint <= 0xFFFF) {
            stringBuffer[dst] = (byte) ((codePoint >> 12) + 224);
            stringBuffer[dst + 1] = (byte) (((codePoint >> 6) & 63) + 128);
            stringBuffer[dst + 2] = (byte) ((codePoint & 63) + 128);
            return 3;
        }
        if (codePoint <= MAX_CODE_POINT) {
            stringBuffer[dst] = (byte) ((codePoint >> 18) + 240);
            stringBuffer[dst + 1] = (byte) (((codePoint >> 12) & 63) + 128);
            stringBuffer[dst + 2] = (byte) (((codePoint >> 6) & 63) + 128);
            stringBuffer[dst + 3] = (byte) ((codePoint & 63) + 128);
            return 4;
        }
        throw new IllegalStateException("Code point is greater than 0x10FFFF.");
    }

    /**
     * Returns {@code true} when a quote is present and no backslash occurs at a lower
     * bit position than the first quote.
     */
    private static boolean hasQuoteFirst(long backslashBits, long quoteBits) {
        // (backslashBits - 1) keeps only the bits below the lowest backslash
        // (or all bits when there is no backslash at all).
        return ((backslashBits - 1) & quoteBits) != 0;
    }

    /**
     * Returns {@code true} when a backslash is present and no quote occurs at a lower
     * bit position than the first backslash.
     */
    private static boolean hasBackslash(long backslashBits, long quoteBits) {
        return ((quoteBits - 1) & backslashBits) != 0;
    }

    /** Rewinds the string buffer cursor so the next parsed string is written from offset 0. */
    void reset() {
        stringBufferIdx = 0;
    }
}
86 changes: 5 additions & 81 deletions src/main/java/org/simdjson/TapeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,55 +1,27 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;

import java.util.Arrays;

import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
import static org.simdjson.Tape.END_ARRAY;
import static org.simdjson.Tape.END_OBJECT;
import static org.simdjson.Tape.FALSE_VALUE;
import static org.simdjson.Tape.NULL_VALUE;
import static org.simdjson.Tape.ROOT;
import static org.simdjson.Tape.START_ARRAY;
import static org.simdjson.Tape.START_OBJECT;
import static org.simdjson.Tape.STRING;
import static org.simdjson.Tape.TRUE_VALUE;

class TapeBuilder {

private static final byte SPACE = 0x20;
private static final byte BACKSLASH = '\\';
private static final byte QUOTE = '"';
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
private static final byte[] ESCAPE_MAP = new byte[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

private final Tape tape;
private final byte[] stringBuffer;
private final OpenContainer[] openContainers;
private final int padding;
private final NumberParser numberParser;

private int stringBufferIdx;
private final StringParser stringParser;

TapeBuilder(int capacity, int depth, int padding) {
this.tape = new Tape(capacity);
Expand All @@ -60,6 +32,7 @@ class TapeBuilder {
}
this.stringBuffer = new byte[capacity];
this.numberParser = new NumberParser(tape);
this.stringParser = new StringParser(tape, stringBuffer);
}

void visitDocumentStart() {
Expand Down Expand Up @@ -193,56 +166,7 @@ void visitKey(byte[] buffer, int idx) {
}

private void visitString(byte[] buffer, int idx) {
tape.append(stringBufferIdx, STRING);
int src = idx + 1;
int dst = stringBufferIdx + Integer.BYTES;
while (true) {
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
srcVec.intoArray(stringBuffer, dst);
long backslashBits = srcVec.eq(BACKSLASH).toLong();
long quoteBits = srcVec.eq(QUOTE).toLong();

if (hasQuoteFirst(backslashBits, quoteBits)) {
dst += Long.numberOfTrailingZeros(quoteBits);
break;
}
if (hasBackslash(backslashBits, quoteBits)) {
int backslashDist = Long.numberOfTrailingZeros(backslashBits);
byte escapeChar = buffer[src + backslashDist + 1];
if (escapeChar == 'u') {
throw new UnsupportedOperationException("Support for unicode characters is not implemented yet.");
} else {
stringBuffer[dst + backslashDist] = escape(escapeChar);
src += backslashDist + 2;
dst += backslashDist + 1;
}
} else {
src += BYTES_PROCESSED;
dst += BYTES_PROCESSED;
}
}
int len = dst - stringBufferIdx - Integer.BYTES;
IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
stringBufferIdx = dst;
}

private byte escape(byte escapeChar) {
if (escapeChar < 0) {
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
}
byte escapeResult = ESCAPE_MAP[escapeChar];
if (escapeResult == 0) {
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
}
return escapeResult;
}

private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
return ((backslashBits - 1) & quoteBits) != 0;
}

private boolean hasBackslash(long backslashBits, long quoteBits) {
return ((quoteBits - 1) & backslashBits) != 0;
stringParser.parseString(buffer, idx);
}

private void visitNumber(byte[] buffer, int idx) {
Expand Down Expand Up @@ -278,7 +202,7 @@ private void emptyContainer(char start, char end) {

void reset() {
tape.reset();
stringBufferIdx = 0;
stringParser.reset();
}

JsonValue createJsonValue(byte[] buffer) {
Expand Down
Loading