Skip to content

Commit

Permalink
Write 4-byte characters (surrogate pairs) instead of escapes
Browse files Browse the repository at this point in the history
  • Loading branch information
rnetuka committed Sep 16, 2024
1 parent 89b2381 commit 6fad16d
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 17 deletions.
13 changes: 12 additions & 1 deletion src/main/java/com/fasterxml/jackson/core/JsonGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,18 @@ public enum Feature {
*
* @since 2.17
*/
ESCAPE_FORWARD_SLASHES(false);
ESCAPE_FORWARD_SLASHES(false),

/**
* Feature that specifies how characters that are represented as a surrogate pair in Java (and take
* 4 bytes when encoded in UTF-8) should be written by {@link JsonGenerator}. If enabled, the two
* surrogates are combined and written as a single 4-byte UTF-8 encoded character.
* If disabled, each surrogate of the pair is written as its own 6-character JSON Unicode escape.
* <p>
* Feature is disabled by default
*
* @since 2.18
*/
COMBINE_UNICODE_SURROGATES(false);

private final boolean _defaultState;
private final int _mask;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.io.CharTypes;
Expand Down Expand Up @@ -659,6 +660,10 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
_outputBuffer[_outputTail++] = _quoteChar;
}

/**
 * Checks whether given character can start a surrogate pair, that is, whether it
 * is a UTF-16 high (leading) surrogate in range 0xD800 - 0xDBFF.
 *<p>
 * A plain mask test against 0xD800 is not sufficient here: it also accepts low
 * surrogates (0xDC00 - 0xDFFF) and regular characters in range 0xF800 - 0xFFFF,
 * which would make callers consume the following character as if it was
 * the second half of a pair.
 *
 * @param ch Character to check
 *
 * @return {@code true} if {@code ch} is a high surrogate; {@code false} otherwise
 */
private boolean isSurrogatePair(char ch) {
    return Character.isHighSurrogate(ch);
}

/*
/**********************************************************
/* Output method implementations, unprocessed ("raw")
Expand Down Expand Up @@ -1489,6 +1494,8 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
final byte[] outputBuffer = _outputBuffer;
final int[] escCodes = _outputEscapes;

boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features);

while (offset < end) {
int ch = cbuf[offset++];
if (ch <= 0x7F) {
Expand All @@ -1510,7 +1517,14 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
outputPtr = _outputMultiByteChar(ch, outputPtr);
// multibyte character
if (combineSurrogates && isSurrogatePair((char) ch) && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = cbuf[offset++];
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
} else {
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
}
_outputTail = outputPtr;
Expand All @@ -1527,6 +1541,8 @@ private final void _writeStringSegment2(final String text, int offset, final int
final byte[] outputBuffer = _outputBuffer;
final int[] escCodes = _outputEscapes;

boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features);

while (offset < end) {
int ch = text.charAt(offset++);
if (ch <= 0x7F) {
Expand All @@ -1548,7 +1564,14 @@ private final void _writeStringSegment2(final String text, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
outputPtr = _outputMultiByteChar(ch, outputPtr);
// multibyte character
if (combineSurrogates && isSurrogatePair((char) ch) && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = text.charAt(offset++);
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
} else {
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
}
_outputTail = outputPtr;
Expand Down Expand Up @@ -2133,6 +2156,13 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
}

/**
 * Writes UTF-8 encoding of the character formed by given two surrogates into
 * {@code _outputBuffer}, starting at {@code outputPtr}.
 *<p>
 * Well-formed pairs are encoded directly as 4 bytes, without allocating a temporary
 * String and byte array for every pair. Anything that is not a valid
 * high/low surrogate pair is delegated to the JDK UTF-8 encoder, so that output for
 * malformed input stays exactly what {@code String.getBytes()} produces.
 *<p>
 * NOTE: no capacity check is done here; presumably caller has ensured that
 * {@code _outputBuffer} has enough room -- verify against callers.
 *
 * @param highSurrogate First (leading) character of the pair
 * @param lowSurrogate Second (trailing) character of the pair
 * @param outputPtr Offset in {@code _outputBuffer} to write the first byte at
 *
 * @return Offset in {@code _outputBuffer} right after the last byte written
 */
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
    if (Character.isSurrogatePair(highSurrogate, lowSurrogate)) {
        // Supplementary code point (0x10000 - 0x10FFFF): always 4 bytes in UTF-8
        final int c = Character.toCodePoint(highSurrogate, lowSurrogate);
        final byte[] bbuf = _outputBuffer;
        bbuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
        bbuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
        bbuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
        bbuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        return outputPtr;
    }
    // Not a valid pair: let the JDK encoder decide what to emit
    String s = String.valueOf(highSurrogate) + lowSurrogate;
    byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
    System.arraycopy(bytes, 0, _outputBuffer, outputPtr, bytes.length);
    return outputPtr + bytes.length;
}

/**
*
* @param ch
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.fasterxml.jackson.failing;
package com.fasterxml.jackson.core.json;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
Expand All @@ -8,6 +8,7 @@

import org.junit.jupiter.api.Test;

import static com.fasterxml.jackson.core.JsonGenerator.Feature;
import static org.junit.jupiter.api.Assertions.assertEquals;

class Surrogate223Test extends JUnit5TestBase
Expand All @@ -27,7 +28,7 @@ void surrogatesByteBacked() throws Exception
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);
g = JSON_F.createGenerator(out).enable(Feature.COMBINE_UNICODE_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
Expand All @@ -43,8 +44,7 @@ void surrogatesByteBacked() throws Exception

// but may revert back to original behavior
out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
g = JSON_F.createGenerator(out).disable(Feature.COMBINE_UNICODE_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
Expand Down Expand Up @@ -78,15 +78,5 @@ void surrogatesCharBacked() throws Exception
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new StringWriter();
g = JSON_F.createGenerator(out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
}
}

0 comments on commit 6fad16d

Please sign in to comment.