Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve JSON parser demo #314

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 142 additions & 43 deletions Sources/swift-parsing-benchmark/JSON.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ import Parsing

/// This benchmark shows how to create a naive JSON parser with combinators.
///
/// It is mostly implemented according to the [spec](https://www.json.org/json-en.html) (we take a
/// shortcut and use `Double.parser()`, which behaves accordingly).
/// It is implemented according to the [spec](https://www.json.org/json-en.html).
let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
#if swift(>=5.8)
struct JSONValue: ParserPrinter {
enum Output: Equatable {
case array([Self])
case boolean(Bool)
case float(Double)
case integer(Int)
case null
case number(Double)
Comment on lines +14 to -16
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this change? I was under the impression that JSON/JavaScript only has the concept of a "number" and doesn't distinguish integer and float values.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm yes, true. JSON only has the concept of a number, which is a sequence of digits, optionally followed by a decimal point and a fractional part. Now, if we model this as just a Double in Swift, we lose information about the number: 1 and 1.0, for example, become indistinguishable. By modeling numbers that do contain a fractional part as a Double, and numbers that don't as an Int, we at least keep that information, which I think is often desirable in Swift.

To not lose any information we could model the numbers as a String (case number(String)), but that seems less convenient in practice. I think just separating numbers with and without any fractional part strikes a good balance of what information to keep about the numbers. This is at least how I'm thinking about this :)

case object([String: Self])
case string(String)
}
Expand All @@ -24,14 +24,56 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
JSONObject().map(.case(Output.object))
JSONArray().map(.case(Output.array))
JSONString().map(.case(Output.string))
Double.parser().map(.case(Output.number))
JSONNumber().pipe {
OneOf {
Rest().map(.string.lossless(Int.self)).map(.case(Output.integer))
Rest().map(.string.lossless(Double.self)).map(.case(Output.float))
}
}
Bool.parser().map(.case(Output.boolean))
"null".utf8.map { Output.null }
}
Whitespace()
}
}

/// Parses a single-character JSON escape sequence: a backslash followed by one of
/// `"`, `\`, `/`, `b`, `f`, `n`, `r` or `t`.
///
/// The output is the one-character string the escape stands for, e.g. the input `\n`
/// produces a newline, `\b` produces U+0008 and `\f` produces U+000C.
struct EscapedSingleChar: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, String> {
// Every escape sequence starts with a literal backslash.
"\\".utf8

// Exactly one escape character must follow; each alternative maps to its unescaped value.
OneOf {
"\"".utf8.map { "\"" }
"\\".utf8.map { "\\" }
"/".utf8.map { "/" }
// U+0008 (backspace) and U+000C (form feed) are spelled as Unicode scalar escapes.
"b".utf8.map { "\u{8}" }
"f".utf8.map { "\u{c}" }
"n".utf8.map { "\n" }
"r".utf8.map { "\r" }
"t".utf8.map { "\t" }
}
}
}

/// Parses a run of one or more bytes that may appear unescaped inside a JSON string
/// (as decided by `isUnescapedJSONStringByte`) and produces them as a `String`.
struct UnescapedString: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, String> {
// Parsing consumes at least one unescaped byte. Printing is overridden below to go
// through a `Prefix` whose length is exactly the number of bytes being printed, using
// the same predicate.
// NOTE(review): presumably the override exists because the open-ended `Prefix(1...)`
// cannot be used for printing as-is — confirm against the Parsing library.
Prefix(1...) { $0.isUnescapedJSONStringByte }
.printing { output, input in
try Prefix(output.count) { $0.isUnescapedJSONStringByte }.print(output, into: &input)
}
.map(.string)
}
}

/// Parses a JSON `\uXXXX` escape: a backslash, the letter `u`, and a prefix of four
/// hex-digit bytes, which is converted to a `UInt32` by the `.base16Int` conversion.
///
/// The parser itself does not check which range the resulting value falls in.
struct LiteralUnicodeCodePoint: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, UInt32> {
"\\".utf8
Parse {
"u".utf8
// Four bytes that all satisfy `isHexDigit`, interpreted as a base-16 integer.
Prefix(4) { $0.isHexDigit }.map(.base16Int)
}
}
}

struct JSONString: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, String> {
"\"".utf8
Expand All @@ -41,32 +83,54 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
string.map(String.init).reversed().makeIterator()
} element: {
OneOf {
Prefix(1) { $0.isUnescapedJSONStringByte }.map(.string)

Parse {
"\\".utf8

OneOf {
"\"".utf8.map { "\"" }
"\\".utf8.map { "\\" }
"/".utf8.map { "/" }
"b".utf8.map { "\u{8}" }
"f".utf8.map { "\u{c}" }
"n".utf8.map { "\n" }
"r".utf8.map { "\r" }
"t".utf8.map { "\t" }
ParsePrint(.unicode) {
Prefix(4) { $0.isHexDigit }
}
OneOf {
// surrogate pair
Parse(.surrogateCodePoint) {
LiteralUnicodeCodePoint()
.filter((0xD800 ... 0xDBFF).contains)
LiteralUnicodeCodePoint()
.filter((0xDC00 ... 0xDFFF).contains)
}

// single unicode scalar
LiteralUnicodeCodePoint()
}
.map(.codePointToString)

EscapedSingleChar()

UnescapedString()
}
} terminator: {
"\"".utf8
}
}
}

/// Recognizes the textual shape of a JSON number and outputs a `Substring.UTF8View`.
///
/// The accepted shape is, in order:
/// - an optional leading `-`,
/// - an integer part that is either a lone `0` or a non-zero digit followed by further digits,
/// - an optional fraction: `.` followed by one or more digits,
/// - an optional exponent: `e` or `E`, an optional `+`/`-` sign, and one or more digits.
///
/// NOTE(review): `Consumed` and `Digits` are not defined in this view; `Consumed` presumably
/// yields the slice of input matched by the parsers inside it — confirm.
struct JSONNumber: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, Substring.UTF8View> {
Consumed {
Optionally { "-".utf8 }

// Integer part: "0" on its own, or a non-zero first digit followed by any further digits.
OneOf {
"0".utf8
Parse {
Digits(1).filter { $0 != 0 }
Digits(0...)
}.map { _ in }
}

// Optional fractional part.
Optionally { ".".utf8; Digits(1...) }

// Optional exponent with an optional sign.
Optionally {
OneOf { "e".utf8; "E".utf8 }
Optionally { OneOf { "+".utf8; "-".utf8 } }
Digits(1...)
}
}
}
}

struct JSONObject: ParserPrinter {
var body: some ParserPrinter<Substring.UTF8View, [String: JSONValue.Output]> {
"{".utf8
Expand Down Expand Up @@ -109,12 +173,12 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
{
"hello": true,
"goodbye": 42.42,
"whatever": null,
"xs": [1, "hello", null, false],
"ys": {
"0": 2,
"1": "goodbye\n"
}
},
"\uD834\uDD1E": null
}
"""#
var jsonOutput: JSONValue.Output!
Expand All @@ -126,13 +190,13 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
jsonOutput
== .object([
"hello": .boolean(true),
"goodbye": .number(42.42),
"whatever": .null,
"xs": .array([.number(1), .string("hello"), .null, .boolean(false)]),
"goodbye": .float(42.42),
"xs": .array([.integer(1), .string("hello"), .null, .boolean(false)]),
"ys": .object([
"0": .number(2),
"0": .integer(2),
"1": .string("goodbye\n"),
]),
"𝄞": .null,
])
)
precondition(
Expand All @@ -141,9 +205,9 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
{\
"goodbye":42.42,\
"hello":true,\
"whatever":null,\
"xs":[1.0,"hello",null,false],\
"ys":{"0":2.0,"1":"goodbye\\n"}\
"xs":[1,"hello",null,false],\
"ys":{"0":2,"1":"goodbye\\n"},\
"𝄞":null\
}
"""
)
Expand All @@ -159,42 +223,77 @@ let jsonSuite = BenchmarkSuite(name: "JSON") { suite in
(objectOutput as! NSDictionary) == [
"hello": true,
"goodbye": 42.42,
"whatever": NSNull(),
"xs": [1, "hello", nil, false] as [Any?],
"ys": [
"0": 2,
"1": "goodbye\n",
] as [String: Any],
"𝄞": NSNull(),
]
)
}
#endif
}

extension UTF8.CodeUnit {
fileprivate var isHexDigit: Bool {
private extension UTF8.CodeUnit {
var isHexDigit: Bool {
(.init(ascii: "0") ... .init(ascii: "9")).contains(self)
|| (.init(ascii: "A") ... .init(ascii: "F")).contains(self)
|| (.init(ascii: "a") ... .init(ascii: "f")).contains(self)
}

fileprivate var isUnescapedJSONStringByte: Bool {
var isUnescapedJSONStringByte: Bool {
self != .init(ascii: "\"") && self != .init(ascii: "\\") && self >= .init(ascii: " ")
}
}

extension Conversion where Self == AnyConversion<Substring.UTF8View, String> {
fileprivate static var unicode: Self {
private extension Conversion where Self == AnyConversion<Substring.UTF8View, UInt32> {
  /// A conversion between the UTF-8 bytes of a hexadecimal number and its `UInt32` value.
  ///
  /// - Applying interprets the bytes as a base-16 integer and fails (`nil`) if they do not
  ///   form one.
  /// - Unapplying renders the value as exactly four lowercase hex digits, left-padded with
  ///   zeros. It fails (`nil`) for values above `0xFFFF`: such a value needs more than four
  ///   digits, and keeping only the last four would print a different code point.
  static var base16Int: Self {
    Self(
      apply: { UInt32(Substring($0), radix: 16) },
      unapply: { int in
        // Four hex digits can represent at most 0xFFFF; refuse anything larger instead of
        // silently truncating it.
        guard int <= 0xFFFF else { return nil }
        var utf8View = String(int, radix: 16)[...].utf8
        // Pad with three zeros so that even a single-digit value yields four digits below.
        utf8View.prepend(contentsOf: Array("000"[...].utf8))
        return utf8View.suffix(4)
      }
    )
  }
}

private extension Conversion where Self == AnyConversion<(UInt32, UInt32), UInt32> {
static var surrogateCodePoint: Self {
Self(
apply: {
UInt32(Substring($0), radix: 16)
.flatMap(UnicodeScalar.init)
.map(String.init)
apply: { (h, l) in
let a = (h - 0xD800) * 0x400
let b = (l - 0xDC00) + 0x10000
return a + b
},
unapply: {
$0.unicodeScalars.first
.map { String(UInt32($0), radix: 16)[...].utf8 }
unapply: { codePoint in
let h = (codePoint - 0x10000) / 0x400 + 0xD800
let l = (codePoint - 0x10000) % 0x400 + 0xDC00
return (h, l)
}
)
}
}

private extension Conversion where Self == AnyConversion<UInt32, String> {
/// A conversion between a `UInt32` code point and a `String`, built by composing
/// `.unicodeScalar` with `.unicodeScalarView.substring.string`.
static var codePointToString: Self {
Self(.unicodeScalar.map(.unicodeScalarView.substring.string))
}
}

private extension Conversion where Self == AnyConversion<UInt32, UnicodeScalar> {
  /// A conversion between a raw `UInt32` value and a `UnicodeScalar`.
  ///
  /// Applying uses `UnicodeScalar`'s failable initializer, so it yields `nil` when the value
  /// is not a valid Unicode scalar. Unapplying returns the scalar's numeric value.
  static var unicodeScalar: Self {
    Self(
      apply: { codePoint in UnicodeScalar(codePoint) },
      unapply: { scalar in UInt32(scalar) }
    )
  }
}

private extension Conversion where Self == AnyConversion<UnicodeScalar, Substring.UnicodeScalarView> {
  /// A conversion between a single `UnicodeScalar` and a one-element
  /// `Substring.UnicodeScalarView`.
  ///
  /// Applying wraps the scalar in a view. Unapplying succeeds only when the view holds
  /// exactly one scalar, and yields `nil` otherwise.
  static var unicodeScalarView: Self {
    Self(
      apply: { scalar in Substring.UnicodeScalarView([scalar]) },
      unapply: { view in
        guard view.count == 1 else { return nil }
        return view.first
      }
    )
  }
}