Skip to content

Commit

Permalink
UTF-16
Browse files Browse the repository at this point in the history
  • Loading branch information
LunaTheFoxgirl committed Jul 22, 2024
1 parent b428924 commit bf3a13b
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 13 deletions.
10 changes: 9 additions & 1 deletion source/numem/unicode/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,16 @@ alias codepoint = uint;
Validates whether the codepoint is within spec
*/
bool validate(codepoint code) {
    // A valid codepoint must lie inside the Unicode code space (<= 0x10FFFF)
    // and must not fall in the UTF-16 surrogate block, which is reserved for
    // the UTF-16 encoding form and is never a character on its own.
    // (Previously an earlier, range-only return made the surrogate check unreachable.)
    return code <= 0x10FFFF && !hasSurrogatePairs(code);
}

/**
    Tells whether the codepoint lies inside the UTF-16 surrogate block
    (0xD800 through 0xDFFF, inclusive), i.e. whether surrogate data was
    mistakenly stored in it.

    Params:
        code = The codepoint to inspect.

    Returns:
        true if the value is a surrogate, false otherwise.
*/
bool hasSurrogatePairs(codepoint code) {
    enum codepoint surrogateFirst = 0x0000D800;
    enum codepoint surrogateLast  = 0x0000DFFF;

    // Anything below the block is fine; otherwise it is a surrogate
    // exactly when it does not exceed the end of the block.
    if (code < surrogateFirst)
        return false;
    return code <= surrogateLast;
}

/**
Validates whether the codepoint is within spec
*/
Expand Down
165 changes: 165 additions & 0 deletions source/numem/unicode/utf16.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
module numem.unicode.utf16;
import numem.unicode;
import numem.mem.string;
import numem.mem.vector;

nothrow @nogc:

private {

    /// Mask selecting the 6 tag bits that identify a surrogate unit
    enum ushort utf16_smask = 0xFC00;

    /// Mask selecting the 10 payload bits carried by a surrogate unit
    enum ushort utf16_dmask = 0x03FF;

    /// Tag bits of a leading (high) surrogate
    enum wchar utf16_lead = 0xD800;

    /// Tag bits of a trailing (low) surrogate
    enum wchar utf16_trail = 0xDC00;
}

/**
    Checks whether the given pair of units forms a valid UTF-16 sequence.

    A first unit outside the surrogate block is accepted on its own
    (the second unit is not looked at). Otherwise the pair must be a
    leading surrogate followed by a trailing surrogate.
*/
bool validate(wchar[2] c) {
    immutable wchar first = c[0];
    immutable wchar second = c[1];

    // Non-surrogate unit: a complete character by itself.
    if (first <= 0xD7FF || first >= 0xE000)
        return true;

    // Surrogate: only valid as lead + trail.
    immutable bool leadOk = (first & utf16_smask) == utf16_lead;
    return leadOk && (second & utf16_smask) == utf16_trail;
}

/**
    Gets the number of utf-16 units making up the character that starts
    with the given unit.

    Returns:
        1 for a unit outside the surrogate block,
        2 for a leading surrogate,
        0 for anything else (a trailing surrogate cannot start a character).
*/
size_t getLength(wchar c) {
    immutable bool inSurrogateBlock = c > 0xD7FF && c < 0xE000;
    if (!inSurrogateBlock)
        return 1;

    return (c & utf16_smask) == utf16_lead ? 2 : 0;
}

@("UTF-16 char len")
unittest {
    // ASCII occupies a single unit.
    assert('a'.getLength == 1);

    // A BMP character outside the surrogate block also occupies a single unit.
    // Written as an escape so the test does not depend on source-file encoding.
    assert('\u3042'.getLength == 1);

    // A leading surrogate announces a two-unit sequence.
    assert(utf16_lead.getLength() == 2);

    // Malformed: a trailing surrogate cannot be the first unit.
    assert(utf16_trail.getLength() == 0);
}

/**
    Gets the number of utf-16 units needed to encode the given codepoint.

    Returns:
        1 for codepoints in the BMP outside the surrogate block,
        2 for codepoints from 0x010000 through 0x10FFFF,
        0 if the codepoint can't be represented (surrogates and
        values above 0x10FFFF).
*/
size_t getUTF16Length(codepoint code) {
    // Beyond the Unicode code space.
    if (code > 0x10FFFF) return 0;

    // Supplementary planes need a surrogate pair.
    if (code >= 0x010000) return 2;

    // Surrogate values are not encodable characters.
    if (code >= 0xD800 && code <= 0xDFFF) return 0;

    return 1;
}

@("UTF-16 codepoint len")
unittest {
    // Single unit inside the BMP.
    assert(getUTF16Length(0xF4) == 1);

    // Last valid codepoint requires a surrogate pair.
    assert(getUTF16Length(0x10FFFF) == 2);

    // Outside the Unicode code space: not representable.
    assert(getUTF16Length(0x11FFFF) == 0);
}

/**
    Decodes a single utf-16 character.

    Params:
        chr  = Up to two utf-16 units; chr[1] is only consulted when
               chr[0] is a leading surrogate.
        read = Receives the number of units consumed (always 1 or 2).

    Returns:
        The decoded codepoint, or unicodeReplacementCharacter when the
        sequence is malformed (a stray trailing surrogate, or a leading
        surrogate that is not followed by a trailing one). In the
        malformed case exactly one unit is reported as consumed so the
        caller can resynchronise on the next unit.
*/
codepoint decode(wchar[2] chr, ref size_t read) {
    read = chr[0].getLength();

    // Plain BMP unit: the unit value is the codepoint.
    if (read == 1)
        return cast(codepoint)chr[0];

    // Surrogate pair: the lead carries the high 10 bits, the trail the
    // low 10 bits of (codepoint - 0x10000).
    if (read == 2 && (chr[1] & utf16_smask) == utf16_trail) {
        codepoint high = chr[0] & utf16_dmask;
        codepoint low = chr[1] & utf16_dmask;
        return 0x10000 + ((high << 10) | low);
    }

    // Malformed input: consume only the offending unit.
    read = 1;
    return unicodeReplacementCharacter;
}

/**
    Decodes a utf-16 string into a sequence of codepoints.

    Malformed input never aborts decoding: a unit that cannot start a
    character, or a leading surrogate whose pair would run past the end
    of the string, produces unicodeReplacementCharacter and decoding
    continues with the following unit.
*/
UnicodeSequence decode(nwstring str) {
    UnicodeSequence code;
    immutable size_t total = str.size();

    size_t i = 0;
    while (i < total) {

        // Validate length, add FFFD if invalid or truncated.
        size_t clen = str[i].getLength();
        if (clen == 0 || clen > total - i) {
            code ~= unicodeReplacementCharacter;
            i++;

            // Do not fall through: there is nothing valid to slice here.
            continue;
        }

        wchar[2] txt;
        txt[0..clen] = str[i..i+clen];

        // The single-character decoder reports through clen how many
        // units it actually consumed; advance by that amount.
        code ~= txt.decode(clen);
        i += clen;
    }

    return code;
}

@("UTF-16 decode string")
unittest {
    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];

    // The final character is the fullwidth exclamation mark (U+FF01); it is
    // written as an escape so it cannot be confused with ASCII '!'.
    assert(decode(nwstring("こんにちは世界\uFF01"w))[0..$] == seq1);

    // Same text with the second character replaced by U+FFFD, matching seq2
    // element for element.
    assert(decode(nwstring("こ\uFFFDにちは世界\uFF01"w))[0..$] == seq2);
}

/**
    Encodes a slice of unicode codepoints to UTF-16.

    Codepoints in the BMP are emitted as a single unit; codepoints from
    0x10000 through 0x10FFFF are emitted as a leading/trailing surrogate
    pair. Codepoints that cannot be represented (surrogate values and
    values above 0x10FFFF) are skipped.
*/
nwstring encode(UnicodeSlice slice) {
    nwstring out_;

    for (size_t i = 0; i < slice.length; i++) {
        immutable codepoint cp = slice[i];
        immutable size_t clen = cp.getUTF16Length();

        if (clen == 1) {

            // BMP: the codepoint is the unit.
            out_ ~= cast(wchar)cp;
        } else if (clen == 2) {

            // Supplementary plane: split (cp - 0x10000) into two 10-bit
            // halves; the high half goes in the lead surrogate and the
            // low half in the trail surrogate.
            immutable codepoint c = cp - 0x10000;

            wchar[2] txt;
            txt[0] = cast(wchar)(0xD800 + (c >> 10));
            txt[1] = cast(wchar)(0xDC00 + (c & 0x3FF));
            out_ ~= cast(wstring)txt[0..$];
        }

        // clen == 0: unrepresentable codepoint, skipped.
    }

    return out_;
}

/**
    Encodes a whole unicode sequence to UTF-16 by forwarding a slice
    spanning all of its codepoints to the slice-based encoder.
*/
nwstring encode(UnicodeSequence sequence) {
    auto everything = sequence[0..$];
    return encode(everything);
}

@("UTF-16 encode")
unittest {
    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];

    // The final character is the fullwidth exclamation mark (U+FF01); it is
    // written as an escape so it cannot be confused with ASCII '!'.
    assert(encode(seq1) == "こんにちは世界\uFF01"w);

    // seq2 keeps the first character and carries U+FFFD in second position.
    assert(encode(seq2) == "こ\uFFFDにちは世界\uFF01"w);
}
56 changes: 44 additions & 12 deletions source/numem/unicode/utf8.d
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import numem.unicode;
import numem.mem.string;
import numem.mem.vector;

// For some reason D really wants this import.
import numem.unicode : validate;

@nogc nothrow:

private {
Expand Down Expand Up @@ -170,6 +173,28 @@ unittest {
assert((0b10010101).getLength() == 0); // Malformed leading byte
}

/**
    Gets how many utf-8 units (bytes) are needed to encode the specified codepoint

    Returns 0 if the codepoint can't be represented; this covers values
    above 0x10FFFF as well as the surrogate block (0xD800 through 0xDFFF),
    which has no well-formed UTF-8 encoding.
*/
size_t getUTF8Length(codepoint code) {
    if (code <= 0x7F) return 1;
    if (code <= 0x07FF) return 2;

    // Surrogates sit inside the 3-byte range but are not encodable.
    if (code >= 0xD800 && code <= 0xDFFF) return 0;

    if (code <= 0xFFFF) return 3;
    if (code <= 0x10FFFF) return 4;
    return 0;
}

@("UTF-8 codepoint len")
unittest {
    // One sample per encoded length.
    assert(getUTF8Length(0x1) == 1);
    assert(getUTF8Length(0xF4) == 2);
    assert(getUTF8Length(0x0801) == 3);
    assert(getUTF8Length(0x010001) == 4);

    // Outside the Unicode code space: not representable.
    assert(getUTF8Length(0x11FFFF) == 0);
}


/**
Decodes a UTF-8 character
Expand Down Expand Up @@ -282,43 +307,50 @@ unittest {
}

/**
Encodes a series of unicode sequences to UTF-8
Encodes a series of unicode codepoints to UTF-8
*/
nstring encode(UnicodeSlice sequence) {
nstring encode(UnicodeSlice slice) {
nstring out_;

size_t i = 0;
while(i < sequence.length) {
while(i < slice.length) {
ptrdiff_t count = 0;
ptrdiff_t offset = 0;
if (sequence[i] <= utf8_ascii) {

// Skip invalid codepoints.
if (!slice[i].validate()) {
i++;
continue;
}

if (slice[i] <= utf8_ascii) {

// Single-byte ascii
out_ ~= cast(char)sequence[i++];
out_ ~= cast(char)slice[i++];
continue;
} else if (sequence[i] >= 0x0080 && sequence[i] <= 0x07FF) {
} else if (slice[i] >= 0x0080 && slice[i] <= 0x07FF) {

// 2 byte
count = 1;
offset = 0xC0;
} else if (sequence[i] >= 0x0800 && sequence[i] <= 0xFFFF) {
} else if (slice[i] >= 0x0800 && slice[i] <= 0xFFFF) {

// 2 byte
count = 2;
offset = 0xE0;
} else if (sequence[i] >= 0x10000 && sequence[i] <= 0x10FFFF) {
} else if (slice[i] >= 0x10000 && slice[i] <= 0x10FFFF) {

// 2 byte
count = 3;
offset = 0xF0;
}


// The magic where things get stitched back together.
char[4] bytes;
bytes[0] = cast(ubyte)((sequence[i] >> (6 * count)) + offset);
bytes[0] = cast(ubyte)((slice[i] >> (6 * count)) + offset);
size_t ix = 1;
while (count > 0) {
size_t temp = sequence[i] >> (6 * (count - 1));
size_t temp = slice[i] >> (6 * (count - 1));
bytes[ix++] = 0x80 | (temp & 0x3F);
count--;
}
Expand All @@ -331,7 +363,7 @@ nstring encode(UnicodeSlice sequence) {
}

/**
Encodes a series of unicode sequences to UTF-8
Encodes a series of unicode codepoints to UTF-8
*/
nstring encode(UnicodeSequence sequence) {
return encode(sequence[0..$]);
Expand Down

0 comments on commit bf3a13b

Please sign in to comment.