godot-xterm/addons/godot_xterm/input/text_decoder.gd
2020-05-17 17:32:06 +07:00

266 lines
7.1 KiB
GDScript

# Copyright (c) 2020 The GodotXterm authors.
# Copyright (c) 2019 The xterm.js authors. All rights reserved.
# License MIT
extends Reference
# Convert a given UTF-32 codepoint to a UTF-8 encoded PoolByteArray.
# The code for this function is based on the stackoverflow
# answer by user Schwern https://stackoverflow.com/a/42013984.
static func utf32_to_utf8(codepoint: int):
	# Encodes a single Unicode codepoint as UTF-8.
	#
	# codepoint: the scalar value to encode (0 .. 0x10FFFF).
	# Returns a PoolByteArray holding 1 to 4 bytes. For a value outside
	# 0 .. 0x10FFFF (including negative values) a warning is pushed and an
	# empty PoolByteArray is returned.
	var utf8 = PoolByteArray([])
	if codepoint < 0 or codepoint > 0x10FFFF:
		# Negative values used to fall into the one-byte branch and were
		# appended as a raw byte; treat them like any other out-of-range value.
		push_warning("Codepoint " + String(codepoint) + " is out of UTF-8 range")
	elif codepoint <= 0x007F:
		# 0xxxxxxx
		utf8.append(codepoint)
	elif codepoint <= 0x07FF:
		# 110xxxxx 10xxxxxx
		utf8.append(0b11000000 | ((codepoint >> 6) & 0b00011111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	elif codepoint <= 0xFFFF:
		# 1110xxxx 10xxxxxx 10xxxxxx
		utf8.append(0b11100000 | ((codepoint >> 12) & 0b00001111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	else:
		# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		utf8.append(0b11110000 | ((codepoint >> 18) & 0b00000111))
		utf8.append(0b10000000 | ((codepoint >> 12) & 0b00111111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	return utf8
# Convert UTF32 char codes into a String.
# Basically the same as `char` but for multiple codepoints
# in a loop (which is a lot faster).
static func utf32_to_string(data: Array, start: int = 0, end: int = -1):
	# Builds a String from the codepoints data[start] .. data[end - 1].
	#
	# data:  array of integer codepoints.
	# start: index of the first codepoint to convert.
	# end:   index one past the last codepoint; -1 means "up to data.size()".
	# Returns the concatenation of char() applied to each selected element.
	var stop = data.size() if end == -1 else end
	var text = ''
	var idx = start
	while idx < stop:
		text += char(data[idx])
		idx += 1
	return text
# Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
class Utf8ToUtf32:
	# Streaming UTF-8 decoder: turns byte chunks into UTF-32 codepoints and
	# carries an incomplete trailing sequence over to the next chunk.

	# Bytes of a multi-byte sequence that was cut off at the end of the
	# previous chunk. Slot 0 holds the lead byte (0 means nothing is pending),
	# slots 1 and 2 hold the continuation bytes that have already arrived.
	var interim = PoolByteArray()


	func _init():
		interim.resize(3)
		# decode() treats a non-zero slot 0 as a pending sequence, so make sure
		# the buffer starts out zeroed instead of depending on resize().
		clear()


	# Clears interim bytes and resets decoder to clean state.
	func clear():
		for i in interim.size():
			interim[i] = 0


	# Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
	# The method assumes stream input: a multi-byte sequence that is cut off at
	# the end of `input` is stored in `interim` and completed with the bytes of
	# the next call.
	# `target` is grown to `input.size()` when it is smaller than that. When the
	# whole input was consumed, `target` is truncated to the number of decoded
	# codepoints; when the call returns early because the input ends inside a
	# sequence, `target` is NOT truncated - always rely on the return value.
	# Invalid bytes, overlong encodings, surrogates and values above 0x10FFFF
	# are skipped.
	# Returns the number of written codepoints in `target`.
	func decode(input: PoolByteArray, target: Array):
		var length = input.size()
		if !length:
			return 0

		if length > target.size():
			target.resize(length)

		var size = 0
		var byte1: int
		var byte2: int
		var byte3: int
		var byte4: int
		var codepoint = 0
		var start_pos = 0

		# handle leftover bytes
		if interim[0]:
			var discard_interim = false
			var lead = interim[0]

			# Sequence length and payload bits of the lead byte.
			var seq_len = 4
			var cp = lead & 0x07
			if (lead & 0xE0) == 0xC0:
				seq_len = 2
				cp = lead & 0x1F
			elif (lead & 0xF0) == 0xE0:
				seq_len = 3
				cp = lead & 0x0F

			# Fold in the continuation bytes that were already stored.
			# Presence is tested on the raw byte, not on its payload bits: a
			# stored continuation byte is always 0x80..0xBF, and 0x80 carries
			# a payload of 0 but is still a received byte.
			var pos = 1
			while pos < interim.size() and interim[pos] != 0:
				cp = (cp << 6) | (interim[pos] & 0x3F)
				pos += 1

			# missing bytes - read from input
			var missing = seq_len - pos
			while start_pos < missing:
				if start_pos >= length:
					# Still incomplete; interim keeps what we have so far.
					return 0
				var tmp = input[start_pos]
				if (tmp & 0xC0) != 0x80:
					# wrong continuation, discard interim bytes completely
					# (the offending byte is left for the main loop)
					discard_interim = true
					break
				start_pos += 1
				# need to save so we can continue short inputs in next call;
				# the last byte of a 4-byte sequence has no slot and needs none
				if pos < interim.size():
					interim[pos] = tmp
				pos += 1
				cp = (cp << 6) | (tmp & 0x3F)

			if not discard_interim:
				# final test is type dependent
				match seq_len:
					2:
						if cp < 0x80:
							# wrong starter byte
							start_pos -= 1
						else:
							target[size] = cp
							size += 1
					3:
						if cp < 0x0800 or (cp >= 0xD800 and cp <= 0xDFFF):
							# illegal codepoint
							pass
						else:
							target[size] = cp
							size += 1
					_:
						if cp < 0x10000 or cp > 0x10FFFF:
							# illegal codepoint
							pass
						else:
							target[size] = cp
							size += 1
			clear()

		# loop through input
		var four_stop = length - 4
		var i = start_pos
		while i < length:
			# ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
			# This is a compromise between speed gain for ASCII
			# and penalty for non ASCII:
			# For best ASCII performance the char should be stored directly into target,
			# but even a single attempt to write to target and compare afterwards
			# penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
			# which reduces ASCII performance by ~15%.
			# This trial for ASCII reduces non ASCII performance by ~10% which seems acceptable
			# compared to the gains.
			# Note that this optimization only takes place for 4 consecutive ASCII chars,
			# for any shorter it bails out. Worst case - all 4 bytes being read but
			# thrown away due to the last being a non ASCII char (-10% performance).
			while i < four_stop:
				byte1 = input[i]
				byte2 = input[i + 1]
				byte3 = input[i + 2]
				byte4 = input[i + 3]
				if ((byte1 | byte2 | byte3 | byte4) & 0x80) != 0:
					break
				target[size] = byte1
				target[size + 1] = byte2
				target[size + 2] = byte3
				target[size + 3] = byte4
				size += 4
				i += 4

			# reread byte1
			byte1 = input[i]
			i += 1

			# 1 byte
			if byte1 < 0x80:
				target[size] = byte1
				size += 1

			# 2 bytes
			elif (byte1 & 0xE0) == 0xC0:
				if i >= length:
					interim[0] = byte1
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F)
				if codepoint < 0x80:
					# wrong starter byte
					i -= 1
					continue
				target[size] = codepoint
				size += 1

			# 3 bytes
			elif (byte1 & 0xF0) == 0xE0:
				if i >= length:
					interim[0] = byte1
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					return size
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F)
				if codepoint < 0x0800 or (codepoint >= 0xD800 and codepoint <= 0xDFFF):
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1

			# 4 bytes
			elif (byte1 & 0xF8) == 0xF0:
				if i >= length:
					interim[0] = byte1
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					return size
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					interim[2] = byte3
					return size
				byte4 = input[i]
				i += 1
				if (byte4 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F)
				if codepoint < 0x010000 or codepoint > 0x10FFFF:
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1

			else:
				# illegal byte, just skip
				pass

		target.resize(size)
		return size