godot-xterm/addons/godot_xterm/input/text_decoder.gd
Leroy Hopson 8d76d3500c Update license text in file headers
With the exception of text_decoder.gd the code in these files follows
the original so closely that it doesn't qualify as an original work
and so there is nothing new to copyright.

Instead, the original license text is kept with a note mentioning the
port to GDScript.
2020-05-11 04:05:37 +12:00

269 lines
7.3 KiB
GDScript

# Copyright (c) 2020 The GodotXterm authors.
# Copyright (c) 2019 The xterm.js authors. All rights reserved.
# License MIT
extends Reference
# Convert a given UTF-32 codepoint to a UTF-8 encoded PoolByteArray.
#
# Returns the 1 to 4 byte encoding of `codepoint`. A codepoint that is negative
# or greater than 0x10FFFF cannot be encoded: a warning is pushed and an empty
# PoolByteArray is returned.
# NOTE: values in the surrogate range (0xD800-0xDFFF) are not rejected here;
# they are encoded with the regular 3-byte layout.
#
# The code for this function is based on the stackoverflow
# answer by user Schwern https://stackoverflow.com/a/42013984.
static func utf32_to_utf8(codepoint: int):
	var utf8 = PoolByteArray([])

	if codepoint < 0 or codepoint > 0x10FFFF:
		# Negative values must be caught before the range checks below,
		# otherwise they would fall into the single-byte branch.
		push_warning("Codepoint " + String(codepoint) + " is out of UTF-8 range")
	elif codepoint <= 0x007F:
		# 0xxxxxxx
		utf8.append(codepoint)
	elif codepoint <= 0x07FF:
		# 110xxxxx 10xxxxxx
		utf8.append(0b11000000 | ((codepoint >> 6) & 0b00011111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	elif codepoint <= 0xFFFF:
		# 1110xxxx 10xxxxxx 10xxxxxx
		utf8.append(0b11100000 | ((codepoint >> 12) & 0b00001111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	else:
		# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		utf8.append(0b11110000 | ((codepoint >> 18) & 0b00000111))
		utf8.append(0b10000000 | ((codepoint >> 12) & 0b00111111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))

	return utf8
# Build a String from one UTF-32 codepoint by encoding it with
# `utf32_to_utf8` and decoding the resulting bytes as UTF-8.
static func string_from_codepoint(codepoint: int):
	return utf32_to_utf8(codepoint).get_string_from_utf8()
# Convert UTF-32 char codes into a String.
# Each element of `data` in the half-open index range [start, end) is passed
# to `string_from_codepoint` and the pieces are concatenated in order.
# An `end` of -1 (the default) means "up to the end of `data`".
static func utf32_to_string(data: Array, start: int = 0, end: int = -1):
	var stop = data.size() if end == -1 else end
	var result = ''
	var idx = start
	while idx < stop:
		result += string_from_codepoint(data[idx])
		idx += 1
	return result
# Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
#
# The decoder is a streaming decoder: a multi-byte sequence that is cut off at
# the end of one chunk is remembered and completed with the next chunk.
class Utf8ToUtf32:
	# Bytes of an incomplete multi-byte sequence carried over from the previous
	# call to `decode`. interim[0] is the lead byte (0 means nothing is pending),
	# interim[1] and interim[2] are the continuation bytes received so far
	# (0 means not received yet; real continuation bytes are always >= 0x80).
	var interim = PoolByteArray()

	func _init():
		interim.resize(3)
		# decode() treats a non-zero interim[0] as pending state, so zero the
		# bytes explicitly instead of relying on what resize() leaves behind.
		clear()

	# Clears interim bytes and resets decoder to clean state.
	func clear():
		for i in interim.size():
			interim[i] = 0

	# Truncates `target` to the `size` codepoints written and returns `size`.
	func _finish(target: Array, size: int):
		target.resize(size)
		return size

	# Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
	# The method assumes stream input and will store partly transmitted bytes
	# and decode them with the next data chunk.
	# `target` is grown to `input.size()` if it is smaller, and is truncated to
	# the number of written codepoints before returning. The only exception is
	# an empty `input`, for which `target` is left untouched.
	# Malformed input (wrong continuation bytes, overlong encodings, surrogates,
	# values above 0x10FFFF, stray bytes) is skipped without producing output.
	# Returns the number of written codepoints in `target`.
	func decode(input: PoolByteArray, target: Array):
		var length = input.size()
		if !length:
			return 0

		# At most one codepoint is produced per input byte.
		if length > target.size():
			target.resize(length)

		var size = 0
		var byte1: int
		var byte2: int
		var byte3: int
		var byte4: int
		var codepoint = 0
		var start_pos = 0

		# handle leftover bytes
		if interim[0]:
			var discard_interim = false
			var lead = interim[0]

			# Sequence length and payload bits of the stored lead byte.
			var type = 4
			var cp = lead & 0x07
			if (lead & 0xE0) == 0xC0:
				type = 2
				cp = lead & 0x1F
			elif (lead & 0xF0) == 0xE0:
				type = 3
				cp = lead & 0x0F

			# Fold in the continuation bytes that were already stored.
			# Presence is tested on the whole byte: a stored 0x80 has zero
			# payload bits but still counts as received.
			var pos = 1
			while pos < interim.size() and interim[pos] != 0:
				cp = (cp << 6) | (interim[pos] & 0x3F)
				pos += 1

			# missing bytes - read from input
			var missing = type - pos
			var tmp = 0
			while start_pos < missing:
				if start_pos >= length:
					# Still incomplete; interim keeps what has been received.
					return _finish(target, 0)
				tmp = input[start_pos]
				start_pos += 1
				if (tmp & 0xC0) != 0x80:
					# wrong continuation, discard interim bytes completely
					# and let the main loop look at this byte again
					start_pos -= 1
					discard_interim = true
					break
				# need to save so we can continue short inputs in next call;
				# the final byte of a 4-byte sequence has no slot and needs none
				if pos < interim.size():
					interim[pos] = tmp
				pos += 1
				cp = (cp << 6) | (tmp & 0x3F)

			if not discard_interim:
				# final test is type dependent
				match type:
					2:
						if cp < 0x80:
							# wrong starter byte
							start_pos -= 1
						else:
							target[size] = cp
							size += 1
					3:
						if cp < 0x0800 or (cp >= 0xD800 and cp <= 0xDFFF):
							# illegal codepoint
							pass
						else:
							target[size] = cp
							size += 1
					_:
						if cp < 0x10000 or cp > 0x10FFFF:
							# illegal codepoint
							pass
						else:
							target[size] = cp
							size += 1
			clear()

		# loop through input
		var four_stop = length - 4
		var i = start_pos
		while i < length:
			# ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
			# The bytes are loaded into byteX first and only written to target
			# once all four turned out to be ASCII; for any shorter run it
			# bails out and the bytes are handled one by one below.
			while i < four_stop:
				byte1 = input[i]
				byte2 = input[i + 1]
				byte3 = input[i + 2]
				byte4 = input[i + 3]
				if ((byte1 | byte2 | byte3 | byte4) & 0x80) == 0:
					target[size] = byte1
					target[size + 1] = byte2
					target[size + 2] = byte3
					target[size + 3] = byte4
					size += 4
					i += 4
				else:
					break

			# reread byte1
			byte1 = input[i]
			i += 1

			# 1 byte
			if byte1 < 0x80:
				target[size] = byte1
				size += 1

			# 2 bytes
			elif (byte1 & 0xE0) == 0xC0:
				if i >= length:
					interim[0] = byte1
					return _finish(target, size)
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F)
				if codepoint < 0x80:
					# wrong starter byte
					i -= 1
					continue
				target[size] = codepoint
				size += 1

			# 3 bytes
			elif (byte1 & 0xF0) == 0xE0:
				if i >= length:
					interim[0] = byte1
					return _finish(target, size)
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					return _finish(target, size)
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F)
				if codepoint < 0x0800 or (codepoint >= 0xD800 and codepoint <= 0xDFFF):
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1

			# 4 bytes
			elif (byte1 & 0xF8) == 0xF0:
				if i >= length:
					interim[0] = byte1
					return _finish(target, size)
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					return _finish(target, size)
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					interim[2] = byte3
					return _finish(target, size)
				byte4 = input[i]
				i += 1
				if (byte4 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F)
				if codepoint < 0x010000 or codepoint > 0x10FFFF:
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1
			else:
				# illegal byte, just skip
				pass

		return _finish(target, size)