mirror of
https://github.com/lihop/godot-xterm.git
synced 2024-09-20 16:36:19 +02:00
8d76d3500c
With the exception of text_decoder.gd the code in these files follows the original so closely that it doesn't qualify as an original work and so there is nothing new to copyright. Instead, the original license text is kept with a note mentioning the port to GDScript.
269 lines
7.3 KiB
GDScript
269 lines
7.3 KiB
GDScript
# Copyright (c) 2020 The GodotXterm authors.
|
|
# Copyright (c) 2019 The xterm.js authors. All rights reserved.
|
|
# License MIT
|
|
extends Reference
|
|
|
|
# Convert a given to a utf8 PoolByteArray.
|
|
# The code for this function is based on the stackoverflow
|
|
# answer by user Schwern https://stackoverflow.com/a/42013984.
|
|
static func utf32_to_utf8(codepoint: int):
|
|
var utf8 = PoolByteArray([])
|
|
|
|
if codepoint <= 0x007F:
|
|
utf8.append(codepoint)
|
|
elif codepoint <= 0x07FF:
|
|
utf8.append(0b11000000 | codepoint >> 6 & 0b00011111)
|
|
utf8.append(0b10000000 | codepoint & 0b00111111)
|
|
elif codepoint <= 0xFFFF:
|
|
utf8.append(0b11100000 | codepoint >> 12 & 0b00001111)
|
|
utf8.append(0b10000000 | codepoint >> 6 & 0b00111111)
|
|
utf8.append(0b10000000 | codepoint & 0b00111111)
|
|
elif codepoint <= 0x10FFFF:
|
|
utf8.append(0b11110000 | codepoint >> 18 & 0b00000111)
|
|
utf8.append(0b10000000 | codepoint >> 12 & 0b00111111)
|
|
utf8.append(0b10000000 | codepoint >> 6 & 0b00111111)
|
|
utf8.append(0b10000000 | codepoint & 0b00111111)
|
|
else:
|
|
push_warning("Codepoint " + String(codepoint) + " is out of UTF-8 range")
|
|
|
|
return utf8
|
|
|
|
# Convert UTF32 codepoint into a String.
|
|
static func string_from_codepoint(codepoint: int):
|
|
var utf8 = utf32_to_utf8(codepoint)
|
|
return utf8.get_string_from_utf8()
|
|
|
|
# Covert UTF32 char codes into a String.
|
|
# Basically the same as `string_from_codepoint` but for multiple codepoints
|
|
# in a loop (which is a lot faster).
|
|
static func utf32_to_string(data: Array, start: int = 0, end: int = -1):
|
|
if end == -1:
|
|
end = data.size()
|
|
var result = ''
|
|
for i in range(start, end):
|
|
result += string_from_codepoint(data[i])
|
|
return result
|
|
|
|
# Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
|
|
class Utf8ToUtf32:
|
|
var interim = PoolByteArray()
|
|
|
|
func _init():
|
|
interim.resize(3)
|
|
|
|
# Clears interim bytes and resets decoder to clean state.
|
|
func clear():
|
|
for i in interim.size():
|
|
interim[i] = 0
|
|
|
|
# Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
|
|
# The methods assumes stream input and will store partly transmitted bytes
|
|
# and decode them with the next data chunk.
|
|
# Note: The method does no bound checks for target, therefore make sure
|
|
# the provided data chunk does not exceed the size of `target`.
|
|
# Returns the number of written codepoints in `target`.
|
|
func decode(input: PoolByteArray, target: Array):
|
|
var length = input.size()
|
|
|
|
if !length:
|
|
return 0
|
|
|
|
if length > target.size():
|
|
target.resize(length)
|
|
|
|
var size = 0
|
|
var byte1: int
|
|
var byte2: int
|
|
var byte3: int
|
|
var byte4: int
|
|
var codepoint = 0
|
|
var start_pos = 0
|
|
|
|
# handle leftover bytes
|
|
if interim[0]:
|
|
var discard_interim = false
|
|
var cp = interim[0]
|
|
cp &= 0x1F if (cp & 0xE0) == 0xC0 else 0x0F if (cp & 0xF0) == 0xE0 else 0x07
|
|
var pos = 1
|
|
var tmp = interim[pos] & 0x3F
|
|
while tmp && pos < 4:
|
|
cp <<= 6
|
|
cp |= tmp
|
|
pos += 1
|
|
tmp = interim[pos] & 0x3F if interim.size() < pos else 0
|
|
# missing bytes - read from input
|
|
var type = 2 if (interim[0] & 0xE0) == 0xC0 else 3 if (interim[0] & 0xF0) == 0xE0 else 4
|
|
var missing = type - pos
|
|
while start_pos < missing:
|
|
if start_pos >= length:
|
|
return 0
|
|
tmp = input[start_pos]
|
|
start_pos += 1
|
|
if (tmp & 0xC0) != 0x80:
|
|
# wrong continuation, discard interim bytes completely
|
|
start_pos -= 1
|
|
discard_interim = true
|
|
break
|
|
else:
|
|
# need to save so we can continue short inputs in next call
|
|
interim[pos + 1] = tmp
|
|
pos += 1
|
|
cp <<= 6
|
|
cp |= tmp & 0x3F
|
|
if not discard_interim:
|
|
# final test is type dependent
|
|
match type:
|
|
2:
|
|
if cp < 0x80:
|
|
# wrong starter byte
|
|
start_pos -= 1
|
|
else:
|
|
target[size] = cp
|
|
size += 1
|
|
3:
|
|
if cp < 0x0800 or (cp >= 0xD800 and cp <= 0xDFFF):
|
|
# illegal codepoint
|
|
pass
|
|
else:
|
|
target[size] = cp
|
|
size += 1
|
|
_:
|
|
if cp < 0x10000 or cp > 0x10FFFF:
|
|
# illegal codepoint
|
|
pass
|
|
else:
|
|
target[size] = cp
|
|
size += 1
|
|
clear()
|
|
|
|
# loop through input
|
|
var four_stop = length - 4
|
|
var i = start_pos
|
|
while i < length:
|
|
# ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
|
|
# This is a compromise between speed gain for ASCII
|
|
# and penalty for non ASCII:
|
|
# For best ASCII performance the char should be stored directly into target,
|
|
# but even a single attempt to write to target and compare afterwards
|
|
# penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
|
|
# which reduces ASCII performance by ~15%.
|
|
# This trial for ASCII reduces non ASCII performance by ~10% which seems acceptible
|
|
# compared to the gains.
|
|
# Note that this optimization only takes place for 4 consecutive ASCII chars,
|
|
# for any shorter it bails out. Worst case - all 4 bytes being read but
|
|
# thrown away due to the last being a non ASCII char (-10% performance).
|
|
while i < four_stop:
|
|
byte1 = input[i]
|
|
byte2 = input[i + 1]
|
|
byte3 = input[i + 2]
|
|
byte4 = input[i + 3]
|
|
if not (byte1 & 0x80) | (byte2 & 0x80) | (byte3 & 0x80) | (byte4 & 0x80):
|
|
target[size] = byte1
|
|
target[size+1] = byte2
|
|
target[size+2] = byte3
|
|
target[size+3] = byte4
|
|
size += 4
|
|
i += 4
|
|
else:
|
|
break
|
|
|
|
# reread byte1
|
|
byte1 = input[i]
|
|
i += 1
|
|
|
|
# 1 byte
|
|
if byte1 < 0x80:
|
|
target[size] = byte1
|
|
size += 1
|
|
|
|
# 2 bytes
|
|
elif (byte1 & 0xE0) == 0xC0:
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
return size
|
|
byte2 = input[i]
|
|
i+=1
|
|
if (byte2 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i-=1
|
|
continue
|
|
codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F)
|
|
if (codepoint < 0x80):
|
|
# wrong starter byte
|
|
i-=1
|
|
continue
|
|
target[size] = codepoint
|
|
size+=1
|
|
|
|
# 3 bytes
|
|
elif (byte1 & 0xF0) == 0xE0:
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
return size
|
|
byte2 = input[i]
|
|
i+=1
|
|
if (byte2 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i-=1
|
|
continue
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
interim[1] = byte2
|
|
return size
|
|
byte3 = input[i]
|
|
i+=1
|
|
if (byte3 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i-=1
|
|
continue
|
|
codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F)
|
|
if codepoint < 0x0800 or (codepoint >=0xD800 and codepoint <= 0xDFFF):
|
|
# illegal codepoint, no i-- here
|
|
continue
|
|
target[size] = codepoint
|
|
size+=1
|
|
|
|
# 4 bytes
|
|
elif (byte1 & 0xF8) == 0xF0:
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
return size
|
|
byte2 = input[i]
|
|
i += 1
|
|
if (byte2 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i -= 1
|
|
continue
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
interim[1] = byte2
|
|
return size
|
|
byte3 = input[i]
|
|
i += 1
|
|
if (byte3 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i -= 1
|
|
continue
|
|
if i >= length:
|
|
interim[0] = byte1
|
|
interim[1] = byte2
|
|
interim[2] = byte3
|
|
return size
|
|
byte4 = input[i]
|
|
i += 1
|
|
if (byte4 & 0xC0) != 0x80:
|
|
# wrong continuation
|
|
i -= 1
|
|
continue
|
|
codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F)
|
|
if codepoint < 0x010000 or codepoint > 0x10FFFF:
|
|
# illegal codepoint, no i-- here
|
|
continue
|
|
target[size] = codepoint
|
|
size += 1
|
|
else:
|
|
# illegal byte, just skip
|
|
pass
|
|
|
|
target.resize(size)
|
|
return size
|