# Copyright (c) 2020 The GodotXterm authors.
# Copyright (c) 2019 The xterm.js authors. All rights reserved.
# License MIT
extends Reference


# Convert a given UTF32 codepoint to a UTF8 encoded PoolByteArray.
# The code for this function is based on the stackoverflow
# answer by user Schwern https://stackoverflow.com/a/42013984.
#
# Returns 1 to 4 bytes for codepoints in the range 0..0x10FFFF.
# For any other value (negative or above 0x10FFFF) a warning is pushed
# and an empty PoolByteArray is returned.
static func utf32_to_utf8(codepoint: int):
	var utf8 = PoolByteArray([])

	if codepoint < 0:
		# Negative values would otherwise pass the `<= 0x7F` test below
		# and be appended as a meaningless byte.
		push_warning("Codepoint " + String(codepoint) + " is out of UTF-8 range")
	elif codepoint <= 0x007F:
		utf8.append(codepoint)
	elif codepoint <= 0x07FF:
		utf8.append(0b11000000 | ((codepoint >> 6) & 0b00011111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	elif codepoint <= 0xFFFF:
		utf8.append(0b11100000 | ((codepoint >> 12) & 0b00001111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	elif codepoint <= 0x10FFFF:
		utf8.append(0b11110000 | ((codepoint >> 18) & 0b00000111))
		utf8.append(0b10000000 | ((codepoint >> 12) & 0b00111111))
		utf8.append(0b10000000 | ((codepoint >> 6) & 0b00111111))
		utf8.append(0b10000000 | (codepoint & 0b00111111))
	else:
		push_warning("Codepoint " + String(codepoint) + " is out of UTF-8 range")

	return utf8


# Convert a UTF32 codepoint into a String.
# The codepoint is encoded with `utf32_to_utf8` and the resulting bytes
# are decoded with `PoolByteArray.get_string_from_utf8`.
static func string_from_codepoint(codepoint: int):
	var utf8 = utf32_to_utf8(codepoint)
	return utf8.get_string_from_utf8()


# Convert UTF32 char codes into a String.
# Applies `string_from_codepoint` to every element of `data` in the
# half-open index range [start, end) and concatenates the results.
# `end` == -1 (the default) means "up to the end of `data`"; an `end`
# beyond the end of `data` is clamped to `data.size()`.
static func utf32_to_string(data: Array, start: int = 0, end: int = -1):
	if end == -1 or end > data.size():
		end = data.size()

	var result = ''
	for i in range(start, end):
		result += string_from_codepoint(data[i])
	return result


# Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
class Utf8ToUtf32:
	# Bytes of a multi-byte sequence that was cut off at the end of the
	# previous chunk. interim[0] is the lead byte (0 when nothing is
	# pending), followed by the continuation bytes received so far.
	# Unused slots are 0.
	var interim = PoolByteArray()

	func _init():
		interim.resize(3)

	# Clears interim bytes and resets decoder to clean state.
	func clear():
		for i in interim.size():
			interim[i] = 0

	# Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
	# The method assumes stream input: bytes of a sequence that is cut off
	# at the end of `input` are stored in `interim` and decoded together
	# with the next data chunk.
	# `target` is grown to `input.size()` elements when it is smaller, and
	# is resized to the number of written codepoints before returning.
	# Malformed sequences (wrong continuation byte, overlong encoding,
	# surrogate or out of range codepoint, illegal lead byte) are skipped.
	# Returns the number of written codepoints in `target`.
	func decode(input: PoolByteArray, target: Array):
		var length = input.size()

		if !length:
			return 0

		if length > target.size():
			target.resize(length)

		var size = 0
		var byte1: int
		var byte2: int
		var byte3: int
		var byte4: int
		var codepoint = 0
		var start_pos = 0

		# handle leftover bytes
		if interim[0] != 0:
			var discard_interim = false
			var lead = interim[0]

			# Total sequence length and payload bits of the lead byte.
			var seq_len = 4
			var cp = lead & 0x07
			if (lead & 0xE0) == 0xC0:
				seq_len = 2
				cp = lead & 0x1F
			elif (lead & 0xF0) == 0xE0:
				seq_len = 3
				cp = lead & 0x0F

			# Fold in the continuation bytes already stored. Stored bytes
			# are validated continuation bytes (>= 0x80), so a raw value
			# of 0 reliably marks an empty slot, even for a stored 0x80
			# whose payload bits are all zero.
			var pos = 1
			while pos < interim.size() and interim[pos] != 0:
				cp = (cp << 6) | (interim[pos] & 0x3F)
				pos += 1

			# missing bytes - read from input
			var missing = seq_len - pos
			while start_pos < missing:
				if start_pos >= length:
					# Input exhausted again; the bytes saved below are
					# kept in `interim` for the next call.
					target.resize(size)
					return size
				var tmp = input[start_pos]
				start_pos += 1
				if (tmp & 0xC0) != 0x80:
					# wrong continuation, discard interim bytes completely
					start_pos -= 1
					discard_interim = true
					break
				else:
					# need to save so we can continue short inputs in next call
					# (nothing to save once all three slots are filled, the
					# byte just read then completes the sequence)
					if pos < interim.size():
						interim[pos] = tmp
					pos += 1
					cp <<= 6
					cp |= tmp & 0x3F

			if not discard_interim:
				# final test is type dependent
				match seq_len:
					2:
						if cp < 0x80:
							# wrong starter byte
							start_pos -= 1
						else:
							target[size] = cp
							size += 1
					3:
						# skip overlong encodings and surrogates
						if not (cp < 0x0800 or (cp >= 0xD800 and cp <= 0xDFFF)):
							target[size] = cp
							size += 1
					_:
						# skip overlong encodings and values above U+10FFFF
						if not (cp < 0x10000 or cp > 0x10FFFF):
							target[size] = cp
							size += 1
			clear()

		# loop through input
		var four_stop = length - 4
		var i = start_pos
		while i < length:
			# ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
			# This is a compromise between speed gain for ASCII
			# and penalty for non ASCII:
			# For best ASCII performance the char should be stored directly into target,
			# but even a single attempt to write to target and compare afterwards
			# penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
			# which reduces ASCII performance by ~15%.
			# This trial for ASCII reduces non ASCII performance by ~10% which seems acceptable
			# compared to the gains.
			# Note that this optimization only takes place for 4 consecutive ASCII chars,
			# for any shorter it bails out. Worst case - all 4 bytes being read but
			# thrown away due to the last being a non ASCII char (-10% performance).
			while i < four_stop:
				byte1 = input[i]
				byte2 = input[i + 1]
				byte3 = input[i + 2]
				byte4 = input[i + 3]
				if ((byte1 | byte2 | byte3 | byte4) & 0x80) == 0:
					target[size] = byte1
					target[size + 1] = byte2
					target[size + 2] = byte3
					target[size + 3] = byte4
					size += 4
					i += 4
				else:
					break

			# reread byte1
			byte1 = input[i]
			i += 1

			# 1 byte
			if byte1 < 0x80:
				target[size] = byte1
				size += 1

			# 2 bytes
			elif (byte1 & 0xE0) == 0xC0:
				if i >= length:
					interim[0] = byte1
					target.resize(size)
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F)
				if codepoint < 0x80:
					# wrong starter byte
					i -= 1
					continue
				target[size] = codepoint
				size += 1

			# 3 bytes
			elif (byte1 & 0xF0) == 0xE0:
				if i >= length:
					interim[0] = byte1
					target.resize(size)
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					target.resize(size)
					return size
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F)
				if codepoint < 0x0800 or (codepoint >= 0xD800 and codepoint <= 0xDFFF):
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1

			# 4 bytes
			elif (byte1 & 0xF8) == 0xF0:
				if i >= length:
					interim[0] = byte1
					target.resize(size)
					return size
				byte2 = input[i]
				i += 1
				if (byte2 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					target.resize(size)
					return size
				byte3 = input[i]
				i += 1
				if (byte3 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				if i >= length:
					interim[0] = byte1
					interim[1] = byte2
					interim[2] = byte3
					target.resize(size)
					return size
				byte4 = input[i]
				i += 1
				if (byte4 & 0xC0) != 0x80:
					# wrong continuation
					i -= 1
					continue
				codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F)
				if codepoint < 0x010000 or codepoint > 0x10FFFF:
					# illegal codepoint, no i-- here
					continue
				target[size] = codepoint
				size += 1
			else:
				# illegal byte, just skip
				pass

		target.resize(size)
		return size