"""utf-8 correct truncation"""

def utf8trunc(s, code_length):
	"""returns the utf8 encoding of a string s cropped to code_length bytes while
	maintaining utf-8 correctness.

	This works by removing continuation bytes (10......b) from the end of the
	encoded string until we reach a non-continuation one.  This, then, is
	also removed.

	>>> utf8trunc("äuß", 6)
	b'\xc3\xa4u\xc3\x9f'
	>>> utf8trunc("äuß", 5)
	b'\xc3\xa4u\xc3\x9f'
	>>> utf8trunc("äuß", 4)
	b'\xc3\xa4u'
	>>> utf8trunc("äuß", 3)
	b'\xc3\xa4u'
	>>> utf8trunc("äuß", 2)
	b'\xc3\xa4'
	>>> utf8trunc("äuß", 1)
	b''
	>>> utf8trunc("äuß", 0)
	b''
	>>> utf8trunc("ß⨁", 5)
	b'\xc3\x9f\xe2\xa8\x81'
	>>> utf8trunc("⨁ß", 3)
	b'\xe2\xa8\x81'
	>>> utf8trunc("a⨁", 2)
	b'a'
	>>> utf8trunc('x𝄞', 5)
	b'x\xf0\x9d\x84\x9e'
	>>> utf8trunc('x𝄞', 4)
	b'x'
	>>> utf8trunc('𝄞', 4)
	b'\xf0\x9d\x84\x9e'
	>>> utf8trunc('𝄞', 3)
	b''
	"""
	enc = s.encode("utf-8")[:code_length]
	if not enc:
		return b''

	# find the start character of the last utf-8 sequence
	intro = len(enc)-1
	while enc[intro] & 0xc0 == 0x80 and intro>0:
		intro -= 1

	# 0....... is 1 byte, 110..... is 2 bytes, 1110.... is 3 bytes,
	# and 11110xxx is four bytes; cut to current if there's not enough bytes
	# left.
	if enc[intro] & 0x80 == 0:
		req_len = 1
	elif enc[intro] & 0xe0 == 0xc0:
		req_len = 2
	elif enc[intro] & 0xf0 == 0xe0:
		req_len = 3
	elif enc[intro] & 0xf7 == 0xf0:
		req_len = 4
	else:
		raise NotImplementedError("Invalid UTF-8 sequence?")

	if len(enc)-intro == req_len:
		return enc
	else:
		return enc[:intro]
	
	return enc


if __name__=="__main__":
	#utf8trunc("äuß", 3); ddt
	import doctest
	doctest.testmod()