#!/usr/bin/python
"""Convert non-ASCII UTF-8 text into decimal character references.

Badly named script.  Every character outside the ASCII range is replaced by
its equivalent decimal entity (for example ``&#233;``) so the result can be
used in HTML and XML; ASCII characters pass through unchanged.

Input is read from the files named on the command line, or from standard
input when none are given, and the converted text is written to standard
output.

The author also has C and perl versions of this.  Requires Python >= 2.0;
works with Python 3.x as well.
"""

# Iain Murray 2005, 2019

import fileinput
import sys


if sys.version_info >= (2, 3):
    # From Python 2.3 the XML character-reference conversion is built in as
    # the 'xmlcharrefreplace' codec error handler.
    def to_entities(raw):
        """Return the UTF-8 byte string *raw* re-encoded as pure ASCII.

        Non-ASCII characters become decimal character references.  Raises
        UnicodeDecodeError when *raw* is not valid UTF-8.
        """
        return raw.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
else:
    # Horrible version for Pythons 2.0--2.2 (not tested recently, might be
    # broken): substitute every non-ASCII character by hand.
    import re

    _NON_ASCII = re.compile('[^\x00-\x7F]')

    def _char_ref(match):
        """Build the decimal character reference for the matched character."""
        return ''.join(('&#', str(ord(match.group(0))), ';'))

    def to_entities(raw):
        """Return the UTF-8 byte string *raw* re-encoded as pure ASCII.

        Non-ASCII characters become decimal character references.
        """
        text = unicode(raw, 'utf-8')
        return _NON_ASCII.sub(_char_ref, text).encode('ascii')


def _open_input(files=None):
    """Return a fileinput stream that yields each input line as raw bytes."""
    if sys.version_info[0] >= 3:
        # In text mode Python 3 would decode with the locale's preferred
        # encoding, which is not necessarily UTF-8.  Reading bytes keeps the
        # decoding explicit and passes line endings through untouched.
        return fileinput.input(files, mode='rb')
    # Python 2 already yields byte strings in the default mode.
    return fileinput.input(files)


def main(files=None, out=None):
    """Convert every input line and write the ASCII result to *out*.

    files -- sequence of file names to read; None means the names in
             sys.argv[1:], or standard input when there are none.
    out   -- writable binary stream; None means standard output.
    """
    if out is None:
        # Python 3 exposes the byte-oriented stream as sys.stdout.buffer;
        # on Python 2 sys.stdout accepts byte strings directly.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
    stream = _open_input(files)
    try:
        for raw in stream:
            out.write(to_entities(raw))
    finally:
        # Release fileinput's module-global state even if decoding failed.
        stream.close()


if __name__ == '__main__':
    main()