1
|
|
# -*- coding: utf-8 -*-
|
2
|
|
# flake8: noqa: E501
|
3
|
2
|
"""Conversion functions for various CJK encodings and representations.
|
4
|
|
|
5
|
|
Notes
|
6
|
|
-----
|
7
|
|
Original methods and docs based upon `ltchinese`_, license `MIT`_ Steven
|
8
|
|
Kryskalla.
|
9
|
|
|
10
|
|
.. versionadded:: 0.1
|
11
|
|
Python 2/3 compatibility.
|
12
|
|
|
13
|
|
- PEP8, PEP257.
|
14
|
|
- ``int()`` casting for comparisons
|
15
|
|
- Python 3 support.
|
16
|
|
- Python 3 fix for :meth:`~.ucn_to_python`.
|
17
|
|
- Python 3 ``__future__`` statements.
|
18
|
|
- All methods converting to ``_python`` will return ``Unicode``.
|
19
|
|
- All methods converting Unicode to x will return bytestring.
|
20
|
|
- Add :meth:`~.ucnstring_to_python`
|
21
|
|
- Any other changes relative to `conversion.py @9227813`_.
|
22
|
|
|
23
|
|
The following terms are used to represent the encodings / representation used
|
24
|
|
in the conversion functions (the samples on the right are for the character
|
25
|
|
U+4E00 (yi1; "one")):
|
26
|
|
|
27
|
|
========================== ====================================================
|
28
|
|
GB2312 (Kuten/Quwei form) "5027" [used in the "GB2312" field of Unihan.txt]
|
29
|
|
GB2312 (ISO-2022 form) "523B" [the "internal representation" of GB code]
|
30
|
|
EUC-CN "D2BB" [this is the "external encoding" of GB2312-
|
31
|
|
ISO2022's "internal representation"; also
|
32
|
|
the form that Ocrat uses]
|
33
|
|
UTF-8 "E4 B8 80" [used in the "UTF-8" field in Unihan.txt]
|
34
|
|
-------------------------- ----------------------------------------------------
|
35
|
|
Unihan UCN "U+4E00" [used by Unicode Inc.]
|
36
|
|
-------------------------- ----------------------------------------------------
|
37
|
|
internal Python unicode u"\u4e00" [this is the most useful form!]
|
38
|
|
internal Python 'utf8' "\\xe4\\xb8\\x80"
|
39
|
|
internal Python 'gb2312' "\\xd2\\xbb"
|
40
|
|
internal Python 'euc-cn' "\\xd2\\xbb"
|
41
|
|
internal Python 'gb18030' "\\xd2\\xbb"
|
42
|
|
========================== ====================================================
|
43
|
|
|
44
|
|
See these resources for more information:
|
45
|
|
* Wikipedia "Extended_Unix_Code" article
|
46
|
|
|
47
|
|
* "EUC-CN is the usual way to use the GB2312 standard for simplified Chinese
|
48
|
|
characters ... the ISO-2022 form of GB2312 is not normally used"
|
49
|
|
|
50
|
|
* Wikipedia "HZ_(encoding)" article (the example conversion)
|
51
|
|
|
52
|
|
* Wikipedia "Numeric_character_reference" article
|
53
|
|
|
54
|
|
* Unihan (look for "Encoding forms", "Mappings to Major Standards")
|
55
|
|
|
56
|
|
* e.g. http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=4E00
|
57
|
|
|
58
|
|
.. _ltchinese: https://bitbucket.org/lost_theory/ltchinese
|
59
|
|
.. _MIT: https://bitbucket.org/lost_theory/ltchinese/src/9227813/LICENSE.txt
|
60
|
|
.. _conversion.py @9227813: https://bitbucket.org/lost_theory/ltchinese/raw/9227813/ltchinese/conversion.py
|
61
|
|
"""
|
62
|
2
|
from __future__ import absolute_import, print_function, unicode_literals
|
63
|
|
|
64
|
2
|
import logging
|
65
|
2
|
import re
|
66
|
|
|
67
|
2
|
from ._compat import string_types, text_type, unichr
|
68
|
|
|
69
|
2
|
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
|
70
|
|
|
71
|
|
|
72
|
2
|
def hexd(n):
    """Return the hex digits of ``n`` without the leading '0x' prefix."""
    prefixed = hex(n)
    # Drop the first two characters, i.e. the '0x' that hex() prepends.
    return prefixed[2:]
|
75
|
|
|
76
|
|
|
77
|
2
|
def kuten_to_gb2312(kuten):
    """
    Convert GB kuten / quwei form (94 zones * 94 points) to GB2312-1980 /
    ISO-2022-CN hex (internal representation).

    Parameters
    ----------
    kuten : str or bytes
        Decimal digits: the first two are the zone, the rest the point,
        e.g. ``"5027"``.

    Returns
    -------
    bytes
        Lowercase hex digits of the two resulting values, e.g. ``b"523b"``.

    Raises
    ------
    ValueError
        If either part of ``kuten`` cannot be parsed as a decimal integer.
    """
    zone, point = int(kuten[:2]), int(kuten[2:])

    # Zone and point are each shifted by 0x20 and rendered as hex digits.
    hex_text = "%02x%02x" % (zone + 0x20, point + 0x20)

    # String formatting yields text; encode explicitly so the function really
    # returns a bytestring (the previous isinstance assertion could never
    # hold for a text result).
    return hex_text.encode("ascii")
|
89
|
|
|
90
|
|
|
91
|
2
|
def gb2312_to_euc(gb2312hex):
    """
    Convert GB2312-1980 hex (internal representation) to EUC-CN hex (the
    "external encoding").

    Parameters
    ----------
    gb2312hex : str or bytes
        Hex digits: the first two are the high byte, the rest the low byte,
        e.g. ``"523b"``.

    Returns
    -------
    bytes
        Lowercase hex digits of the two shifted bytes, e.g. ``b"d2bb"``.

    Raises
    ------
    ValueError
        If either part of ``gb2312hex`` is not valid hexadecimal.
    """
    hi, lo = int(gb2312hex[:2], 16), int(gb2312hex[2:], 16)

    # Both bytes are shifted by 0x80 and rendered as hex digits again.
    hex_text = "%02x%02x" % (hi + 0x80, lo + 0x80)

    # String formatting yields text; encode explicitly so the function really
    # returns a bytestring (the previous isinstance assertion could never
    # hold for a text result).
    return hex_text.encode("ascii")
|
102
|
|
|
103
|
|
|
104
|
2
|
def euc_to_python(hexstr):
    """
    Spell a EUC-CN (GB2312) hex pair as backslash-x escapes, returned as text.

    The first four hex digits of ``hexstr`` (bytes) are split into two
    pairs, each pair is prefixed with a literal backslash and ``x``, and the
    resulting bytestring is decoded with the ``gb2312`` codec. For
    ``b'd2bb'`` the result is the eight-character text ``\\xd2\\xbb``.
    """
    high_pair, low_pair = hexstr[0:2], hexstr[2:4]
    # The prefix is a literal backslash followed by 'x', not a byte escape.
    escaped = b''.join((b'\\x', high_pair, b'\\x', low_pair))
    return escaped.decode("gb2312")
|
112
|
|
|
113
|
|
|
114
|
2
|
def euc_to_utf8(euchex):
    """
    Run EUC hex digits (e.g. "d2bb") through a chain of codec conversions.

    The text returned by :func:`euc_to_python` is UTF-8 encoded, its
    backslash escapes are resolved with ``unicode_escape``, the resulting
    code points are mapped to raw bytes via ``latin1`` and those bytes are
    finally decoded with the ``euc-jp`` codec.

    NOTE(review): this function has been described as converting to UTF8
    hex (e.g. "e4 b8 80"), but what it returns is the text produced by the
    final ``euc-jp`` decode -- confirm which behaviour is intended.
    """
    escaped_bytes = euc_to_python(euchex).encode("utf-8")
    raw_bytes = escaped_bytes.decode('unicode_escape').encode('latin1')
    return raw_bytes.decode('euc-jp')
|
125
|
|
|
126
|
|
|
127
|
2
|
def ucn_to_unicode(ucn):
    """
    Turn a Unicode Universal Character Number into a Python unicode character.

    Parameters
    ----------
    ucn : str or int
        Hex text such as ``"U+4E00"`` or ``"4E00"``, or a value that is
        passed straight to ``unichr``.

    Returns
    -------
    unicode :
        The corresponding character, e.g. ``u'\\u4e00'``.
    """
    if not isinstance(ucn, string_types):
        # Non-string input is handed to unichr unchanged.
        char = unichr(ucn)
    else:
        # strip() removes any leading/trailing 'U' and '+' characters.
        hex_digits = ucn.strip("U+")
        code_point = int(hex_digits, 16)
        if len(hex_digits) <= 4:
            char = unichr(code_point)
        else:
            # More than four digits: build an eight-digit \U escape and let
            # the unicode_escape codec resolve it (presumably because unichr
            # cannot produce such characters on every build -- TODO confirm).
            escape = b'\\U' + format(code_point, '08x').encode('latin1')
            char = escape.decode('unicode_escape')

    assert isinstance(char, text_type)

    return char
|
145
|
|
|
146
|
|
|
147
|
2
|
def euc_to_unicode(hexstr):
    """
    Decode a EUC-CN (GB2312) hex pair into Python unicode.

    Parameters
    ----------
    hexstr : bytes
        Hex digits of the two EUC-CN bytes, e.g. ``b'd2bb'``. Only the first
        four digits are used.

    Returns
    -------
    unicode :
        The decoded text, e.g. U+4E00 for ``b'd2bb'``.

    Notes
    -----
    The hex digits are first written out as backslash-x escapes, resolved to
    code points with the ``unicode_escape`` codec, mapped back onto raw bytes
    with ``latin1`` and only then decoded as ``gb2312``.
    """
    high_digits, low_digits = hexstr[0:2], hexstr[2:4]

    # Literal backslash + 'x' in front of each digit pair, so that the
    # unicode_escape codec below can interpret the hex digits.
    escaped = b'\\x' + high_digits + b'\\x' + low_digits
    assert isinstance(escaped, bytes)

    # unicode_escape gives one code point per escaped byte value; latin1
    # turns those code points into the identical raw bytes.
    raw_bytes = escaped.decode('unicode_escape').encode('latin1')
    decoded = raw_bytes.decode('gb2312')

    assert isinstance(decoded, text_type)
    return decoded
|
191
|
|
|
192
|
|
|
193
|
|
""" Convert from internal Python unicode / string objects """
|
194
|
|
|
195
|
|
|
196
|
2
|
def python_to_ucn(uni_char, as_bytes=False):
    """
    Return UCN character from Python Unicode character.

    Converts a one character Python unicode string (e.g. u'\\u4e00') to the
    corresponding Unicode UCN ('U+4E00').

    Parameters
    ----------
    uni_char : unicode
        A single character.
    as_bytes : bool
        When true, return the UCN as a latin1-encoded bytestring instead of
        text.

    Returns
    -------
    unicode or bytes
        ``"U+"`` followed by the uppercase hex code point, padded to at
        least four digits (e.g. ``'U+4E00'``, ``'U+0061'``, ``'U+20000'``).
    """
    try:
        # Take the code point directly. Parsing the unicode_escape output is
        # wrong for characters that are not escaped as \u or \U: 'a' stays
        # 'a' and u'\xe9' becomes '\xe9', which yielded "U+A" and "U+XE9".
        digits = "%04X" % ord(uni_char)
    except TypeError:
        # ord() rejects anything that is not exactly one character long
        # (e.g. a surrogate pair on a narrow Python 2 build); keep the
        # escape-based conversion for those inputs.
        escaped = uni_char.encode('unicode_escape').decode('latin1')
        digits = escaped.replace('\\', '').upper().lstrip('U')
        if len(digits) > 4:
            # get rid of the zeroes that Python uses to pad 8-digit escapes
            digits = digits.lstrip("0")

    ucn = "U+" + digits

    if as_bytes:
        ucn = ucn.encode('latin1')

    return ucn
|
214
|
|
|
215
|
|
|
216
|
2
|
def python_to_euc(uni_char, as_bytes=False):
    """
    Return EUC character from a Python Unicode character.

    Converts a one character Python unicode string (e.g. u'\\u4e00') to the
    corresponding EUC hex ('d2bb').

    Parameters
    ----------
    uni_char : unicode
        Text to encode with the ``gb2312`` codec.
    as_bytes : bool
        When true, return the hex digits as a bytestring instead of text.

    Returns
    -------
    unicode or bytes
        Lowercase hex digits, two per encoded byte.

    Raises
    ------
    UnicodeEncodeError
        If ``uni_char`` cannot be encoded as gb2312.
    """
    encoded = uni_char.encode("gb2312")

    # Format every byte as two hex digits. Scraping repr() of the bytestring
    # only works while each byte is displayed as \xNN; printable ASCII bytes
    # are displayed as themselves (b'a' -> 'a' rather than '61').
    # bytearray iterates as integers on both Python 2 and 3.
    euc = "".join("%02x" % byte for byte in bytearray(encoded))

    if as_bytes:
        euc = euc.encode('utf-8')

    return euc
|
230
|
|
|
231
|
|
|
232
|
2
|
def ucnstring_to_unicode(ucn_string):
    """Return ``ucn_string`` as Unicode text, converted via ucnstring_to_python."""
    # ucnstring_to_python hands back UTF-8 bytes; decode them back to text.
    utf8_bytes = ucnstring_to_python(ucn_string)
    text = utf8_bytes.decode('utf-8')

    assert isinstance(text, text_type)
    return text
|
238
|
|
|
239
|
|
|
240
|
2
|
def ucnstring_to_python(ucn_string):
    """
    Replace each Unicode UCN (e.g. "U+4E00") in a string with the character
    it names and return the result as a UTF-8 encoded bytestring.

    Parameters
    ----------
    ucn_string : unicode
        Text that may contain any number of ``U+XXXX`` sequences.

    Returns
    -------
    bytes
        ``ucn_string`` with every UCN substituted, encoded as UTF-8.
    """
    # Substitute every match where it stands, in a single pass. Replacing the
    # matches one after another with str.replace corrupts a longer UCN that
    # begins with a shorter one (e.g. "U+2000" inside "U+20000").
    # At least one hex digit is required, so a bare "U+" is left untouched
    # instead of being passed on for conversion.
    converted = re.sub(
        r"U\+[0-9a-fA-F]+",
        lambda match: ucn_to_unicode(match.group(0)),
        ucn_string,
    )

    return converted.encode('utf-8')
|
253
|
|
|
254
|
|
|
255
|
2
|
def parse_var(var):
    """
    Split ``var`` at its first "<" into a character and a tag.

    Returns a ``(char, tag)`` tuple: ``char`` is the part before the "<"
    converted with ucn_to_unicode, ``tag`` is everything after it, or None
    when ``var`` contains no "<".
    """
    ucn, separator, remainder = var.partition("<")
    # An empty separator means no "<" was present, hence no tag.
    tag = remainder if separator else None
    return ucn_to_unicode(ucn), tag
|
266
|
|
|
267
|
|
|
268
|
2
|
def parse_vars(_vars):
    """
    Lazily yield a (char, tag) tuple for each space-separated item of ``_vars``.
    """
    items = _vars.split(" ")
    for item in items:
        yield parse_var(item)
|
274
|
|
|
275
|
|
|
276
|
2
|
def parse_untagged(_vars):
    """
    Return an iterator over the chars of ``_vars``, with the tags dropped.
    """
    tagged_pairs = parse_vars(_vars)
    return (character for character, _ in tagged_pairs)
|