1
# -*- coding: utf-8 -*-
2
# flake8: noqa: E501
3 2
"""Conversion functions for various CJK encodings and representations.
4

5
Notes
6
-----
7
Original methods and docs based upon `ltchinese`_, license `MIT`_ Steven
8
Kryskalla.
9

10
.. versionadded:: 0.1
11
    Python 2/3 compatibility.
12

13
    - PEP8, PEP257.
14
    - ``int()`` casting for comparisons
15
    - Python 3 support.
16
    - Python 3 fix for :meth:`~.ucn_to_python`.
17
    - Python 3 ``__future__`` statements.
18
    - All methods converting to ``_python`` will return ``Unicode``.
19
    - All methods converting Unicode to x will return bytestring.
20
    - Add :meth:`~.ucnstring_to_python`
21
    - Any other change upon @ `conversion.py @9227813`_.
22

23
The following terms are used to represent the encodings / representation used
24
in the conversion functions (the samples on the right are for the character
25
U+4E00 (yi1; "one")):
26

27
========================== ====================================================
28
GB2312 (Kuten/Quwei form)  "5027" [used in the "GB2312" field of Unihan.txt]
29
GB2312 (ISO-2022 form)     "523B" [the "internal representation" of GB code]
30
EUC-CN                     "D2BB" [this is the "external encoding" of GB2312-
31
                                    ISO2022's "internal representation"; also
32
                                    the form that Ocrat uses]
33
UTF-8                      "E4 B8 80" [used in the "UTF-8" field in Unihan.txt]
34
-------------------------- ----------------------------------------------------
35
Unihan UCN                 "U+4E00"   [used by Unicode Inc.]
36
-------------------------- ----------------------------------------------------
37
internal Python unicode    u"\u4e00"  [this is the most useful form!]
38
internal Python 'utf8'     "\\xe4\\xb8\\x80"
39
internal Python 'gb2312'   "\\xd2\\xbb"
40
internal Python 'euc-cn'   "\\xd2\\xbb"
41
internal Python 'gb18030'  "\\xd2\\xbb"
42
========================== ====================================================
43

44
See these resources for more information:
45
 * Wikipedia "Extended_Unix_Code" article
46

47
   * "EUC-CN is the usual way to use the GB2312 standard for simplified Chinese
48
     characters ... the ISO-2022 form of GB2312 is not normally used"
49

50
 * Wikipedia "HZ_(encoding)" article (the example conversion)
51

52
 * Wikipedia "Numeric_character_reference" article
53

54
 * Unihan (look for "Encoding forms", "Mappings to Major Standards")
55

56
   * e.g. http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=4E00
57

58
.. _ltchinese: https://bitbucket.org/lost_theory/ltchinese
59
.. _MIT: https://bitbucket.org/lost_theory/ltchinese/src/9227813/LICENSE.txt
60
.. _conversion.py @9227813: https://bitbucket.org/lost_theory/ltchinese/raw/9227813/ltchinese/conversion.py
61
"""
62 2
from __future__ import absolute_import, print_function, unicode_literals
63

64 2
import logging
65 2
import re
66

67 2
from ._compat import string_types, text_type, unichr
68

69 2
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
70

71

72 2
def hexd(n):
    """Return the lowercase hex digits of integer ``n`` without a ``0x`` prefix.

    ``format(n, 'x')`` is used instead of slicing ``hex(n)`` so that negative
    numbers keep a plain leading ``-`` (``hex(-1)[2:]`` would yield ``'x1'``)
    and Python 2 longs do not carry a trailing ``L``.

    Parameters
    ----------
    n : int

    Returns
    -------
    str :
        Hex digits, e.g. ``'ff'`` for ``255``.
    """
    return format(n, 'x')
75

76

77 2
def kuten_to_gb2312(kuten):
    """
    Convert GB kuten / quwei form (94 zones * 94 points) to GB2312-1980 /
    ISO-2022-CN hex (internal representation).

    Parameters
    ----------
    kuten : str or bytes
        Four decimal digits: two for the zone, two for the point,
        e.g. ``"5027"``.

    Returns
    -------
    bytes :
        Four lowercase hex digits, e.g. ``b"523b"``.

    Raises
    ------
    ValueError
        If either half of ``kuten`` is not a decimal number.
    """
    zone, point = int(kuten[:2]), int(kuten[2:])

    # Each coordinate is offset by 0x20 to land in the ISO-2022 range.
    gb2312 = "%02x%02x" % (zone + 0x20, point + 0x20)

    # The formatted value is text; encode explicitly so the result really is
    # a bytestring rather than asserting a type that was never produced.
    return gb2312.encode('ascii')
89

90

91 2
def gb2312_to_euc(gb2312hex):
    """
    Convert GB2312-1980 hex (internal representation) to EUC-CN hex (the
    "external encoding").

    Parameters
    ----------
    gb2312hex : str or bytes
        Four hex digits, e.g. ``"523b"``.

    Returns
    -------
    bytes :
        Four lowercase hex digits, e.g. ``b"d2bb"``.

    Raises
    ------
    ValueError
        If either half of ``gb2312hex`` is not valid hexadecimal.
    """
    hi, lo = int(gb2312hex[:2], 16), int(gb2312hex[2:], 16)

    # EUC-CN sets the high bit of both bytes of the ISO-2022 form.
    euc = "%02x%02x" % (hi + 0x80, lo + 0x80)

    # Encode explicitly: the formatted value is text, never bytes.
    return euc.encode('ascii')
102

103

104 2
def euc_to_python(hexstr):
    """
    Return the backslash-escaped byte notation of a EUC-CN (GB2312) hex code.

    NOTE(review): despite the name, this does not produce the decoded
    character. The escape sequences are built as literal ASCII text, so
    ``b"d2bb"`` yields the eight-character text ``\\xd2\\xbb``. The return
    value is kept as-is because other code in this module post-processes it;
    use :func:`euc_to_unicode` to obtain the actual character.

    Parameters
    ----------
    hexstr : bytes or str
        EUC-CN hex digits; only the first four are used.

    Returns
    -------
    unicode :
        Escaped notation, e.g. ``\\xd2\\xbb`` (literal backslashes).
    """
    # Accept text as well as bytes: on Python 3 concatenating str onto the
    # bytes prefix below would raise TypeError.
    if not isinstance(hexstr, bytes):
        hexstr = hexstr.encode('ascii')

    escaped = b'\\x' + hexstr[0:2] + b'\\x' + hexstr[2:4]
    return escaped.decode("gb2312")
112

113

114 2
def euc_to_utf8(euchex):
    """
    Convert EUC hex (e.g. "d2bb") to UTF8 hex (e.g. "e4 b8 80").

    The EUC-CN bytes are decoded with the ``gb2312`` codec. (Decoding them as
    ``euc-jp`` maps the same byte pair to an unrelated character.)

    Parameters
    ----------
    euchex : bytes or str
        EUC-CN hex digits; only the first four are used.

    Returns
    -------
    unicode :
        Lowercase hex of the character's UTF-8 bytes, one space between each
        byte, e.g. ``"e4 b8 80"``.

    Raises
    ------
    UnicodeDecodeError
        If the bytes are not a valid GB2312 sequence.
    """
    import binascii

    if not isinstance(euchex, bytes):
        euchex = euchex.encode('ascii')

    char = binascii.unhexlify(euchex[0:4]).decode('gb2312')
    utf8_hex = binascii.hexlify(char.encode('utf-8')).decode('ascii')

    # Split the flat hex string into two-digit byte groups.
    return ' '.join(utf8_hex[i:i + 2] for i in range(0, len(utf8_hex), 2))
125

126

127 2
def ucn_to_unicode(ucn):
    """
    Convert a Unicode Universal Character Number (e.g. "U+4E00" or "4E00") to
    Python unicode (u'\\u4e00').

    Parameters
    ----------
    ucn : str or int
        Either a hex string, optionally prefixed with ``U+`` / ``u+``, or the
        code point as an integer.

    Returns
    -------
    unicode :
        The single character for that code point.

    Raises
    ------
    ValueError
        If a string ``ucn`` has no valid hex digits, or the code point is
        outside the Unicode range.
    """
    if isinstance(ucn, string_types):
        # strip() removes any of the listed characters from both ends; none
        # of them is a hex digit, so the digits themselves are untouched.
        codepoint = int(ucn.strip("Uu+"), 16)
    else:
        codepoint = ucn

    if codepoint > 0xFFFF:
        # Build the character through a \\U escape rather than unichr():
        # narrow Python 2 builds reject unichr() above U+FFFF. This now also
        # covers integer input, which previously always went to unichr().
        escaped = "\\U%08x" % codepoint
        return escaped.encode('latin1').decode('unicode_escape')

    return unichr(codepoint)
145

146

147 2
def euc_to_unicode(hexstr):
    """
    Return EUC-CN (GB2312) hex to a Python unicode.

    Parameters
    ----------
    hexstr : bytes or str
        EUC-CN hex digits, e.g. ``b'd2bb'``; only the first four are used.

    Returns
    -------
    unicode :
        Python unicode  e.g. ``u'\\u4e00'`` / '一'.

    Raises
    ------
    UnicodeDecodeError
        If the bytes are not a valid GB2312 sequence.

    Examples
    --------

    >>> euc_to_unicode(b'd2bb') == u'\\u4e00'
    True
    """
    import binascii

    # Accept text as well as bytes so callers need not pre-encode on
    # Python 3.
    if not isinstance(hexstr, bytes):
        hexstr = hexstr.encode('ascii')

    # Turn the hex digits straight into raw bytes, then decode them.
    raw = binascii.unhexlify(hexstr[0:4])
    return raw.decode('gb2312')
191

192

193
""" Convert from internal Python unicode / string objects """
194

195

196 2
def python_to_ucn(uni_char, as_bytes=False):
    """
    Return UCN character from Python Unicode character.

    Converts a one character Python unicode string (e.g. u'\\u4e00') to the
    corresponding Unicode UCN ('U+4E00').

    The code point is read with ``ord()`` instead of rewriting the
    ``unicode_escape`` output, which is only a ``\\u`` / ``\\U`` sequence for
    some characters (ASCII and Latin-1 characters escape differently).

    Parameters
    ----------
    uni_char : unicode
        A single character. A UTF-16 surrogate pair (how narrow Python 2
        builds store characters above U+FFFF) is also accepted.
    as_bytes : bool
        Return a bytestring instead of text.

    Returns
    -------
    unicode or bytes :
        ``U+`` followed by at least four uppercase hex digits.

    Raises
    ------
    TypeError
        If ``uni_char`` is not exactly one character.
    """
    is_surrogate_pair = (
        len(uni_char) == 2 and
        0xD800 <= ord(uni_char[0]) <= 0xDBFF and
        0xDC00 <= ord(uni_char[1]) <= 0xDFFF
    )
    if is_surrogate_pair:
        high, low = ord(uni_char[0]), ord(uni_char[1])
        codepoint = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
    else:
        codepoint = ord(uni_char)

    # %04X pads to four digits and naturally widens for larger code points.
    ucn = "U+%04X" % codepoint

    if as_bytes:
        ucn = ucn.encode('latin1')

    return ucn
214

215

216 2
def python_to_euc(uni_char, as_bytes=False):
    """
    Return EUC character from a Python Unicode character.

    Converts a one character Python unicode string (e.g. u'\\u4e00') to the
    corresponding EUC hex ('d2bb').

    The hex digits come from ``binascii.hexlify`` rather than from the
    ``repr()`` of the encoded bytes, whose layout differs between Python
    versions and which prints printable bytes as themselves.

    Parameters
    ----------
    uni_char : unicode
        Text to encode with the ``gb2312`` codec.
    as_bytes : bool
        Return a bytestring instead of text.

    Returns
    -------
    unicode or bytes :
        Lowercase hex digits of the GB2312-encoded bytes.

    Raises
    ------
    UnicodeEncodeError
        If ``uni_char`` cannot be represented in GB2312.
    """
    import binascii

    euc = binascii.hexlify(uni_char.encode("gb2312"))

    if as_bytes:
        return euc

    return euc.decode('ascii')
230

231

232 2
def ucnstring_to_unicode(ucn_string):
    """Return ``ucn_string`` as text, with its UCN codes converted.

    Delegates the conversion to :func:`ucnstring_to_python` and decodes the
    UTF-8 bytestring it returns.
    """
    utf8_bytes = ucnstring_to_python(ucn_string)
    text = utf8_bytes.decode('utf-8')

    assert isinstance(text, text_type)
    return text
238

239

240 2
def ucnstring_to_python(ucn_string):
    """
    Replace every Unicode UCN (e.g. "U+4E00") in a string with its character
    and return the result as a UTF-8 bytestring.

    Each match is substituted at its own position in a single pass, so a
    short UCN cannot clobber the start of a longer one that shares its
    prefix. A bare ``U+`` with no hex digits is left untouched.

    Parameters
    ----------
    ucn_string : unicode
        Text that may contain ``U+`` followed by hex digits.

    Returns
    -------
    bytes :
        The converted text, UTF-8 encoded.
    """
    converted = re.sub(
        r"U\+[0-9a-fA-F]+",
        lambda match: ucn_to_unicode(match.group(0)),
        ucn_string,
    )

    return converted.encode('utf-8')
253

254

255 2
def parse_var(var):
    """
    Split ``var`` at its first "<" into a UCN and a tag.

    Returns a ``(char, tag)`` tuple: ``char`` is the part before the "<"
    converted with :func:`ucn_to_unicode`; ``tag`` is everything after it, or
    None when ``var`` contains no "<".
    """
    code, separator, remainder = var.partition("<")
    tag = remainder if separator else None
    return ucn_to_unicode(code), tag
266

267

268 2
def parse_vars(_vars):
    """
    Yield a (char, tag) tuple for each single-space-separated entry of
    ``_vars``, as produced by :func:`parse_var`.
    """
    entries = _vars.split(" ")
    for entry in entries:
        parsed = parse_var(entry)
        yield parsed
274

275

276 2
def parse_untagged(_vars):
    """
    Return an iterator over only the chars of ``_vars``, dropping the tags
    that :func:`parse_vars` yields alongside them.
    """
    pairs = parse_vars(_vars)
    return (character for character, _unused_tag in pairs)

Read our documentation on viewing source code .

Loading