chrisjsewell / ipypublish
1 3
from collections import deque
2 3
import io
3 3
import logging
4

5 3
# module-level logger, named after this module
logger = logging.getLogger(__name__)
6

7

8 3
def import_texsoup():
    """import TexSoup on demand and return the names used by this module

    Returns
    -------
    dict
        the imported objects, keyed by
        "TexSoup", "RArg", "OArg" and "TokenWithPosition"

    Raises
    ------
    ImportError
        if any of the imports fail, or if importing raises a SyntaxError

    """
    not_installed_msg = (
        "to parse tex files, TexSoup must be installed: \n"
        "pip install texsoup\n"
        "conda install -c conda-forge texsoup"
    )
    broken_msg = (
        "TexSoup package is broken on python 2.7, "
        "so can not be imported for tex parsing"
    )

    try:
        from TexSoup import TexSoup as soup_factory
        from TexSoup.utils import TokenWithPosition as token_cls
        from TexSoup.data import RArg as required_cls, OArg as optional_cls
    except ImportError:
        raise ImportError(not_installed_msg)
    except SyntaxError:
        # a SyntaxError on import is re-reported as an ImportError
        raise ImportError(broken_msg)

    return dict(
        TexSoup=soup_factory,
        RArg=required_cls,
        OArg=optional_cls,
        TokenWithPosition=token_cls,
    )
30

31

32 3
def _create_msg_error(msg, node=None, row=None):
33
    """create error message, optionally including TexNode and row"""
34 3
    text = msg.strip()
35 3
    if row is not None:
36 0
        text = "(row {}) ".format(row) + text
37 3
    if hasattr(node, "name"):
38 3
        text = text + ": {}".format(node.name)
39 3
    return text
40

41

42 3
def extract_required_val(rarg):
    """return the ``value`` of a TexSoup RArg

    Raises
    ------
    ValueError
        if ``rarg`` is not an RArg instance

    """
    required_cls = import_texsoup()["RArg"]
    if isinstance(rarg, required_cls):
        return rarg.value
    raise ValueError("expected {} to be a required argument".format(type(rarg)))
48

49

50 3
def _extract_parameters(texsoup_exprs):
    """collect ``name -> value`` parameters from a TexSoup expression list

    A TokenWithPosition sets the pending parameter name (its text with
    "," and "=" removed, then stripped); the next RArg supplies its value.

    Returns
    -------
    dict: params
        {name: value}
    list: errors
        error messages; reading stops at the first expression that is
        neither a token nor an RArg, or at an RArg with no pending name

    """
    texsoup_names = import_texsoup()
    required_cls = texsoup_names["RArg"]
    token_cls = texsoup_names["TokenWithPosition"]

    parsed = {}
    problems = []
    pending_name = None

    for item in tuple(texsoup_exprs):
        if isinstance(item, token_cls):
            # TODO is this the best way to extract parameter name?
            pending_name = item.text.replace(",", "").replace("=", "").strip()
            continue

        if not isinstance(item, required_cls):
            problems.append(
                (
                    "expected expression '{}' "
                    "to be a parameter name or required argument"
                ).format(item)
            )
            break

        if pending_name is None:
            problems.append(
                "expected expression '{}' to precede a parameter name".format(item)
            )
            break

        if pending_name in parsed:
            # a repeated name is reported, but reading carries on
            problems.append("parameter '{}' already defined".format(pending_name))
        else:
            parsed[pending_name] = item.value
        pending_name = None

    # a trailing name without a value is allowed,
    # since the last expression may be a new line
    return parsed, problems
88

89

90 3
def extract_parameters(argument):
    """extract the parameters held in the ``exprs`` of a TexSoup OArg or RArg

    Returns
    -------
    dict: params
        {name: value}
    list: errors
        error messages found while reading the expressions

    Raises
    ------
    ValueError
        if ``argument`` is neither an OArg nor an RArg

    """
    texsoup_names = import_texsoup()
    accepted_types = (texsoup_names["OArg"], texsoup_names["RArg"])
    if isinstance(argument, accepted_types):
        found, problems = _extract_parameters(argument.exprs)
        return found, problems
    raise ValueError(
        "expected {} to be of type OArg or RArg".format(type(argument))
    )
102

103

104 3
def create_newgloss_dict(gterm, row=None):
    """read the key and fields of a newglossaryentry TexSoup node

    Parameters
    ----------
    gterm:
        the node to read; its ``args`` must hold exactly two arguments,
        the key (a required argument) followed by the parameter block
    row=None: int
        row number to include in error messages

    Returns
    -------
    key
        value of the first argument
    dict: fields
        {parameter name: parameter value} from the second argument

    Raises
    ------
    IOError
        if there are not exactly two arguments,
        or the parameter block contains an error
    ValueError
        if an argument is not of the expected TexSoup type

    """
    arguments = list(gterm.args)

    if len(arguments) != 2:
        msg = _create_msg_error(
            "could not parse newglossaryterm (arguments != 2)", gterm, row
        )
        raise IOError(msg)

    key_arg, params_arg = arguments
    key = extract_required_val(key_arg)
    params, errors = extract_parameters(params_arg)

    if errors:
        # only the first problem is reported
        msg = _create_msg_error(
            "error reading 'parameter' block: {}".format(errors[0]), gterm, row
        )
        raise IOError(msg)

    # params is already a dict, so its names are unique and no
    # duplicate check is needed when copying it
    return key, dict(params)
136

137

138 3
def create_newacronym_dict(acronym, row=None):
    """read the key, abbreviation, name and options of a newacronym TexSoup node

    Parameters
    ----------
    acronym:
        the node to read; its ``args`` must hold three required arguments
        (key, abbreviation, name), optionally preceded by one optional
        argument holding extra parameters
    row=None: int
        row number to include in error messages

    Returns
    -------
    key
        value of the third-from-last argument
    abbreviation
        value of the second-from-last argument
    name
        value of the last argument
    dict: fields
        {parameter name: parameter value} from the optional argument,
        empty if there is none

    Raises
    ------
    IOError
        if there are fewer than three or more than four arguments,
        if the first of four arguments is not optional,
        or the optional block contains an error
    ValueError
        if one of the last three arguments is not a required argument

    """
    OArg = import_texsoup()["OArg"]

    arguments = list(acronym.args)

    if len(arguments) < 3:
        msg = _create_msg_error(
            "could not parse newacronym (too few arguments)", acronym, row
        )
        raise IOError(msg)
    if len(arguments) > 4:
        msg = _create_msg_error(
            "could not parse newacronym (too many arguments)", acronym, row
        )
        raise IOError(msg)

    key_arg, abbrev_arg, name_arg = arguments[-3:]
    key = extract_required_val(key_arg)
    abbreviation = extract_required_val(abbrev_arg)
    name = extract_required_val(name_arg)

    if len(arguments) == 3:
        return key, abbreviation, name, {}

    options = arguments[0]
    if not isinstance(options, OArg):
        msg = _create_msg_error(
            "expected first argument of newacronym to be 'optional'", acronym, row
        )
        raise IOError(msg)

    opt_params, errors = extract_parameters(options)
    if errors:
        # only the first problem is reported
        msg = _create_msg_error(
            "error reading newacronym 'optional' block: {}".format(errors[0]),
            acronym,
            row,
        )
        raise IOError(msg)

    # opt_params is already a dict, so its names are unique and no
    # duplicate check is needed when copying it
    return key, abbreviation, name, dict(opt_params)
188

189

190 3
def parse_tex(
    text_str=None,
    path=None,
    encoding="utf8",
    abbrev_field="abbreviation",
    fname_field="longname",
    skip_ioerrors=False,
):
    """parse a tex file containing newglossaryentry and/or newacronym to dict

    Parameters
    ----------
    text_str=None: str
        string representing the tex file
    path=None: str
        path to the tex file
    encoding='utf8': str
        tex file encoding
    abbrev_field="abbreviation": str
        field key for acronym abbreviation
    fname_field="longname": str
        field key for acronym full name
    skip_ioerrors=False: bool
        skip (and log a warning for) entries that raise an IOError
        while being read, instead of re-raising

    Returns
    -------
    dict: glossaryterms
        {key: fields}
    dict: acronyms
        {key: fields}

    Raises
    ------
    ValueError
        if not exactly one of text_str or path is supplied
    ImportError
        if TexSoup cannot be imported
    IOError
        if an entry cannot be read and skip_ioerrors is False
    KeyError
        if a key is used by more than one entry
        (glossary terms and acronyms share the same key space)

    """
    # check the arguments before importing TexSoup, so that a bad call
    # is reported as such whether or not TexSoup is installed
    if (text_str is None) == (path is None):
        raise ValueError("only one of text_str or path must be supplied")

    TexSoup = import_texsoup()["TexSoup"]

    if path is not None:
        with io.open(path, encoding=encoding) as fobj:
            text_str = fobj.read()

    latex_tree = TexSoup(text_str)

    seen_keys = set()
    gterms = {}
    acronyms = {}

    for gterm in latex_tree.find_all("newglossaryentry"):
        try:
            key, fields = create_newgloss_dict(gterm)
        except IOError as err:
            if skip_ioerrors:
                logger.warning("skipping newglossaryentry: %s", err)
                continue
            raise
        if key in seen_keys:
            raise KeyError("duplicate key: {}".format(key))
        seen_keys.add(key)
        gterms[key] = fields

    for acronym in latex_tree.find_all("newacronym"):
        try:
            key, abbreviation, name, fields = create_newacronym_dict(acronym)
        except IOError as err:
            if skip_ioerrors:
                logger.warning("skipping newacronym: %s", err)
                continue
            raise
        if key in seen_keys:
            raise KeyError("duplicate key: {}".format(key))
        seen_keys.add(key)
        # the abbreviation and full name overwrite any
        # option given under the same field key
        fields[abbrev_field] = abbreviation
        fields[fname_field] = name
        acronyms[key] = fields

    return gterms, acronyms

Read our documentation on viewing source code.

Loading