chrisjsewell / ipypublish
1 3
from collections import OrderedDict
2 3
import copy
3 3
import io
4 3
import json
5 3
import re
6

7 3
from six import string_types
8 3
from nbconvert.utils.pandoc import get_pandoc_version
9 3
from distutils.version import LooseVersion
10 3
import panflute as pf
11

12 3
from panflute import Element, Doc  # noqa: F401
13 3
from types import FunctionType  # noqa: F401
14

15 3
from ipypublish.filters_pandoc.definitions import IPUB_META_ROUTE
16

17

18 3
def apply_filter(
    in_object,
    filter_func=None,
    out_format="panflute",
    in_format="markdown",
    strip_meta=False,
    strip_blank_lines=False,
    replace_api_version=True,
    dry_run=False,
    **kwargs
):
    """convenience function to apply a panflute filter(s)
    to a string, list of string lines, pandoc AST or panflute.Doc

    Parameters
    ----------
    in_object: str or list[str] or tuple[str] or dict or panflute.Doc
        the document to filter; a list/tuple is joined with newlines,
        a dict is serialised to JSON (``in_format`` must then be "json")
        and a panflute.Doc is used as is
    filter_func: callable or list or tuple or set or None
        the filter function or a collection of filter functions,
        each called as ``func(doc, **kwargs)`` and expected to return
        the document to hand to the next filter;
        if None, no filter is applied
    out_format: str
        for use by pandoc or, if 'panflute', return the panflute.Doc
    in_format="markdown": str
        the format of ``in_object`` handed to ``panflute.convert_text``
        (not used when ``in_object`` is already a panflute.Doc)
    strip_meta=False: bool
        replace the document metadata with an empty mapping
        before the final conversion
    strip_blank_lines: bool
        replace every occurrence of two consecutive newlines in the
        output string by a single newline (string output only)
    replace_api_version: bool
        for dict input only, if True,
        find the api_version of the available pandoc and
        reformat the json as appropriate
    dry_run: bool
        If True, return the Doc object, before applying the filter
    kwargs:
        passed on to every filter function

    Returns
    -------
    panflute.Doc or str
        the Doc if ``dry_run`` is True or ``out_format`` is 'panflute',
        otherwise the document converted to ``out_format``

    Raises
    ------
    AssertionError
        if ``in_object`` is a dict and ``in_format`` is not "json"
    ValueError
        if a dict ``in_object`` lacks one of the keys
        'meta', 'blocks' or 'pandoc-api-version'
    TypeError
        if the type of ``in_object`` is not supported

    """
    if isinstance(in_object, pf.Doc):
        doc = in_object
    else:
        if isinstance(in_object, dict):
            if in_format != "json":
                raise AssertionError(
                    "the in_format for a dict should be json, "
                    "not {}".format(in_format)
                )
            for required_key in ("meta", "blocks", "pandoc-api-version"):
                if required_key not in in_object:
                    raise ValueError(
                        "the in_object does not contain a '{}' key".format(
                            required_key
                        )
                    )
            if replace_api_version:
                # run pandoc on a null object, to get the correct api version
                null_raw = pf.run_pandoc("", args=["-t", "json"])
                api_version = pf.load(io.StringIO(null_raw)).api_version

                # see panflute.load, w.r.t to legacy version
                if api_version is None:
                    in_object = [{"unMeta": in_object["meta"]}, in_object["blocks"]]
                else:
                    ans = OrderedDict()
                    ans["pandoc-api-version"] = api_version
                    ans["meta"] = in_object["meta"]
                    ans["blocks"] = in_object["blocks"]
                    in_object = ans
            in_str = json.dumps(in_object)
        elif isinstance(in_object, (list, tuple)):
            in_str = "\n".join(in_object)
        elif isinstance(in_object, string_types):
            in_str = in_object
        else:
            raise TypeError("object not accepted: {}".format(in_object))

        doc = pf.convert_text(in_str, input_format=in_format, standalone=True)

    # NOTE: when a panflute.Doc was passed in, this mutates the caller's object
    doc.format = out_format

    if dry_run:
        return doc

    # normalise filter_func to a collection of callables;
    # the default (None) means that there is nothing to apply
    if filter_func is None:
        filters = []
    elif isinstance(filter_func, (list, tuple, set)):
        filters = filter_func
    else:
        filters = [filter_func]

    out_doc = doc
    for func in filters:
        out_doc = func(out_doc, **kwargs)  # type: Doc

    # post-process Doc
    if strip_meta:
        out_doc.metadata = {}
    if out_format == "panflute":
        return out_doc

    # create out str
    out_str = pf.convert_text(
        out_doc, input_format="panflute", output_format=out_format
    )

    # post-process final str
    if strip_blank_lines:
        out_str = out_str.replace("\n\n", "\n")

    return out_str
136

137

138 3
def compare_version(target, comparison):
    """Compare the version of the available pandoc against a target version.

    The expression evaluated is ``<pandoc version> <comparison> <target>``,
    e.g. ``compare_version("2.0", ">=")`` is True if the pandoc version
    reported by ``get_pandoc_version()`` is at least 2.0.

    Parameters
    ----------
    target: str
        target version of pandoc
    comparison: str
        one of '>', '<', '<=', '>=', '=='

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        if ``comparison`` is not one of the recognised operators

    """
    comparators = {
        ">=": lambda found, wanted: found >= wanted,
        "<=": lambda found, wanted: found <= wanted,
        ">": lambda found, wanted: found > wanted,
        "<": lambda found, wanted: found < wanted,
        "==": lambda found, wanted: found == wanted,
    }
    # validate the operator first, so that a bad argument is reported as such
    # rather than being masked by a failure to query pandoc
    try:
        compare = comparators[comparison]
    except (KeyError, TypeError):
        raise ValueError("comparison not recognised: {}".format(comparison))

    # TODO this only works if you are
    # converting json in the same environment
    version = LooseVersion(get_pandoc_version())
    required = LooseVersion(target)
    return compare(version, required)
170

171

172 3
def strip_quotes(string):
    # type: (str) -> str
    """Remove a pair of enclosing quotes from a string.

    Single quotes are tested first and double quotes second; the second
    test is made on the result of the first, so both may be removed.
    A string that is not enclosed by matching quotes is returned unchanged.
    """
    for quote in ("'", '"'):
        is_enclosed = string.startswith(quote) and string.endswith(quote)
        if is_enclosed:
            string = string[1:-1]
    return string
179

180

181 3
def find_attributes(
    element, allow_space=True, search_left=False, include_element=False
):
    """look for an attribute 'container' next to an element,
    of the form <element><space>{#id .class1 .class2 a=1 b="a string"}
    and extract its content

    Parameters
    ----------
    element:
        the element to find attributes for
    allow_space=True: bool
        whether space is allowed between the element and
        the attribute container
    search_left=False: bool
        search before the element (via ``.prev``),
        rather than after it (via ``.next``)
    include_element=False: bool
        whether the search starts at the element itself,
        rather than at its neighbour

    Returns
    -------
    dict or None:
        None if no attribute container was found, otherwise
        {"id": str, "classes": list[str], "attributes": dict[str],
         "elements": list[Element], "append": Str or None},
        where elements is the elements containing the attributes
        (including space), and append holds any text that trailed the
        closing brace (always None for a left search)

    """
    search = _search_attribute_left if search_left else _search_attribute_right
    return search(element, include_element, allow_space)
212

213

214 3
def _search_attribute_right(element, include_element, allow_space):
    """search for an attribute container ``{...}`` following an element

    The siblings are walked via ``.next``, starting at the element itself if
    ``include_element`` is set and at its right-hand neighbour otherwise.
    Leading ``Space`` elements are skipped (and recorded) if ``allow_space``.

    Returns
    -------
    dict or None:
        None if no complete container was found, otherwise a dict with the
        keys "id", "classes", "attributes", "elements" (the elements that
        make up the container) and "append" (a ``Str`` holding any text that
        followed the closing brace, or None)

    Raises
    ------
    ValueError
        if the collected text does not have the form ``{...}``

    """
    if not (element.next or include_element):
        return None

    cursor = element if include_element else element.next

    collected = []
    opened = False
    closed = False

    # step over any space that is allowed before the container
    while cursor and allow_space and isinstance(cursor, pf.Space):
        collected.append(cursor)
        cursor = cursor.next

    # the first non-space element must open the container
    if cursor and isinstance(cursor, pf.Str):
        if re.search(r"^\{[^}]*\}", cursor.text):
            # the container is opened and closed within a single Str
            # TODO this won't handle } in strings, e.g. {a="} "}
            opened = True
            closed = True
            collected.append(cursor)
        elif re.search(r"^[^\}]*\{", cursor.text):
            opened = True
            collected.append(cursor)

    if opened and not closed:
        # gather everything up to and including the Str that closes it
        cursor = cursor.next
        while cursor:
            collected.append(cursor)
            # TODO this won't handle } in strings, e.g. {a="} "}
            if isinstance(cursor, pf.Str) and re.search(r"^[^\{]*\}", cursor.text):
                closed = True
                break
            cursor = cursor.next

    if not (opened and closed):
        return None

    container_text = pf.stringify(pf.Para(*collected))
    container_text = container_text.replace("\n", " ").strip()

    # split into the label and the rest
    parsed = re.match(r"^\{(#[^\s]+|)([^\}]*)\}", container_text)
    if not parsed:
        raise ValueError(container_text)
    classes, attributes = process_attributes(parsed.group(2))

    # whatever trails the closing brace is handed back to the caller
    trailing = container_text[len(parsed.group(0)) :]

    return {
        "id": parsed.group(1)[1:],
        "classes": classes,
        "attributes": attributes,
        "elements": collected,
        "append": pf.Str(trailing) if trailing else None,
    }
290

291

292 3
def _search_attribute_left(element, include_element, allow_space):
    """search for an attribute container ``{...}`` preceding an element

    The siblings are walked via ``.prev``, starting at the element itself if
    ``include_element`` is set and at its left-hand neighbour otherwise.
    ``Space`` elements met first are skipped (and recorded) if ``allow_space``.

    Returns
    -------
    dict or None:
        None if no complete container was found, otherwise a dict with the
        keys "id", "classes", "attributes", "elements" (the elements that
        make up the container, in document order) and "append" (always None)

    Raises
    ------
    ValueError
        if the collected text does not have the form ``{...}``

    """
    if not (element.prev or include_element):
        return None

    cursor = element if include_element else element.prev

    collected = []
    opened = False
    closed = False

    # step over any space that is allowed after the container
    while cursor and allow_space and isinstance(cursor, pf.Space):
        collected.append(cursor)
        cursor = cursor.prev

    # walking leftwards, the first non-space element must close the container
    # TODO this won't handle } in strings, e.g. {a="} "}
    # TODO this won't handle characters after } e.g. {a=1})
    if cursor and isinstance(cursor, pf.Str) and cursor.text.endswith("}"):
        closed = True
        # the same Str may also open the container
        opened = cursor.text.startswith("{")
        collected.append(cursor)

    if closed and not opened:
        # gather everything back to and including the Str that opens it
        cursor = cursor.prev
        while cursor:
            collected.append(cursor)
            # TODO this won't handle { in strings, e.g. {a="{ "}
            # TODO this won't handle characters before { e.g. ({a=1}
            if isinstance(cursor, pf.Str) and cursor.text.startswith("{"):
                opened = True
                break
            cursor = cursor.prev

    if not (opened and closed):
        return None

    # the elements were gathered right-to-left, restore document order
    collected = collected[::-1]

    container_text = pf.stringify(pf.Para(*collected))
    container_text = container_text.replace("\n", " ").strip()

    # split into the label and the rest
    parsed = re.match("^\\{(#[^\\s]+|)([^\\}]*)\\}$", container_text)
    if not parsed:
        raise ValueError(container_text)
    classes, attributes = process_attributes(parsed.group(2))

    return {
        "id": parsed.group(1)[1:],
        "classes": classes,
        "attributes": attributes,
        "elements": collected,
        "append": None,
    }
361

362

363 3
def process_attributes(attr_string):
    """process a string of classes and attributes,
    e.g. '.class-name .other a=1 b="some text"' will be returned as:
    ["class-name", "other"], {"a": "1", "b": "some text"}

    The string is read in a single left-to-right pass, so that text inside
    a quoted attribute value is never mistaken for a class or an attribute.
    Names start with a letter, '-' or '_' and may then also contain digits.
    Values are always returned as strings, with enclosing quotes removed.

    Parameters
    ----------
    attr_string: str
        the content of an attribute container, without id and braces

    Returns
    -------
    list[str]:
        the classes, in order of appearance
    dict[str, str]:
        the attributes (for a repeated key the last value wins)

    """
    name = r"[-_a-zA-Z][-_a-zA-Z0-9]*"
    # a value is "double quoted", 'single quoted' or a bare word;
    # the quoted forms stop at the first closing quote
    value = r"\"([^\"]*)\"|'([^']*)'|([^\s\"']+)"
    # every token must sit at the start of the string or follow whitespace
    token = re.compile(
        r"(?:^|(?<=\s))(?:\.({name})|({name})\s*=\s*(?:{value}))".format(
            name=name, value=value
        )
    )

    classes = []
    attributes = {}
    for found in token.finditer(attr_string):
        class_name, key, double_quoted, single_quoted, bare = found.groups()
        if class_name is not None:
            classes.append(class_name)
            continue
        # exactly one of the three value groups took part in the match
        for candidate in (double_quoted, single_quoted, bare):
            if candidate is not None:
                attributes[key] = candidate
                break

    return classes, attributes
387

388

389 3
def convert_attributes(attr):
    """attempt to convert values to python types, e.g. float, list, dict

    Works on a deep copy of ``attr``, which is itself left untouched.
    Every value is passed to ``json.loads``; a value that cannot be
    decoded is kept as it is.
    """
    converted = copy.deepcopy(attr)
    for name, raw_value in list(converted.items()):
        try:
            converted[name] = json.loads(raw_value)
        except Exception:
            # best effort: leave anything that is not valid JSON alone
            continue
    return converted
399

400

401 3
def convert_units(string, out_units):
    """convert a value with an optional unit suffix to other units

    Parameters
    ----------
    string:
        the value (converted with ``str``), a non-negative number
        optionally followed by lowercase letters and/or '%', e.g. "50%";
        a value without a suffix has the units "fraction"
    out_units: str
        the units to convert to

    Returns
    -------
    float

    Raises
    ------
    ValueError
        if the value can not be parsed, or if the units differ and
        the conversion is not one of '%' <-> 'fraction'

    """
    parsed = re.match("^\\s*([0-9]+\\.?[0-9]*)([a-z\\%]*)\\s*$", str(string))
    if parsed is None:
        raise ValueError("string could not be resolved as a value: {}".format(string))

    number, in_units = parsed.groups()
    value = float(number)
    in_units = in_units or "fraction"

    # nothing to do if the units already agree
    if in_units == out_units:
        return value

    if (in_units, out_units) == ("%", "fraction"):
        return value / 100.0
    if (in_units, out_units) == ("fraction", "%"):
        return value * 100.0

    raise ValueError(
        "could not find a conversion for "
        "{0} to {1}: {2}".format(in_units, out_units, string)
    )
424

425

426 3
def get_option(locations, keypath, default=None, delimiter=".", error_on_missing=False):
    """ fetch an option variable from a hierarchy of preferred locations

    The locations are tried in order, and the value is taken from the first
    one in which the complete key path can be followed.

    Parameters
    ----------
    locations: list[dict]
        a list of mappings to search in
    keypath: list[str] or str
        the keys to follow, one per nesting level;
        a str is split by the delimiter
    default=None: object
        the value returned if no location holds the key path
    delimiter: str
        the separator used to split a str keypath
    error_on_missing: bool
        raise a ValueError, instead of returning the default,
        if no location holds the key path

    Examples
    --------

    >>> a = {"m": 1}
    >>> b = {"x": {"y": 2}}
    >>> c = {"x": {"y": 3}}
    >>> get_option([a, b, c], keypath=("x", "y"))
    2
    >>> get_option([a, c, b], keypath=("x", "y"))
    3
    >>> get_option([a, c, b], keypath="x.y")
    3
    >>> get_option([a, c, b], keypath="l", default=4)
    4

    """
    if isinstance(keypath, string_types):
        keypath = keypath.split(delimiter)

    for location in locations:
        value = location
        for key in keypath:
            try:
                value = value[key]
            except (KeyError, TypeError):
                # the path can not be followed here, try the next location
                break
        else:
            # every key was resolved
            return value

    if error_on_missing:
        raise ValueError("could not retrieve the option keypath: {}".format(keypath))

    return default
486

487

488 3
def create_ipub_meta(options):
    """nest ``options`` in a new dict, under the keys of IPUB_META_ROUTE

    IPUB_META_ROUTE is split on "."; each part but the last becomes one level
    of nested dicts, and the last part is the key that holds ``options``.
    """
    route = IPUB_META_ROUTE.split(".")
    meta = {}
    level = meta
    for key in route[:-1]:
        child = {}
        level[key] = child
        level = child
    level[route[-1]] = options
    return meta
496

497

498 3
def get_panflute_containers(element):
    """return the tuple of all possible container classes for an element

    Parameters
    ----------
    element: type
        an element class (not an instance)

    Returns
    -------
    tuple:
        the inline container classes for a subclass of panflute.Inline,
        the block container classes for a subclass of panflute.Block

    Raises
    ------
    TypeError
        if the class is neither Inline nor Block

    """
    holders_of_inlines = (
        pf.Cite,
        pf.Emph,
        pf.Header,
        pf.Image,
        pf.LineItem,
        pf.Link,
        pf.Para,
        pf.Plain,
        pf.Quoted,
        pf.SmallCaps,
        pf.Span,
        pf.Strikeout,
        pf.Strong,
        pf.Subscript,
        pf.Superscript,
    )
    holders_of_blocks = (
        pf.BlockQuote,
        pf.Definition,
        pf.Div,
        pf.Doc,
        pf.ListItem,
        pf.Note,
        pf.TableCell,
    )

    if issubclass(element, pf.Inline):
        return holders_of_inlines
    if issubclass(element, pf.Block):
        return holders_of_blocks

    raise TypeError("not Inline or Block: {}".format(element))
535

536

537 3
def get_pf_content_attr(container, target):
    """find the attribute of a container that can hold a target element

    Parameters
    ----------
    container:
        the candidate container (an instance)
    target: type
        the class of the element to be held

    Returns
    -------
    str or False:
        "caption" (Table), "term" (DefinitionItem) or "content" for an
        Inline target in one of the inline containers,
        "content" for a Block target in one of the block containers,
        and False if the container can not hold the target

    Raises
    ------
    TypeError
        if the target is neither Inline nor Block

    """
    # NOTE pf.Cite must stay first, it is sliced off for Cite targets below
    holders_of_inlines = (
        pf.Cite,
        pf.Emph,
        pf.Header,
        pf.Image,
        pf.LineItem,
        pf.Link,
        pf.Para,
        pf.Plain,
        pf.Quoted,
        pf.SmallCaps,
        pf.Span,
        pf.Strikeout,
        pf.Strong,
        pf.Subscript,
        pf.Superscript,
        pf.Table,
        pf.DefinitionItem,
    )
    holders_of_blocks = (
        pf.BlockQuote,
        pf.Definition,
        pf.Div,
        pf.Doc,
        pf.ListItem,
        pf.Note,
        pf.TableCell,
    )

    # we assume a Cite can't contain another Cite
    if issubclass(target, pf.Cite) and not isinstance(
        container, holders_of_inlines[1:]
    ):
        return False

    if issubclass(target, pf.Inline):
        if not isinstance(container, holders_of_inlines):
            return False
        # two containers keep their inlines under a different name
        if isinstance(container, pf.Table):
            return "caption"
        if isinstance(container, pf.DefinitionItem):
            return "term"
        return "content"

    if issubclass(target, pf.Block):
        return "content" if isinstance(container, holders_of_blocks) else False

    raise TypeError("target not Inline or Block: {}".format(target))

Read our documentation on viewing source code.

Loading