chrisjsewell / ipypublish
1
""" a panflute filter to find raw elements
2
and convert them to format agnostic Span elements
3
"""
4 3
import re
5 3
from typing import Union  # noqa: F401
6 3
from panflute import Element, Doc, Cite, RawInline, Link  # noqa: F401
7 3
import panflute as pf
8

9 3
from ipypublish.filters_pandoc.definitions import (
10
    ATTRIBUTE_CITE_CLASS,
11
    PREFIX_MAP,
12
    PREFIX_MAP_LATEX_R,
13
    PREFIX_MAP_RST_R,
14
    RST_KNOWN_ROLES,
15
    RAWSPAN_CLASS,
16
    RAWDIV_CLASS,
17
    CONVERTED_CITE_CLASS,
18
    CONVERTED_OTHER_CLASS,
19
    CONVERTED_DIRECTIVE_CLASS,
20
)
21 3
from ipypublish.filters_pandoc.utils import get_panflute_containers, get_pf_content_attr
22

23

24 3
def create_cite_span(identifiers, rawformat, is_block, prefix="", alt=None):
    """Wrap a ``Cite`` holding one ``Citation`` per identifier in a ``Span``.

    The span's classes and attributes start from the ``PREFIX_MAP`` entry for
    ``prefix``; ``RAWSPAN_CLASS``, ``CONVERTED_CITE_CLASS`` and
    ``ATTRIBUTE_CITE_CLASS`` are appended to the classes, ``rawformat`` is
    stored under the ``raw-format`` attribute and, when ``alt`` is not None,
    ``str(alt)`` is stored under ``alt``.

    Returns the span itself, or the span wrapped in a ``Plain`` when
    ``is_block`` is truthy.
    """
    cite = Cite(citations=[pf.Citation(key) for key in identifiers])

    # copy the mapping entry, so the shared definitions are never mutated
    prefix_info = dict(dict(PREFIX_MAP)[prefix])

    span_classes = list(prefix_info["classes"])
    span_classes.extend([RAWSPAN_CLASS, CONVERTED_CITE_CLASS, ATTRIBUTE_CITE_CLASS])

    span_attributes = dict(prefix_info["attributes"])
    span_attributes["raw-format"] = rawformat
    if alt is not None:
        span_attributes["alt"] = str(alt)

    span = pf.Span(cite, classes=span_classes, attributes=span_attributes)
    return pf.Plain(span) if is_block else span
40

41

42 3
def process_internal_links(link, doc):
    # type: (Link, Doc) -> Element
    """Turn an internal link, e.g. ``[text](#label)``, into a cite span.

    Returns None unless ``link`` is a ``pf.Link`` whose url matches
    ``#<label>``. Otherwise the label becomes the single citation identifier,
    the prefix is the ``cref`` entry of ``PREFIX_MAP_LATEX_R`` and the
    stripped, stringified link content is passed as ``alt``.
    """
    if not isinstance(link, pf.Link):
        return None

    found = re.match(r"#(.+)$", link.url)
    if not found:
        return None

    label = found.group(1)
    cref_prefix = dict(PREFIX_MAP_LATEX_R).get("cref")
    link_text = pf.stringify(pf.Plain(*list(link.content))).strip()

    return create_cite_span(
        [label], "markdown", False, prefix=cref_prefix, alt=link_text
    )
58

59

60 3
def process_html_cites(container, doc):
    # type: (pf.Block, Doc) -> Element
    """Replace raw html ``<cite data-cite="cite_key">text</cite>`` by cite spans.

    The attribute of ``container`` holding its children is looked up with
    ``get_pf_content_attr`` (first for ``RawInline``, then for ``RawBlock``).
    Returns None if no such attribute is found or if it is empty.

    For every raw html child whose text starts with an opening ``<cite``
    tag, the following siblings are scanned for a ``RawInline`` html element
    containing only ``</cite>``. If one is found, the opening tag is replaced
    by the result of ``create_cite_span`` and every element up to and
    including the closing tag is dropped; otherwise the opening element is
    kept unchanged.

    Returns ``container`` with its content attribute replaced.
    """
    # the same formats must be accepted for the opening and the closing tag,
    # otherwise an "html4" cite could never be closed
    html_formats = ("html", "html4", "html5")

    content_attr = get_pf_content_attr(container, pf.RawInline)
    if not content_attr:
        content_attr = get_pf_content_attr(container, pf.RawBlock)
    if not content_attr:
        return None

    initial_content = getattr(container, content_attr)
    if not initial_content:
        return None

    new_content = []
    skip = 0  # number of upcoming elements already consumed by a cite

    for element in initial_content:

        if skip > 0:
            skip -= 1
            continue

        if not (
            isinstance(element, (pf.RawInline, pf.RawBlock))
            and element.format in html_formats
        ):
            new_content.append(element)
            continue

        match = re.match(r"<cite\s*data-cite\s*=\"?([^>\"]*)\"?>", element.text)
        if not match:
            new_content.append(element)
            continue

        # look for the closing tag
        span_content = []
        closing = element.next

        while closing:
            if isinstance(closing, pf.RawInline) and closing.format in html_formats:
                if re.match(r"^\s*</cite>\s*$", closing.text):
                    break
            span_content.append(closing)
            closing = closing.next

        if not closing:
            # unterminated cite: leave the raw opening tag in place
            new_content.append(element)
            continue

        # TODO include original content
        new_content.append(
            create_cite_span([match.group(1)], "html", isinstance(element, pf.RawBlock))
        )
        # skip the enclosed elements plus the closing tag itself
        skip = len(span_content) + 1

    setattr(container, content_attr, new_content)
    return container
125

126

127 3
def process_latex_raw(element, doc):
    # type: (Union[pf.RawInline, pf.RawBlock], pf.Doc) -> pf.Element
    """Convert a raw latex element via ``assess_latex``.

    Only ``RawInline``/``RawBlock`` elements with format ``tex`` or ``latex``
    are considered; anything else yields None.

    The element text is handed to ``assess_latex``, which recognises
    ``\\tag{content}`` and ``\\tag[options]{content}`` and returns a Span
    (wrapped in a Plain for a ``RawBlock``), or None if the text is not of
    that form.
    """
    is_raw = isinstance(element, (pf.RawInline, pf.RawBlock))
    if not is_raw or element.format not in ("tex", "latex"):
        return None

    return assess_latex(element.text, isinstance(element, pf.RawBlock))
148

149

150 3
def process_latex_str(block, doc):
    # type: (pf.Block, Doc) -> Union[pf.Block,None]
    """Convert latex commands found inside ``Str`` children of ``block``.

    Same purpose as ``process_latex_raw``, for the cases where pandoc did not
    produce a raw element. The text of every ``Str`` child is split around
    ``\\tag{content}`` / ``\\tag[options]{content}`` fragments; each non-empty
    piece is passed to ``assess_latex`` and replaced by its result, or kept
    as a new ``Str`` when ``assess_latex`` returns None. Other children are
    kept unchanged.

    Returns None if ``get_pf_content_attr`` finds no content attribute for
    ``block`` or if that content is empty; otherwise returns ``block`` with
    its content replaced.
    """
    # TODO why is pandoc sometimes converting latex tags to Str?
    # >> echo "\cite{a}" | pandoc -f markdown -t json
    # {"blocks":[{"t":"Para","c":[{"t":"RawInline","c":["tex","\\cite{a}"]}]}],"pandoc-api-version":[1,17,5,4],"meta":{}}

    content_attr = get_pf_content_attr(block, pf.Str)
    if not content_attr:
        return None

    initial_content = getattr(block, content_attr)
    if not initial_content:
        return None

    # the capturing group makes re.split keep the matched commands
    latex_command = r"(\\[^\{\[]+\{[^\}]+\}|\\[^\{\[]+\[[^\]]*\]\{[^\}]+\})"

    new_content = []

    for element in initial_content:
        if not isinstance(element, pf.Str):
            new_content.append(element)
            continue

        for string in re.split(latex_command, element.text):
            if not string:
                continue
            new_element = assess_latex(string, False)
            if new_element is None:
                new_content.append(pf.Str(string))
            else:
                # reuse the element already built, rather than assessing twice
                new_content.append(new_element)

    setattr(block, content_attr, new_content)
    return block
187

188

189 3
def assess_latex(text, is_block):
190
    """ test if text is a latex command
191
    ``\\tag{content}`` or ``\\tag[options]{content}``
192

193
    if so return a panflute.Span, with attributes:
194

195
    - format: "latex"
196
    - tag: <tag>
197
    - options: <options>
198
    - content: <content>
199
    - original: <full text>
200

201
    """
202
    # TODO these regexes do not match labels containing nested {} braces
203
    # use recursive regexes (https://stackoverflow.com/a/26386070/5033292)
204
    # with https://pypi.org/project/regex/
205

206
    # find tags with no option, i.e \tag{label}
207 3
    match_latex_noopts = re.match(r"^\s*\\([^\{\[]+)\{([^\}]+)\}\s*$", text)
208 3
    if match_latex_noopts:
209 3
        tag = match_latex_noopts.group(1)
210 3
        content = match_latex_noopts.group(2)
211 3
        if tag in dict(PREFIX_MAP_LATEX_R):
212 3
            new_element = create_cite_span(
213
                content.split(","),
214
                "latex",
215
                is_block,
216
                prefix=dict(PREFIX_MAP_LATEX_R).get(tag, ""),
217
            )
218 3
            return new_element
219

220 3
        span = pf.Span(
221
            classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
222
            attributes={
223
                "format": "latex",
224
                "tag": tag,
225
                "content": content,
226
                "original": text,
227
            },
228
        )
229 3
        if is_block:
230 3
            return pf.Plain(span)
231
        else:
232 3
            return span
233

234
    # find tags with option, i.e \tag[options]{label}
235 3
    match_latex_wopts = re.match(r"^\s*\\([^\{\[]+)\[([^\]]*)\]\{([^\}]+)\}\s*$", text)
236 3
    if match_latex_wopts:
237 3
        tag = match_latex_wopts.group(1)
238 3
        options = match_latex_wopts.group(2)
239 3
        content = match_latex_wopts.group(3)
240

241 3
        span = pf.Span(
242
            classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
243
            attributes={
244
                "format": "latex",
245
                "tag": tag,
246
                "content": content,
247
                "options": options,
248
                "original": text,
249
            },
250
        )
251 3
        if is_block:
252 3
            return pf.Plain(span)
253
        else:
254 3
            return span
255

256 3
    return None
257

258

259 3
def process_rst_roles(block, doc):
    # type: (pf.Block, Doc) -> Union[pf.Block,None]
    """Convert rst roles of the form ``:role:`label``` inside ``block``.

    "a :ref:`label` b" arrives from pandoc as
    (Str(a) Space Str(:ref:) Code(label) Space Str(b)), so a role is
    detected as a ``Str`` of the form ``:role:`` directly followed by a
    ``Code`` element. The Str/Code pair is replaced by:

    - the result of ``create_cite_span`` if role is a key of
      ``PREFIX_MAP_RST_R`` (the comma separated Code text gives the
      identifiers)
    - a Span with classes RAWSPAN_CLASS and CONVERTED_OTHER_CLASS and
      attributes ``format`` ("rst"), ``role``, ``content`` and ``original``,
      if role is in ``RST_KNOWN_ROLES``

    Pairs with any other role are left as they are.

    Returns None if ``get_pf_content_attr`` finds no content attribute for
    ``block`` or if that content is empty; otherwise returns ``block`` with
    its content replaced.
    """
    content_attr = get_pf_content_attr(block, pf.Str)
    if not content_attr:
        return None

    initial_content = getattr(block, content_attr)
    if not initial_content:
        return None

    rebuilt = []
    drop_code = False  # True when the previous Str consumed this Code element

    for element in initial_content:

        if drop_code:
            drop_code = False
            continue

        is_role_marker = (
            isinstance(element, pf.Str)
            and isinstance(element.next, pf.Code)
            and len(element.text) > 2
            and element.text.startswith(":")
            and element.text.endswith(":")
        )
        if not is_role_marker:
            rebuilt.append(element)
            continue

        role = element.text[1:-1]
        content = element.next.text
        rst_prefixes = dict(PREFIX_MAP_RST_R)

        if role in rst_prefixes:
            replacement = create_cite_span(
                content.split(","), "rst", False, prefix=rst_prefixes.get(role, "")
            )
        elif role in RST_KNOWN_ROLES:
            replacement = pf.Span(
                classes=[RAWSPAN_CLASS, CONVERTED_OTHER_CLASS],
                attributes={
                    "format": "rst",
                    "role": role,
                    "content": content,
                    "original": "{0}`{1}`".format(element.text, element.next.text),
                },
            )
        else:
            # unknown role: keep the Str, and the Code is handled on its own
            rebuilt.append(element)
            continue

        rebuilt.append(replacement)
        drop_code = True

    setattr(block, content_attr, rebuilt)
    return block
339

340

341 3
def gather_processors(element, doc):
    """Apply all processors to ``element`` in one call,
    so that we don't have to do multiple passes over the document.

    1. ``process_internal_links`` and ``process_latex_raw`` replace a single
       element; the first one returning something other than None wins and
       its result is returned immediately.
    2. For containers of inline elements (as well as ``Table`` and
       ``DefinitionItem``), ``process_html_cites``, ``process_latex_str``
       and ``process_rst_roles`` are applied in turn, each on the result of
       the previous one.
    3. For containers of block elements ``process_html_cites`` is applied.

    Returns the replacement element, or the (possibly modified) element.
    """
    # processors that change one element
    for processor in (process_internal_links, process_latex_raw):
        new_element = processor(element, doc)
        if new_element is not None:
            return new_element

    # processors that change multiple inline elements in a block
    # (the instance under test must be ``element``; testing the class
    # ``pf.Table`` against ``pf.DefinitionItem`` can never be true)
    if isinstance(element, get_panflute_containers(pf.Inline)) or isinstance(
        element, (pf.Table, pf.DefinitionItem)
    ):
        for processor in (process_html_cites, process_latex_str, process_rst_roles):
            new_element = processor(element, doc)
            if new_element is not None:
                element = new_element

    # processors that change multiple block elements
    if isinstance(element, get_panflute_containers(pf.Block)):
        new_element = process_html_cites(element, doc)
        if new_element is not None:
            element = new_element

    return element
380

381

382 3
def wrap_rst_directives(doc):
    """Wrap top-level rst directives and labels of ``doc`` in Div elements.

    A candidate is a Para starting with ``Str(..) Space() Str(name)``:

    - ``Str(_name:)`` as third and last element is an rst label; the Para is
      wrapped in a Div with classes RAWDIV_CLASS and CONVERTED_OTHER_CLASS
    - ``Str(name::)`` is a directive; the Para is wrapped in a Div with
      classes RAWDIV_CLASS and CONVERTED_DIRECTIVE_CLASS and attributes
      ``format``, ``directive``, ``inline`` and ``has_body``. If the next
      block is a CodeBlock, its text is converted with ``pf.convert_text``,
      added to the Div as the directive body, and the CodeBlock itself is
      dropped from the document.

    Every other block is kept as is. ``doc.content`` is replaced in place.
    """

    def _inline_argument(para):
        # text that follows the directive name, up to the first SoftBreak
        if len(para.content) <= 3:
            return ""
        leading = []
        for el in para.content[3:]:
            if isinstance(el, pf.SoftBreak):
                break
            leading.append(el)
        if not leading:
            return ""
        return pf.stringify(pf.Para(*leading)).replace("\n", "").strip()

    wrapped = []
    drop_code_block = False  # True when the previous directive consumed it

    for block in doc.content:

        if drop_code_block:
            drop_code_block = False
            continue

        is_candidate = (
            isinstance(block, pf.Para)
            and len(block.content) >= 3
            and isinstance(block.content[0], pf.Str)
            and block.content[0].text == ".."
            and isinstance(block.content[1], pf.Space)
            and isinstance(block.content[2], pf.Str)
        )
        if not is_candidate:
            wrapped.append(block)
            continue

        name_text = block.content[2].text

        if (
            len(block.content) == 3
            and name_text.startswith("_")
            and name_text.endswith(":")
        ):
            # the block is an rst label
            wrapped.append(
                pf.Div(
                    block,
                    classes=[RAWDIV_CLASS, CONVERTED_OTHER_CLASS],
                    attributes={"format": "rst"},
                )
            )
            continue

        if not name_text.endswith("::"):
            wrapped.append(block)
            continue

        # the block is a directive, with or without body content
        # TODO at present we allow any directive name
        # the block may contain option directives, e.g. :width:
        has_body = isinstance(block.next, pf.CodeBlock)
        drop_code_block = has_body

        inline_arg = _inline_argument(block)
        # the body must be read before the Para is moved into the Div
        body_blocks = pf.convert_text(block.next.text) if has_body else []

        wrapped.append(
            pf.Div(
                block,
                *body_blocks,
                classes=[RAWDIV_CLASS, CONVERTED_DIRECTIVE_CLASS],
                attributes={
                    "format": "rst",
                    "directive": name_text[:-2],
                    "inline": inline_arg,
                    "has_body": has_body,
                }
            )
        )

    doc.content = wrapped
497

498

499 3
def prepare(doc):
    # type: (Doc) -> None
    """Hook passed to ``pf.run_filter`` by ``main``:
    wraps the rst directives and labels of the document in Div elements.
    """
    wrap_rst_directives(doc)
502

503

504 3
def finalize(doc):
    # type: (Doc) -> None
    """Hook passed to ``pf.run_filter`` by ``main``; nothing to do."""
    return None
507

508

509 3
def main(doc=None, extract_formats=True):
    # type: (Doc, bool) -> None
    """Run the filter: convert citations defined in latex, rst or html
    formats to special Span elements.

    ``gather_processors`` is the action, with ``prepare`` and ``finalize``
    as hooks; the result of ``pf.run_filter`` is returned.

    NOTE: ``extract_formats`` is accepted, but is not read here.
    """
    result = pf.run_filter(gather_processors, prepare, finalize, doc=doc)
    return result
515

516

517 3
# allow the module to be executed directly as a script
if __name__ == "__main__":
    main()

Read our documentation on viewing source code .

Loading