chrisjsewell / ipypublish
1 3
from copy import deepcopy
2 3
import six
3

4 3
try:
5 3
    from html.parser import HTMLParser
6 0
except ImportError:
7 0
    from HTMLParser import HTMLParser
8

9

10 3
class HTML2JSONParser(HTMLParser, object):
    """Parse HTML content into a nested, JSON-serialisable dict.

    Every retained element is stored as a dict of the form::

        {
            "1_tag": "tag_name",
            "2_data": ["text line", ...],
            "3_attributes": {"name": "value", ...},
            "4_children": [{...}, ...],
        }

    Only ``1_tag`` is always present; the other keys are created on demand.
    The root object returned by `parsed` has no ``1_tag``: it holds the
    top-level elements under ``4_children`` (and any top-level text under
    ``2_data``), and is empty if nothing has been parsed.

    """

    _tag_key = "1_tag"
    _data_key = "2_data"
    _tag_attrs_key = "3_attributes"
    _children_key = "4_children"

    # Void elements never have an end tag or children, so they must not be
    # pushed onto the open-tag path nor counted in the nesting depth.
    _void_tags = frozenset(
        (
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        )
    )

    def __init__(
        self,
        ignore_tags=("head", "script", "style"),
        ignore_classes=("footer", "sphinxsidebar", "clearer"),
        rstrip_data=True,
        sort_class_attr=True,
        replace_data_lines=None,
        convert_charrefs=False,
    ):
        """Initialise the parser.

        Parameters
        ----------
        ignore_tags : sequence of str
            HTML tags that will be ignored (and all their children)
        ignore_classes : iterable of str
            HTML tags with one or more of these classes will be ignored
            (and all their children)
        rstrip_data : bool
            apply `rstrip()` to every line of data text
            (whitespace-only data chunks are always skipped)
        sort_class_attr : bool
            if an attribute is named 'class', its contents will be split
            and sorted
        replace_data_lines : None or dict
            mapping of data lines to replace (useful for mapping across
            different environment versions)
        convert_charrefs : bool
            If True, all character references (except the ones in
            script/style elements) are automatically converted to the
            corresponding Unicode characters. Ignored when the underlying
            ``HTMLParser`` does not support the option.

        """
        try:
            super(HTML2JSONParser, self).__init__(convert_charrefs=convert_charrefs)
        except TypeError:
            # Older HTMLParser implementations (Python 2, Python < 3.4)
            # do not accept the ``convert_charrefs`` keyword.
            super(HTML2JSONParser, self).__init__()
        self._content = {}
        self._curr_tags = []
        self._curr_depth = 0
        self._ignore_depth = None
        self._rstrip_data = rstrip_data
        self._sort_class_attr = sort_class_attr
        self._ignore_tags = ignore_tags
        self._ignore_classes = set(ignore_classes)
        self._replace_data_lines = replace_data_lines or {}

    @property
    def parsed(self):
        """Return an independent (deep) copy of the content parsed so far."""
        return deepcopy(self._content)

    def reset(self):
        """Discard all parsed content and reset the underlying parser."""
        self._content = {}
        # indices into successive ``4_children`` lists, root -> open element
        self._curr_tags = []
        self._curr_depth = 0
        # depth of the element that started an ignored subtree, else None
        self._ignore_depth = None
        super(HTML2JSONParser, self).reset()

    def _get_subcontent(self):
        """Return the dict of the innermost open element (or the root)."""
        node = self._content
        for child_index in self._curr_tags:
            node = node[self._children_key][child_index]
        return node

    def handle_starttag(self, tag, attrs):
        """Record an opening tag as a new child of the current element.

        Tags inside an ignored subtree are only counted for depth tracking.
        Void elements (``<br>``, ``<img>``, ...) are recorded as leaf
        children and never become the current element.
        """
        is_void = tag in self._void_tags
        if not is_void:
            self._curr_depth += 1
        if self._ignore_depth is not None:
            return

        attr_dict = dict(attrs)
        # a valueless attribute (``<p class>``) is reported with value None
        classes = (attr_dict.get("class") or "").split()
        if tag in self._ignore_tags or self._ignore_classes.intersection(classes):
            # we ignore any tags and data, until the current depth is less
            # than the ignore depth (a void element has no subtree to skip)
            if not is_void:
                self._ignore_depth = self._curr_depth
            return

        if self._sort_class_attr and "class" in attr_dict:
            attr_dict["class"] = sorted(classes)
        tag_dict = {self._tag_key: tag}
        if attr_dict:
            tag_dict[self._tag_attrs_key] = attr_dict

        children = self._get_subcontent().setdefault(self._children_key, [])
        children.append(tag_dict)
        if not is_void:
            self._curr_tags.append(len(children) - 1)

    def handle_endtag(self, tag):
        """Close the current element, or leave an ignored subtree.

        Raises
        ------
        AssertionError
            if ``tag`` does not match the innermost open element
            (including a stray end tag with no open element)

        """
        if tag in self._void_tags:
            # already finalised in handle_starttag (covers ``<br/>``)
            return

        self._curr_depth -= 1

        if self._ignore_depth is not None:
            if self._ignore_depth > self._curr_depth:
                self._ignore_depth = None
            return

        open_tag = self._get_subcontent().get(self._tag_key)
        if tag != open_tag:
            raise AssertionError(
                "{} != {} (current path: {})".format(tag, open_tag, self._curr_tags)
            )
        self._curr_tags = self._curr_tags[:-1]

    def handle_data(self, data):
        """Append the lines of a text chunk to the current element's data."""
        if self._ignore_depth is not None:
            return
        if not data.strip():
            return
        lines = data.splitlines()
        if self._rstrip_data:
            lines = [line.rstrip() for line in lines]
        if self._replace_data_lines:
            lines = [self._replace_data_lines.get(line, line) for line in lines]
        self._get_subcontent().setdefault(self._data_key, []).extend(lines)

    def handle_entityref(self, name):
        """Discard named character references (e.g. ``&gt;``)."""
        pass

    def handle_charref(self, name):
        """Discard numeric character references (e.g. ``&#62;``)."""
        pass

    def handle_comment(self, data):
        """Discard HTML comments."""
        pass

Read our documentation on viewing source code.

Loading