#8 Text summarization

Open enpuyou enpuyou
Coverage Reach
summarizer.py analyzer.py

No flags found

Use flags to group coverage reports by test type, project and/or folders.
Then set up custom commit statuses and notifications for each flag.

e.g., #unittest #integration

#production #enterprise

#frontend #backend

Learn more about Codecov Flags here.

Showing 1 of 6 files from the diff.
Other files ignored by Codecov
Pipfile.lock has changed.
test/conftest.py has changed.
src/util/run.py has changed.
Pipfile has changed.

@@ -0,0 +1,86 @@
Loading
1 +
"""Text summary"""
2 +
import os
3 +
import logging
4 +
from typing import Dict, List
5 +
from gensim.summarization import summarize
6 +
import commonmark
7 +
8 +
9 +
# Module-wide logging setup: timestamp and source location on the first line,
# level and message on the second. Only ERROR and above are emitted.
_LOG_FORMAT = (
    "[%(asctime)s]{%(pathname)s:%(lineno)d}\n"
    "%(levelname)s: %(message)s"
)
logging.basicConfig(
    level=logging.ERROR,
    datefmt="%Y-%m-%d:%H:%M:%S",
    format=_LOG_FORMAT,
)
15 +
16 +
17 +
def summarize_text(text: str, word_count: int = 30) -> str:
    """Summarize the given text with gensim's summarizer.

    Args:
        text: The text to summarize.
        word_count: Value forwarded to gensim's ``summarize`` as its
            ``word_count`` argument. Defaults to 30, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        Whatever gensim's ``summarize`` returns for ``text``.

    Note:
        Exceptions from gensim propagate unchanged; the ``summarizer``
        pipeline in this module catches ``ValueError`` around this call.
    """
    return summarize(text, word_count=word_count)
20 +
21 +
22 +
def read_file(path: str) -> str:
    """Read a text file and return its entire contents.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so the same file reads identically on every system.

    Args:
        path: Location of the file to read.

    Returns:
        The full contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, encoding="utf-8") as input_file:
        return input_file.read()
27 +
28 +
29 +
def get_file_names(directory_name: str) -> List[str]:
    """List the Markdown (.md) and plain-text (.txt) files in a directory.

    Only the directory's immediate entries are examined; sub-directories
    are not searched.

    Args:
        directory_name: Directory to scan.

    Returns:
        Paths (``directory_name`` joined with each matching entry name),
        sorted by entry name so the result does not depend on the
        arbitrary order in which ``os.listdir`` reports entries.

    Raises:
        OSError: If the directory cannot be listed.
    """
    names = sorted(os.fsdecode(entry) for entry in os.listdir(directory_name))
    return [
        os.path.join(directory_name, name)
        for name in names
        # Both suffixes include the dot so that a name such as "notestxt"
        # is not mistaken for a text file.
        if name.endswith((".md", ".txt"))
    ]
40 +
41 +
42 +
def merge_dict(dict_1: Dict[str, str], dict_2) -> Dict[str, List[str]]:
    """Merge two dictionaries, collecting the values of each key in a list.

    Every key of the result maps to a flat list of values, including keys
    that occur in only one input. An input value may be a single item or a
    (possibly nested) list or tuple of items, so the result of one merge
    can be passed back in as ``dict_1`` for the next merge without
    producing nested lists.

    Args:
        dict_1: First mapping, e.g. the accumulated result of earlier merges.
        dict_2: Second mapping, e.g. the entries of a newly parsed file.

    Returns:
        A new dictionary. Keys appear in the order of ``dict_1`` followed by
        the keys found only in ``dict_2``. For a key present in both, the
        values from ``dict_2`` come before those from ``dict_1``. Neither
        input is modified.
    """

    def _as_flat_list(value) -> list:
        # A bare value becomes a one-item list; lists/tuples are flattened.
        if not isinstance(value, (list, tuple)):
            return [value]
        flat = []
        for item in value:
            flat.extend(_as_flat_list(item))
        return flat

    merged = {}
    for key in {**dict_1, **dict_2}:
        values = []
        if key in dict_2:
            values.extend(_as_flat_list(dict_2[key]))
        if key in dict_1:
            values.extend(_as_flat_list(dict_1[key]))
        merged[key] = values
    return merged
49 +
50 +
51 +
def summarizer(directory: str) -> Dict[str, List[str]]:
    """Summarize, per heading, the text of every document in a directory.

    Pipeline: list the files with ``get_file_names``, parse each with
    ``md_parser``, combine the per-file results with ``merge_dict``, drop
    the "Reflection by" heading, then run ``summarize_text`` on each piece
    of text collected under each remaining heading.

    Args:
        directory: Directory containing the documents to summarize.

    Returns:
        A dictionary mapping each heading to the list of summaries produced
        for it. A text for which ``summarize_text`` raises ``ValueError`` is
        logged and omitted, so a heading may map to an empty list.
    """

    def _flatten(values) -> list:
        # Accept a bare string, a flat list, or nested lists, so that a
        # string is never iterated character by character and a nested
        # list is never handed to the summarizer as a single text.
        if not isinstance(values, (list, tuple)):
            return [values]
        flat = []
        for item in values:
            flat.extend(_flatten(item))
        return flat

    main_md_dict = {}
    for file in get_file_names(directory):
        individual_dict = md_parser(read_file(file))
        main_md_dict = merge_dict(main_md_dict, individual_dict)

    # The heading may be absent (e.g. an empty directory); pop with a
    # default instead of del so that case does not raise KeyError.
    main_md_dict.pop("Reflection by", None)

    summarized = {}
    for key, values in main_md_dict.items():
        summaries = []
        for item in _flatten(values):
            try:
                summaries.append(summarize_text(item))
            except ValueError as err:
                logging.error("Cannot summarize text: %s", err)
        summarized[key] = summaries
    return summarized
68 +
69 +
70 +
def md_parser(input_md: str) -> Dict[str, str]:
    """Parse Markdown text into a dict of heading text -> body text.

    The document is parsed with commonmark and its syntax tree is walked in
    order. The literal text of the nodes inside a heading forms the key;
    the literal of every other node that has one is appended, followed by a
    single space, to the entry of the most recent heading.

    Args:
        input_md: Markdown source text.

    Returns:
        A dictionary mapping each heading's text to the concatenated text
        that follows it. Text that appears before the first heading is
        stored under the empty-string key. When a heading text occurs more
        than once, the later text is appended to the existing entry rather
        than replacing it.
    """
    ast = commonmark.Parser().parse(input_md)
    md_dict = {}
    cur_heading = ""
    # List of text fragments while the walker is inside a heading node,
    # None otherwise.
    heading_parts = None
    for subnode, enter in ast.walker():
        if subnode.t == "heading":
            if enter:
                heading_parts = []
            else:
                # Leaving the heading: its full text becomes the current key.
                cur_heading = "".join(heading_parts or [])
                md_dict.setdefault(cur_heading, "")
                heading_parts = None
            continue

        if heading_parts is not None:
            # Inside a heading: gather every inline fragment so the key is
            # the whole heading text, not only its first inline node.
            if subnode.literal is not None:
                heading_parts.append(subnode.literal)
            elif subnode.t in ("softbreak", "linebreak"):
                heading_parts.append(" ")
            continue

        if subnode.literal is not None:
            # setdefault covers text that precedes the first heading.
            md_dict.setdefault(cur_heading, "")
            md_dict[cur_heading] += subnode.literal + " "

    return md_dict

Learn more. Showing 1 file with coverage changes.

New file src/util/summarizer.py
New
Loading file...

50 Commits

Hiding 49 contextual commits
+1 Files
+53
+49
+4
Pull Request Base Commit
Files Coverage
src/util +13.06% 83.33%
Project Totals (2 files) 83.33%
Loading