kbase / relation_engine_importers
Showing 1 of 5 files from the diff.

@@ -2,7 +2,13 @@
Loading
2 2
Common code for dealing with NCBI taxonomy files.
3 3
"""
4 4
5 -
# TODO TEST
5 +
# Since this is KBase internal code we can be a bit less rigorous about providing good
6 +
# error messages, e.g. raising KeyErrors or TypeErrors rather than a more descriptive message.
7 +
# Similarly, since the input is NCBI taxa dumps we probably don't need to worry much about
8 +
# malformed input.
9 +
# As a result we get slightly less code to maintain and a completely trivial performance boost.
10 +
# And there was much rejoicing.
11 +
6 12
7 13
import re
8 14
from collections import defaultdict
@@ -45,12 +51,19 @@
Loading
45 51
        # both the names and nodes files are sorted by taxid. YAGNI for now
46 52
        name_table = defaultdict(lambda: defaultdict(list))
47 53
        for line in name_file:
54 +
            # this is pretty fragile, but we don't expect the NCBI dump files to have errors
55 +
            # and adding a lot of checking would be a lot of code to maintain for little purpose
48 56
            tax_id, name, _, category = re.split(_SEP, line)[0:4]
49 57
            name_table[tax_id.strip()][category.strip()].append(name.strip())
50 58
51 59
        return {k: dict(name_table[k]) for k in name_table.keys()}
52 60
53 61
    def _get_species_and_strain_ids(self):
62 +
        # Almost certainly faster to just load the tree into memory in one pass and recurse.
63 +
        # Originally written this way to avoid memory usage, but > 80% of the node
64 +
        # IDs are going to be stored in memory either way.
65 +
        # Alternately make a sqlite DB or something
66 +
        # Given the use case is a batch load ~ 1 / month, YAGNI
54 67
        not_converged = True
55 68
        count = 1
56 69
        while not_converged:
@@ -58,6 +71,7 @@
Loading
58 71
            count += 1
59 72
            not_converged = False
60 73
            for line in self._node_fh:
74 +
                # also fragile
61 75
                record = re.split(_SEP, line)
62 76
                id_, parent, rank = [record[i].strip() for i in [0, 1, 2]]
63 77
                if rank in _SPECIES_RANKS:
@@ -99,7 +113,7 @@
Loading
99 113
            yield node
100 114
101 115
102 -
class NCBIEdgeProvider:
116 +
class NCBIEdgeProvider:  # TODO test
103 117
    """
104 118
    NCBIEdgeProvider is an iterable that returns a new NCBI taxonomy edge as a dict where the
105 119
    from key is the child ID and the to key the parent ID with each iteration.
@@ -130,7 +144,7 @@
Loading
130 144
            yield edge
131 145
132 146
133 -
class NCBIMergeProvider:
147 +
class NCBIMergeProvider:  # TODO test
134 148
    """
135 149
    NCBIMergeProvider is an iterable that returns merged node information as a dict where the from
136 150
    key is the merged node ID and the to key the merge target node ID.
Files Coverage
relation_engine 93.12%
src 15.05%
Project Totals (22 files) 43.60%

No yaml found.

Create your codecov.yml to customize your Codecov experience

Sunburst
The innermost circle is the entire project; moving away from the center are folders and then, finally, individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project, followed by folders and finally individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading