kbase / relation_engine_importers

Compare 7f01951 ... +15 ... fff2d53

Showing 1 of 5 files from the diff.

@@ -2,7 +2,13 @@
Loading
2 2
Common code for dealing with NCBI taxonomy files.
3 3
"""
4 4
5 -
# TODO TEST
5 +
# Since this is KBase internal code we can be a bit less compassionate regarding good
6 +
# error messages, e.g. raising a KeyError or TypeError rather than a more descriptive message.
7 +
# Similarly, since the input is NCBI taxa dumps we probably don't need to worry much about
8 +
# malformed input.
9 +
# As a result we get slightly less code to maintain and a completely trivial performance boost.
10 +
# And there was much rejoicing.
11 +
6 12
7 13
import re
8 14
from collections import defaultdict
@@ -45,19 +51,27 @@
Loading
45 51
        # both the names and nodes files are sorted by taxid. YAGNI for now
46 52
        name_table = defaultdict(lambda: defaultdict(list))
47 53
        for line in name_file:
54 +
            # this is pretty fragile, but we don't expect the NCBI dump files to have errors
55 +
            # and adding a lot of checking would be a lot of code to maintain for little purpose
48 56
            tax_id, name, _, category = re.split(_SEP, line)[0:4]
49 57
            name_table[tax_id.strip()][category.strip()].append(name.strip())
50 58
51 59
        return {k: dict(name_table[k]) for k in name_table.keys()}
52 60
53 61
    def _get_species_and_strain_ids(self):
62 +
        # Almost certainly faster to just load the tree into memory in one pass and recurse.
63 +
        # Originally written this way to avoid memory usage, but > 80% of the node
64 +
        # IDs are going to be stored in memory either way.
65 +
        # Alternatively, build a SQLite DB or similar on-disk index.
66 +
        # Given the use case is a batch load roughly once a month, YAGNI.
54 67
        not_converged = True
55 68
        count = 1
56 69
        while not_converged:
57 70
            print(f'strain determination round {count}')
58 71
            count += 1
59 72
            not_converged = False
60 73
            for line in self._node_fh:
74 +
                # also fragile: assumes every line splits into at least three fields
61 75
                record = re.split(_SEP, line)
62 76
                id_, parent, rank = [record[i].strip() for i in [0, 1, 2]]
63 77
                if rank in _SPECIES_RANKS:
@@ -99,7 +113,7 @@
Loading
99 113
            yield node
100 114
101 115
102 -
class NCBIEdgeProvider:
116 +
class NCBIEdgeProvider:  # TODO test
103 117
    """
104 118
    NCBIEdgeProvider is an iterable that returns a new NCBI taxonomy edge as a dict where the
105 119
    from key is the child ID and the to key the parent ID with each iteration.
@@ -130,7 +144,7 @@
Loading
130 144
            yield edge
131 145
132 146
133 -
class NCBIMergeProvider:
147 +
class NCBIMergeProvider:  # TODO test
134 148
    """
135 149
    NCBIMergeProvider is an iterable that returns merged node information as a dict where the from
136 150
    key is the merged node ID and the to key the merge target node ID.

Learn more. Showing 1 file with coverage changes found.

New file relation_engine/taxa/ncbi/parsers.py
New
Loading file...
Files Coverage
relation_engine -2.15% 93.12%
src 15.05%
Project Totals (22 files) 43.60%
Loading