sdpython / pandas_streaming
Showing 1 of 3 files from the diff.

@@ -5,6 +5,7 @@
Loading
5 5
"""
6 6
from io import StringIO, BytesIO
7 7
from inspect import isfunction
8 +
import numpy
8 9
import numpy.random as nrandom
9 10
import pandas
10 11
from pandas.testing import assert_frame_equal
@@ -56,16 +57,16 @@
Loading
56 57
    in some situations, it is more efficient not to keep
57 58
    that constraint. Drawing a random @see me sample
58 59
    is one of these cases.
60 +
61 +
    :param iter_creation: function which creates an iterator or an
62 +
        instance of @see cl StreamingDataFrame
63 +
    :param check_schema: checks that the schema is the same
64 +
        for every :epkg:`dataframe`
65 +
    :param stable: indicates if the :epkg:`dataframe` remains the same
66 +
        whenever it is walked through
59 67
    """
60 68
61 69
    def __init__(self, iter_creation, check_schema=True, stable=True):
62 -
        """
63 -
        @param      iter_creation   function which creates an iterator or an instance of
64 -
                                    @see cl StreamingDataFrame
65 -
        @param      check_schema    checks that the schema is the same for every :epkg:`dataframe`
66 -
        @param      stable          indicates if the :epkg:`dataframe` remains the same whenever
67 -
                                    it is walked through
68 -
        """
69 70
        if isinstance(iter_creation, StreamingDataFrame):
70 71
            self.iter_creation = iter_creation.iter_creation
71 72
            self.stable = iter_creation.stable
@@ -372,14 +373,17 @@
Loading
372 373
                            rows, sch[0], list(it.columns)))  # pylint: disable=E1136
373 374
                if list(it.dtypes) != sch[1]:  # pylint: disable=E1136
374 375
                    errdf = pandas.DataFrame(
375 -
                        dict(names=sch[0], schema1=sch[1], schema2=list(it.dtypes)))  # pylint: disable=E1136
376 +
                        dict(names=sch[0], schema1=sch[1],  # pylint: disable=E1136
377 +
                             schema2=list(it.dtypes)))  # pylint: disable=E1136
376 378
                    tdf = StringIO()
377 379
                    errdf['diff'] = errdf['schema2'] != errdf['schema1']
378 380
                    errdf = errdf[errdf['diff']]
379 -
                    errdf.to_csv(tdf, sep=",")
380 -
                    msg = 'Column types are different after row {0}\n{1}'
381 +
                    errdf.to_csv(tdf, sep=",", index=False)
381 382
                    raise StreamingDataFrameSchemaError(
382 -
                        msg.format(rows, tdf.getvalue()))
383 +
                        'Column types are different after row {0}. You may use option '
384 +
                        'dtype={{"column_name": str}} to force the type on this column.'
385 +
                        '\n---\n{1}'.format(rows, tdf.getvalue()))
386 +
383 387
            rows += it.shape[0]
384 388
            yield it
385 389
@@ -988,3 +992,52 @@
Loading
988 992
989 993
        return StreamingDataFrame(
990 994
            lambda: iterate_na(self, **kwargs), **self.get_kwargs())
995 +
996 +
    def describe(self, percentiles=None, include=None, exclude=None,
                 datetime_is_numeric=False):
        """
        Calls :epkg:`pandas:DataFrame:describe` on every piece
        of the datasets. *percentiles* and *std* are not really
        accurate but just an indication: they are recombined from
        per-chunk statistics, not computed on the full data.

        :param percentiles: see :epkg:`pandas:DataFrame:describe`
        :param include: see :epkg:`pandas:DataFrame:describe`
        :param exclude: see :epkg:`pandas:DataFrame:describe`
        :param datetime_is_numeric: see :epkg:`pandas:DataFrame:describe`;
            only forwarded when set, so the method keeps working with
            :epkg:`pandas` >= 2.0 where the parameter was removed
        :return: :epkg:`pandas:DataFrame:describe`
        :raises ValueError: if the streaming dataframe yields no chunk
        """
        # *datetime_is_numeric* was removed in pandas 2.0 (datetimes are
        # treated as numeric by default there); forwarding it
        # unconditionally raises TypeError on recent pandas, so it is
        # only passed through when explicitly requested.
        desc_kwargs = dict(percentiles=percentiles)
        if datetime_is_numeric:
            desc_kwargs['datetime_is_numeric'] = datetime_is_numeric

        merged = None
        # per-chunk quantile rows (min, 25%, ..., max), stacked at the end
        # to approximate the global percentiles
        stack = []
        # rows recombined exactly across chunks (std only approximately,
        # ddof correction is ignored)
        notper = ['count', 'mean', 'std']
        for df in self:
            desc = df.describe(include=include, exclude=exclude,
                               **desc_kwargs)
            count = desc.loc['count', :]
            rows = [name for name in desc.index if name not in notper]
            stack.append(desc.loc[rows, :])
            if merged is None:
                merged = desc
                # accumulate sum(x^2) ~ (std^2 + mean^2) * count
                merged.loc['std', :] = (
                    merged.loc['std', :] ** 2 + merged.loc['mean', :] ** 2) * count
                # accumulate sum(x) = mean * count
                merged.loc['mean', :] *= count
            else:
                merged.loc['count', :] += desc.loc['count', :]
                merged.loc['mean', :] += desc.loc['mean', :] * count
                merged.loc['std', :] += (
                    desc.loc['std', :] ** 2 + desc.loc['mean', :] ** 2) * count
                merged.loc['max', :] = numpy.maximum(
                    merged.loc['max', :], desc.loc['max', :])
                # fixed: the running minimum must use numpy.minimum;
                # numpy.maximum kept the largest per-chunk minimum
                # (latent so far: these rows are dropped below in favour
                # of the stacked summary, but the value was wrong)
                merged.loc['min', :] = numpy.minimum(
                    merged.loc['min', :], desc.loc['min', :])
        if merged is None:
            raise ValueError(
                "Unable to describe an empty StreamingDataFrame.")
        merged.loc['mean', :] /= merged.loc['count', :]
        # std ~ sqrt(E[x^2] - E[x]^2) from the accumulated sums
        merged.loc['std', :] = (
            merged.loc['std', :] / merged.loc['count', :] -
            merged.loc['mean', :] ** 2) ** 0.5
        # approximate global percentiles by describing the stacked
        # per-chunk quantile rows
        values = pandas.concat(stack)
        summary = values.describe(**desc_kwargs)
        merged = merged.loc[notper, :]
        rows = [name for name in summary.index if name not in notper]
        summary = summary.loc[rows, :]
        return pandas.concat([merged, summary])
Files Coverage
pandas_streaming 94.21%
Project Totals (11 files) 94.21%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading