sdpython / pandas_streaming

@@ -97,7 +97,7 @@
Loading
97 97
    r = abs(train_weights - test_weights) / \
98 98
        (1.0 * (train_weights + test_weights))
99 99
    if r >= fail_imbalanced:
100 -
        raise ImbalancedSplitException(
100 +
        raise ImbalancedSplitException(  # pragma: no cover
101 101
            "Split is imbalanced: train_weights={0} test_weights={1} r={2}".format(train_weights, test_weights, r))
102 102
103 103
    return df.iloc[train_ids, :], df.iloc[test_ids, :]

@@ -102,7 +102,8 @@
Loading
102 102
    for b, c in zip(bufs, close):
103 103
        if c:
104 104
            b.close()
105 -
    return [st.getvalue() if isinstance(st, StringIO) else p for st, p in zip(bufs, path_or_buf)]
105 +
    return [st.getvalue() if isinstance(st, StringIO) else p
106 +
            for st, p in zip(bufs, path_or_buf)]
106 107
107 108
108 109
def sklearn_train_test_split_streaming(self, test_size=0.25, train_size=None,

@@ -10,6 +10,34 @@
Loading
10 10
from pandas import DataFrame, Index
11 11
12 12
13 +
def numpy_types():
14 +
    """
15 +
    Returns the list of :epkg:`numpy` available types.
16 +
17 +
    :return: list of types
18 +
    """
19 +
20 +
    return [numpy.bool_,
21 +
            numpy.int_,
22 +
            numpy.intc,
23 +
            numpy.intp,
24 +
            numpy.int8,
25 +
            numpy.int16,
26 +
            numpy.int32,
27 +
            numpy.int64,
28 +
            numpy.uint8,
29 +
            numpy.uint16,
30 +
            numpy.uint32,
31 +
            numpy.uint64,
32 +
            numpy.float_,
33 +
            numpy.float16,
34 +
            numpy.float32,
35 +
            numpy.float64,
36 +
            numpy.complex_,
37 +
            numpy.complex64,
38 +
            numpy.complex128]
39 +
40 +
13 41
def hash_str(c, hash_length):
14 42
    """
15 43
    Hashes a string.
@@ -21,15 +49,13 @@
Loading
21 49
    if isinstance(c, float):
22 50
        if numpy.isnan(c):
23 51
            return c
24 -
        else:
25 -
            raise ValueError("numpy.nan expected, not {0}".format(c))
26 -
    else:
27 -
        m = hashlib.sha256()
28 -
        m.update(c.encode("utf-8"))
29 -
        r = m.hexdigest()
30 -
        if len(r) >= hash_length:
31 -
            return r[:hash_length]
32 -
        return r
52 +
        raise ValueError("numpy.nan expected, not {0}".format(c))
53 +
    m = hashlib.sha256()
54 +
    m.update(c.encode("utf-8"))
55 +
    r = m.hexdigest()
56 +
    if len(r) >= hash_length:
57 +
        return r[:hash_length]
58 +
    return r
33 59
34 60
35 61
def hash_int(c, hash_length):
@@ -209,9 +235,9 @@
Loading
209 235
    """
210 236
    Shuffles a dataframe.
211 237
212 -
    @param      df              :epkg:`pandas:DataFrame`
213 -
    @param      random_state    seed
214 -
    @return                     new :epkg:`pandas:DataFrame`
238 +
    :param df: :epkg:`pandas:DataFrame`
239 +
    :param random_state: seed
240 +
    :return: new :epkg:`pandas:DataFrame`
215 241
216 242
    .. exref::
217 243
        :title: Shuffles the rows of a dataframe
@@ -257,11 +283,11 @@
Loading
257 283
    Replaces the :epkg:`nan` values for something not :epkg:`nan`.
258 284
    Mostly used by @see fn pandas_groupby_nan.
259 285
260 -
    @param      df      dataframe
261 -
    @param      by      list of columns for which we need to replace nan
262 -
    @param      hasna   None or list of columns for which we need to replace NaN
263 -
    @param      suffix  use a prefix for the NaN value
264 -
    @return             list of values chosen for each column, new dataframe (new copy)
286 +
    :param df: dataframe
287 +
    :param by: list of columns for which we need to replace nan
288 +
    :param hasna: None or list of columns for which we need to replace NaN
289 +
    :param suffix: use a prefix for the NaN value
290 +
    :return: list of values chosen for each column, new dataframe (new copy)
265 291
    """
266 292
    suffix = suffix if suffix else "²"
267 293
    df = df.copy()
@@ -291,10 +317,12 @@
Loading
291 317
            mi = abs(dr.min())
292 318
            ma = abs(dr.max())
293 319
            val = ma + mi
320 +
            if val == ma and not isinstance(val, str):
321 +
                val += ma + 1.
294 322
            if val <= ma:
295 323
                raise ValueError(  # pragma: no cover
296 -
                    "Unable to find a different value for column '{0}': min={1} max={2}"
297 -
                    "".format(val, mi, ma))
324 +
                    "Unable to find a different value for column '{}' v='{}: "
325 +
                    "min={} max={}".format(c, val, mi, ma))
298 326
            df[c].fillna(val, inplace=True)
299 327
            rep[c] = val
300 328
    return rep, df
@@ -304,19 +332,21 @@
Loading
304 332
    """
305 333
    Does a *groupby* including keeping missing values (:epkg:`nan`).
306 334
307 -
    @param      df          dataframe
308 -
    @param      by          column or list of columns
309 -
    @param      axis        only 0 is allowed
310 -
    @param      as_index    should be False
311 -
    @param      suffix      None or a string
312 -
    @param      nanback     put :epkg:`nan` back in the index,
313 -
                            otherwise it leaves a replacement for :epkg:`nan`.
314 -
                            (does not work when grouping by multiple columns)
315 -
    @param      kwargs      other parameters sent to
316 -
                            `groupby <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html>`_
317 -
    @return                 groupby results
318 -
319 -
    See `groupby and missing values <http://pandas-docs.github.io/pandas-docs-travis/groupby.html#na-and-nat-group-handling>`_.
335 +
    :param df: dataframe
336 +
    :param by: column or list of columns
337 +
    :param axis: only 0 is allowed
338 +
    :param as_index: should be False
339 +
    :param suffix: None or a string
340 +
    :param nanback: put :epkg:`nan` back in the index,
341 +
        otherwise it leaves a replacement for :epkg:`nan`.
342 +
        (does not work when grouping by multiple columns)
343 +
    :param kwargs: other parameters sent to
344 +
        `groupby <http://pandas.pydata.org/pandas-docs/stable/
345 +
        generated/pandas.DataFrame.groupby.html>`_
346 +
    :return: groupby results
347 +
348 +
    See `groupby and missing values <http://pandas-docs.github.io/
349 +
    pandas-docs-travis/groupby.html#na-and-nat-group-handling>`_.
320 350
    If no :epkg:`nan` is detected, the function falls back in regular
321 351
    :epkg:`pandas:DataFrame:groupby` which has the following
322 352
    behavior.
@@ -411,7 +441,8 @@
Loading
411 441
                        break
412 442
                return res
413 443
            raise NotImplementedError(
414 -
                "Not yet implemented. Replacing pseudo nan values by real nan values is not as easy as it looks. Use nanback=False")
444 +
                "Not yet implemented. Replacing pseudo nan values by real nan "
445 +
                "values is not as easy as it looks. Use nanback=False")
415 446
416 447
            # keys = list(res.grouper.groups.keys())
417 448
            # didit = False
@@ -459,31 +490,3 @@
Loading
459 490
        return res
460 491
    else:
461 492
        return df.groupby(by, axis=axis, **kwargs)
462 -
463 -
464 -
def numpy_types():
465 -
    """
466 -
    Returns the list of :epkg:`numpy` available types.
467 -
468 -
    @return     list of types
469 -
    """
470 -
471 -
    return [numpy.bool_,
472 -
            numpy.int_,
473 -
            numpy.intc,
474 -
            numpy.intp,
475 -
            numpy.int8,
476 -
            numpy.int16,
477 -
            numpy.int32,
478 -
            numpy.int64,
479 -
            numpy.uint8,
480 -
            numpy.uint16,
481 -
            numpy.uint32,
482 -
            numpy.uint64,
483 -
            numpy.float_,
484 -
            numpy.float16,
485 -
            numpy.float32,
486 -
            numpy.float64,
487 -
            numpy.complex_,
488 -
            numpy.complex64,
489 -
            numpy.complex128]
Files Coverage
pandas_streaming 94.59%
Project Totals (11 files) 94.59%
Sunburst
The innermost circle is the entire project; moving away from the center are folders and then, finally, individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project, followed by folders and finally individual files. The size and color of each slice represent the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block represent the number of statements and the coverage, respectively.
Loading