scikit-learn / scikit-learn

@@ -302,7 +302,7 @@
                 pred = _predict_binary(e, X)
                 np.maximum(maxima, pred, out=maxima)
                 argmaxima[maxima == pred] = i
-            return self.classes_[np.array(argmaxima.T)]
+            return self.classes_[argmaxima]
         else:
             indices = array.array('i')
             indptr = array.array('i', [0])
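A minimal sketch (not part of the diff; values invented) of why the one-line replacement above is safe: in this dense path, argmaxima is already a flat integer array of per-sample class indices, so the np.array(...) copy and the .T transpose were no-ops and classes_ can be indexed with it directly.

import numpy as np

# Stand-ins for the fitted attributes used in the hunk above (values invented).
classes_ = np.array([0, 1, 2])
argmaxima = np.array([2, 0, 1, 1])  # index of the best class for each sample

# On a 1-D integer array, .T and the extra np.array wrapper change nothing,
# so the old and new expressions return the same labels.
assert np.array_equal(classes_[np.array(argmaxima.T)], classes_[argmaxima])
print(classes_[argmaxima])  # [2 0 1 1]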
@@ -465,11 +465,15 @@

     Attributes
     ----------
-    estimators_ : list of `n_classes * (n_classes - 1) / 2` estimators
+    estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators
         Estimators used for predictions.

     classes_ : numpy array of shape [n_classes]
         Array containing labels.
+
+    pairwise_indices_ : list, length = ``len(estimators_)``, or ``None``
+        Indices of samples used when training the estimators.
+        ``None`` when ``estimator`` does not have ``_pairwise`` attribute.
     """

     def __init__(self, estimator, n_jobs=None):
@@ -505,11 +509,8 @@
             for i in range(n_classes) for j in range(i + 1, n_classes)))))

         self.estimators_ = estimators_indices[0]
-        try:
-            self.pairwise_indices_ = (
-                estimators_indices[1] if self._pairwise else None)
-        except AttributeError:
-            self.pairwise_indices_ = None
+        self.pairwise_indices_ = (
+            estimators_indices[1] if self._pairwise else None)

         return self

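A hedged usage sketch (not from this PR; data invented) of what the simplified fit stores in pairwise_indices_: with a precomputed-kernel estimator such as SVC(kernel='precomputed'), self._pairwise is truthy, so one index array is kept per class pair; with a plain estimator it stays None.

import numpy as np
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(12, 3)
y = np.array([0, 1, 2] * 4)
K = X @ X.T  # precomputed Gram matrix, so the wrapped SVC is "pairwise"

ovo = OneVsOneClassifier(SVC(kernel='precomputed')).fit(K, y)

# One fitted estimator and one index array per class pair: 3 * 2 / 2 == 3.
assert len(ovo.estimators_) == 3
assert len(ovo.pairwise_indices_) == 3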
@@ -630,10 +631,6 @@
         """Indicate if wrapped estimator is using a precomputed Gram matrix"""
         return getattr(self.estimator, "_pairwise", False)

-    def _more_tags(self):
-        # FIXME Remove once #10440 is merged
-        return {'_skip_test': True}
-

 class OutputCodeClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
     """(Error-Correcting) Output-Code multiclass strategy

@@ -29,6 +29,7 @@
 from numpy.testing import assert_array_almost_equal
 from numpy.testing import assert_array_equal
 from sklearn.utils import IS_PYPY
+from sklearn.exceptions import ChangedBehaviorWarning
 from sklearn.utils.testing import (assert_equal, assert_not_equal,
                                    assert_almost_equal, assert_in,
                                    assert_less, assert_greater,
@@ -1196,3 +1197,47 @@
                                             .findall(doc),
                     stop_words=['and'])
     assert _check_stop_words_consistency(vec) is True
+
+
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+@pytest.mark.parametrize(
+    'input_type, err_type, err_msg',
+    [('filename', FileNotFoundError, ''),
+     ('file', AttributeError, "'str' object has no attribute 'read'")]
+)
+def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg):
+    data = ['this is text, not file or filename']
+    with pytest.raises(err_type, match=err_msg):
+        Estimator(analyzer=lambda x: x.split(),
+                  input=input_type).fit_transform(data)
+
+
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+@pytest.mark.parametrize(
+    'analyzer', [lambda doc: open(doc, 'r'), lambda doc: doc.read()]
+)
+@pytest.mark.parametrize('input_type', ['file', 'filename'])
+def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type):
+    data = ['this is text, not file or filename']
+    warn_msg = 'Since v0.21, vectorizer'
+    with pytest.raises((FileNotFoundError, AttributeError)):
+        with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records:
+            Estimator(analyzer=analyzer, input=input_type).fit_transform(data)
+    assert len(records) == 1
+    assert warn_msg in str(records[0])
+
+
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+def test_callable_analyzer_reraise_error(tmpdir, Estimator):
+    # check if a custom exception from the analyzer is shown to the user
+    def analyzer(doc):
+        raise Exception("testing")
+
+    f = tmpdir.join("file.txt")
+    f.write("sample content\n")
+
+    with pytest.raises(Exception, match="testing"):
+        Estimator(analyzer=analyzer, input='file').fit_transform([f])

@@ -133,7 +133,8 @@
     See also
     --------
     load_svmlight_files: similar function for loading multiple files in this
-    format, enforcing the same number of features/columns on all of them.
+                         format, enforcing the same number of features/columns
+                         on all of them.

     Examples
     --------

@@ -31,6 +31,7 @@
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
 from ..utils import _IS_32BIT
 from ..utils.fixes import _astype_copy_false
+from ..exceptions import ChangedBehaviorWarning


 __all__ = ['HashingVectorizer',
@@ -304,10 +305,34 @@
             self._stop_words_id = id(self.stop_words)
             return 'error'

+    def _validate_custom_analyzer(self):
+        # This is to check if the given custom analyzer expects a file or a
+        # filename instead of data.
+        # Behavior changed in v0.21; this function could be removed in v0.23.
+        import tempfile
+        with tempfile.NamedTemporaryFile() as f:
+            fname = f.name
+        # now we're sure fname doesn't exist
+
+        msg = ("Since v0.21, vectorizers pass the data to the custom analyzer "
+               "and not the file names or the file objects. This warning "
+               "will be removed in v0.23.")
+        try:
+            self.analyzer(fname)
+        except FileNotFoundError:
+            warnings.warn(msg, ChangedBehaviorWarning)
+        except AttributeError as e:
+            if str(e) == "'str' object has no attribute 'read'":
+                warnings.warn(msg, ChangedBehaviorWarning)
+        except Exception:
+            pass
+
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
         if callable(self.analyzer):
-            return self.analyzer
+            if self.input in ['file', 'filename']:
+                self._validate_custom_analyzer()
+            return lambda doc: self.analyzer(self.decode(doc))

         preprocess = self.build_preprocessor()

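A hedged sketch (not part of the diff; file name and contents invented) of the post-0.21 behavior wired up in build_analyzer above: with input='filename', the decoded file contents rather than the path string reach the callable analyzer, so a plain str.split works on the document text.

import tempfile
from sklearn.feature_extraction.text import CountVectorizer

# Write a throwaway document to disk for the demo.
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('sample content here')
    path = f.name

# The analyzer receives the decoded text read from the file, not the path.
vec = CountVectorizer(input='filename', analyzer=lambda text: text.split())
vec.fit_transform([path])
print(sorted(vec.vocabulary_))  # ['content', 'here', 'sample']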
@@ -490,6 +515,11 @@
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

+        .. versionchanged:: 0.21
+            Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
+            first read from the file and then passed to the given callable
+            analyzer.
+
     n_features : integer, default=(2 ** 20)
         The number of features (columns) in the output matrices. Small numbers
         of features are likely to cause hash collisions, but large numbers
@@ -745,6 +775,11 @@
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

+        .. versionchanged:: 0.21
+            Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
+            first read from the file and then passed to the given callable
+            analyzer.
+
     max_df : float in range [0.0, 1.0] or int, default=1.0
         When building the vocabulary ignore terms that have a document
         frequency strictly higher than the given threshold (corpus-specific
@@ -1369,6 +1404,11 @@
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

+        .. versionchanged:: 0.21
+            Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
+            first read from the file and then passed to the given callable
+            analyzer.
+
     stop_words : string {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop
         list is returned. 'english' is currently the only supported string

@@ -537,7 +537,7 @@
     See also
     --------
     sklearn.metrics.pairwise.cosine_similarity
-    scipy.spatial.distance.cosine (dense matrices only)
+    scipy.spatial.distance.cosine : dense matrices only
     """
     # 1.0 - cosine_similarity(X, Y) without copy
     S = cosine_similarity(X, Y)
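A quick self-contained check (not in the diff; matrix invented) of the identity stated in the trailing comment, that cosine_distances is 1.0 - cosine_similarity:

import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

X = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])
assert np.allclose(cosine_distances(X), 1.0 - cosine_similarity(X))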
Files                        Coverage
sklearn                      95.69%
Project Totals (383 files)   95.69%
Codecov configuration:
comment: false

coverage:
  status:
    project:
      default:
        # Commits pushed to master should not make the overall
        # project coverage decrease by more than 1%:
        target: auto
        threshold: 1%
    patch:
      default:
        # Be tolerant on slight code coverage diff on PRs to limit
        # noisy red coverage status on github PRs.
        # Note: the coverage stats are still uploaded
        # to codecov so that PR reviewers can see uncovered lines
        # in the github diff if they install the codecov browser
        # extension:
        # https://github.com/codecov/browser-extension
        target: auto
        threshold: 1%

ignore:
- "sklearn/externals"